Unity 使用百度语音进行语音识别
发表于2018-02-11
如何在开发过程中使用百度语音进行语音识别?只需要接入百度语音的API接口即可。
新建脚本,将下列代码复制进去:
using LitJson;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;
using UnityEngine;
using UnityEngine.UI;

/// <summary>
/// Records microphone audio, saves it as a 16-bit PCM WAV file, and posts it to the
/// Baidu Speech REST API for speech-to-text recognition. The recognized text is exposed
/// through <see cref="audioToString"/> and the optional <see cref="CallBack"/> delegate.
/// NOTE(review): the WWW class is obsolete in recent Unity versions — consider migrating
/// to UnityWebRequest. NOTE(review): API credentials should not be hard-coded in source.
/// </summary>
public class ToWord : MonoBehaviour
{
    private string token = "";        // access_token returned by the Baidu OAuth endpoint
    private string cuid = "11";       // user identifier required by the recognition API
    private string format = "wav";    // audio container format reported to the API
    private int rate = 8000;          // sample rate in Hz (used for both recording and recognition)
    private int channel = 1;          // channel count reported to the API
    private string speech;            // base64-encoded raw PCM audio
    private int len;                  // byte length of the raw PCM audio
    private string lan = "zh";        // language code (not currently included in the request)
    private string grant_Type = "client_credentials";
    private string client_ID = "这里输入百度的appkey,自己到官网申请填入这里";      // Baidu app key
    private string client_Secret = "这里输入百度secretkey,自己到官网申请填写";     // Baidu secret key
    private string baiduAPI = "http://vop.baidu.com/server_api";
    private string getTokenAPIPath = "https://openapi.baidu.com/oauth/2.0/token";
    private byte[] clipByte;

    /// <summary>
    /// Recognized text from the most recent recording.
    /// </summary>
    public static string audioToString;

    public AudioSource aud;
    private int audioLength;          // recording duration in seconds

    public delegate void CallBack(string name);
    public delegate string ds();

    private static ToWord _toWord;
    public MicroPhoneManager m;

    private void Awake()
    {
        _toWord = this;
        // Fetch the OAuth token up front so it is ready before the first recognition request.
        StartCoroutine(GetToken(getTokenAPIPath));
    }

    public static ToWord GetInstance()
    {
        return _toWord;
    }

    /// <summary>
    /// Requests an OAuth access token from Baidu and stores it in <see cref="token"/>.
    /// </summary>
    /// <param name="url">Token endpoint URL.</param>
    private IEnumerator GetToken(string url)
    {
        WWWForm getTForm = new WWWForm();
        getTForm.AddField("grant_type", grant_Type);
        getTForm.AddField("client_id", client_ID);
        getTForm.AddField("client_secret", client_Secret);
        WWW getTW = new WWW(url, getTForm);
        yield return getTW;
        if (getTW.isDone)
        {
            if (getTW.error == null)
            {
                token = JsonMapper.ToObject(getTW.text)["access_token"].ToString();
                Debug.Log("获取百度用户令牌 初始化完成");
            }
            else
            {
                Debug.Log("error:" + getTW.error);
            }
        }
    }

    /// <summary>
    /// Starts recording from the default microphone into <see cref="aud"/>.
    /// </summary>
    /// <param name="durationTime">Maximum recording length in seconds.</param>
    public void StartMic(int durationTime)
    {
        if (Microphone.devices.Length == 0)
        {
            return;                               // no microphone available
        }
        Microphone.End(null);                     // stop any recording already in progress
        Debug.Log("Start");
        aud.clip = Microphone.Start(null, false, durationTime, rate);
    }

    /// <summary>
    /// Stops recording, optionally saves the clip as a WAV file, and starts the
    /// recognition request.
    /// </summary>
    /// <param name="cb">Invoked with the recognized text when the request completes; may be null.</param>
    /// <param name="info">Supplies the ID used to name the saved WAV file; pass null to skip saving.</param>
    public void EndMic(CallBack cb, BtnInfo info)
    {
        int lastPos = Microphone.GetPosition(null);
        if (Microphone.IsRecording(null))
        {
            audioLength = lastPos / rate;         // actual recording length in seconds
        }
        else
        {
            audioLength = 10;
        }
        Debug.Log("录音结束");
        Microphone.End(null);

        clipByte = GetClipData();
        // BUGFIX: GetClipData returns null when no clip was recorded; the original
        // dereferenced clipByte.Length unconditionally and would throw here.
        if (clipByte == null)
        {
            return;
        }
        len = clipByte.Length;
        speech = Convert.ToBase64String(clipByte);

        // BUGFIX: guard against a null BtnInfo — the original OnGUI path could not
        // supply one and this block would have dereferenced info.ID.
        if (info != null)
        {
            using (FileStream fs = CreateEmpty(Utils.GetAudioDataPath() + "/" + info.ID + "_1.wav"))
            {
                ConvertAndWrite(fs, aud.clip);
                WriteHeader(fs, aud.clip);
                Debug.Log("保存成功");
            }
        }
        StartCoroutine(GetAudioString(baiduAPI, cb));
    }

    /// <summary>
    /// Writes the standard 44-byte RIFF/WAVE header (16-bit PCM) at the start of the
    /// stream. Must be called after the sample data is written so stream.Length is final.
    /// </summary>
    private void WriteHeader(FileStream stream, AudioClip clip)
    {
        int hz = clip.frequency;
        int channels = clip.channels;
        int samples = clip.samples;

        stream.Seek(0, SeekOrigin.Begin);
        stream.Write(Encoding.UTF8.GetBytes("RIFF"), 0, 4);
        stream.Write(BitConverter.GetBytes(stream.Length - 8), 0, 4);      // ChunkSize (low 4 bytes)
        stream.Write(Encoding.UTF8.GetBytes("WAVE"), 0, 4);
        stream.Write(Encoding.UTF8.GetBytes("fmt "), 0, 4);
        stream.Write(BitConverter.GetBytes(16), 0, 4);                     // Subchunk1Size: 16 for PCM
        UInt16 one = 1;
        stream.Write(BitConverter.GetBytes(one), 0, 2);                    // AudioFormat = 1 (uncompressed PCM)
        stream.Write(BitConverter.GetBytes(channels), 0, 2);               // NumChannels (low 2 bytes of int)
        stream.Write(BitConverter.GetBytes(hz), 0, 4);                     // SampleRate
        stream.Write(BitConverter.GetBytes(hz * channels * 2), 0, 4);      // ByteRate = rate * channels * 2 bytes/sample
        UInt16 blockAlign = (ushort)(channels * 2);
        stream.Write(BitConverter.GetBytes(blockAlign), 0, 2);
        UInt16 bps = 16;
        stream.Write(BitConverter.GetBytes(bps), 0, 2);                    // BitsPerSample
        stream.Write(Encoding.UTF8.GetBytes("data"), 0, 4);
        stream.Write(BitConverter.GetBytes(samples * channels * 2), 0, 4); // Subchunk2Size
    }

    /// <summary>
    /// Creates the target file and reserves 44 zero bytes for the WAV header.
    /// The caller is responsible for disposing the returned stream.
    /// </summary>
    private FileStream CreateEmpty(string filepath)
    {
        FileStream fileStream = new FileStream(filepath, FileMode.Create);
        for (int i = 0; i < 44; i++)
        {
            fileStream.WriteByte(0);
        }
        return fileStream;
    }

    /// <summary>
    /// Converts the clip's float samples to 16-bit little-endian PCM and appends
    /// them to the stream.
    /// </summary>
    private void ConvertAndWrite(FileStream fileStream, AudioClip clip)
    {
        float[] samples = new float[clip.samples];
        clip.GetData(samples, 0);
        byte[] bytesData = new byte[samples.Length * 2];
        const int rescaleFactor = 32767;          // map [-1, 1] floats onto the Int16 range
        for (int i = 0; i < samples.Length; i++)
        {
            short sample16 = (short)(samples[i] * rescaleFactor);
            BitConverter.GetBytes(sample16).CopyTo(bytesData, i * 2);
        }
        fileStream.Write(bytesData, 0, bytesData.Length);
    }

    /// <summary>
    /// Posts the recorded audio to the Baidu recognition endpoint as JSON, stores the
    /// result in <see cref="audioToString"/>, and invokes <paramref name="cb"/>.
    /// </summary>
    /// <param name="url">Recognition endpoint URL.</param>
    /// <param name="cb">Invoked with the recognized text (empty string on error); may be null.</param>
    private IEnumerator GetAudioString(string url, CallBack cb)
    {
        JsonWriter jw = new JsonWriter();
        jw.WriteObjectStart();
        jw.WritePropertyName("format"); jw.Write(format);
        jw.WritePropertyName("rate"); jw.Write(rate);
        jw.WritePropertyName("channel"); jw.Write(channel);
        jw.WritePropertyName("token"); jw.Write(token);
        jw.WritePropertyName("cuid"); jw.Write(cuid);
        jw.WritePropertyName("len"); jw.Write(len);
        jw.WritePropertyName("speech"); jw.Write(speech);
        jw.WriteObjectEnd();

        // BUGFIX: the JSON body must be UTF-8; Encoding.Default is platform-dependent
        // and corrupts non-ASCII payloads on some systems.
        WWW getASW = new WWW(url, Encoding.UTF8.GetBytes(jw.ToString()));
        yield return getASW;
        if (getASW.isDone)
        {
            if (getASW.error == null)
            {
                JsonData getASWJson = JsonMapper.ToObject(getASW.text);
                if (getASWJson["err_msg"].ToString() == "success.")
                {
                    audioToString = getASWJson["result"][0].ToString();
                    // Strip the trailing comma Baidu appends to results.
                    // BUGFIX: EndsWith is safe on an empty string, unlike the original
                    // Substring(Length - 1) which throws when the result is "".
                    if (audioToString.EndsWith(","))
                    {
                        audioToString = audioToString.Substring(0, audioToString.Length - 1);
                    }
                }
            }
            else
            {
                audioToString = "";
                Debug.Log("error:" + getASW.error);
            }
            Debug.Log("此次语音文字为:" + audioToString);
            if (cb != null)
            {
                cb(audioToString);
            }
        }
    }

    /// <summary>
    /// Converts the recorded AudioClip's float samples to 16-bit little-endian PCM bytes.
    /// Returns null when no recording is available.
    /// </summary>
    public byte[] GetClipData()
    {
        if (aud.clip == null)
        {
            Debug.Log("录音数据为空");
            return null;
        }
        float[] samples = new float[aud.clip.samples];
        aud.clip.GetData(samples, 0);

        byte[] outData = new byte[samples.Length * 2];
        const int rescaleFactor = 32767;          // map [-1, 1] floats onto the Int16 range
        for (int i = 0; i < samples.Length; i++)
        {
            short sample16 = (short)(samples[i] * rescaleFactor);
            byte[] sampleBytes = BitConverter.GetBytes(sample16);
            outData[i * 2] = sampleBytes[0];
            outData[i * 2 + 1] = sampleBytes[1];
        }
        if (outData.Length <= 0)
        {
            Debug.Log("录音数据为空");
            return null;
        }
        return outData;
    }

    private void OnGUI()
    {
        // BUGFIX: the original called StartMic() and EndMic(null) with too few
        // arguments, which does not compile against the declared signatures.
        if (GUILayout.Button("Start"))
        {
            StartMic(10);                         // record for up to 10 seconds
        }
        if (GUILayout.Button("End"))
        {
            EndMic(null, null);                   // null BtnInfo: skip saving the WAV file
        }
    }

    public Text debugText;

    private void Update()
    {
        if (debugText != null)
        {
            debugText.text = audioToString;
        }
    }
}