Unity对接科大讯飞实时语音转写WebAPI(Windows平台)(一)
cheng219101 2024-07-08 16:03:02 阅读 75
科大讯飞官方文档:实时语音转写 API 文档 | 讯飞开放平台文档中心 (xfyun.cn)
参考文章:unity通过WebAPI连接Websocket实现讯飞语音识别与合成。_unity websocket audio-CSDN博客
要实现语音转文字。首先我们需要从麦克风获取到语音数据,这里用到了Microphone类,Unity自带;其次,需要将语音数据发送给讯飞,这里用到的是WebSocketSharp.WebSocket,用习惯了。然后就是按照文档一步步踩坑了。
直接贴代码了。代码主要实现握手阶段参数签名,实时通信阶段的数据传输以及结果解析。
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System;
using WebSocketSharp;
using System.Text;
using System.Security.Cryptography;
using LitJson;
using Newtonsoft.Json;
/// <summary>
/// Streams microphone audio to iFlytek's real-time speech transcription
/// WebSocket API (rtasr.xfyun.cn) and surfaces recognized text on the Unity
/// main thread via <see cref="InitSpeechHelper"/> callback and the
/// <see cref="语音识别完成事件"/> event.
/// </summary>
public class SpeechHelper : MonoBehaviour
{
    /// <summary>Raised on the main thread with each FINAL recognition result.</summary>
    public event Action<string> 语音识别完成事件; // recognition-complete callback event

    /// <summary>Clip receiving the raw microphone recording (16 kHz mono).</summary>
    public AudioClip RecordedClip;

    private string micphoneName = string.Empty;   // capture device name in use
    WebSocket speechWebSocket;                    // connection to rtasr.xfyun.cn
    private System.Action<string> resultCallback; // consumer callback for final text

    // Messages arrive on the WebSocket's worker thread; they are queued here
    // and drained in Update() so parsing/callbacks run on the main thread.
    // Queue<T> is not thread-safe, so every access is guarded by messageLock.
    private static Queue<string> messageQueue = new Queue<string>();
    private static readonly object messageLock = new object();

    /// <summary>Registers the callback that receives final recognition text.</summary>
    public void InitSpeechHelper(System.Action<string> textCallback)
    {
        resultCallback = textCallback;
    }

    /// <summary>
    /// Starts recording from the first microphone and opens the rtasr
    /// connection. Refuses to start while a previous session is still open.
    /// </summary>
    public void StartSpeech()
    {
        if (speechWebSocket != null && speechWebSocket.ReadyState == WebSocketState.Open)
        {
            Debug.LogWarning("开始语音识别失败!,等待上次识别连接结束");
            return;
        }
        if (Microphone.devices.Length <= 0)
        {
            Debug.LogWarning("找不到麦克风");
            return;
        }
        lock (messageLock)
        {
            messageQueue.Clear();
        }
        // Passing the explicit device name is more reliable than null here.
        micphoneName = Microphone.devices[0];
        Debug.Log("micphoneName:" + micphoneName);
        try
        {
            // Non-looping 60 s recording at 16 kHz — the sample rate rtasr expects.
            RecordedClip = Microphone.Start(micphoneName, false, 60, 16000);
            ConnectSpeechWebSocket();
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    /// <summary>
    /// Stops the microphone; the send coroutine then detects the stalled
    /// position, sends the end marker and lets the server close the socket.
    /// </summary>
    public void StopSpeech()
    {
        Microphone.End(micphoneName);
        Debug.Log("识别结束,停止录音");
    }

    // Builds the signed URL, wires the socket callbacks and starts streaming.
    void ConnectSpeechWebSocket()
    {
        try
        {
            speechWebSocket = new WebSocket(GetWebSocketUrl());
        }
        catch (Exception ex)
        {
            UnityEngine.Debug.LogError(ex.Message);
            return;
        }
        speechWebSocket.OnOpen += (sender, e) =>
        {
            Debug.Log("OnOpen");
        };
        // Subscribe before connecting so an early close is never missed
        // (the original subscribed OnClose from inside OnOpen).
        speechWebSocket.OnClose += OnWebSocketClose;
        speechWebSocket.OnMessage += OnInitMessage;
        speechWebSocket.OnError += OnError;
        speechWebSocket.ConnectAsync();
        StartCoroutine(SendVoiceData());
    }

    void OnWebSocketClose(object sender, CloseEventArgs e)
    {
        Debug.Log("OnWebSocketClose");
    }

    // Socket-thread callback: queue the payload for main-thread processing.
    void OnInitMessage(object sender, MessageEventArgs e)
    {
        UnityEngine.Debug.Log("WebSocket数据返回:" + e.Data);
        lock (messageLock)
        {
            messageQueue.Enqueue(e.Data);
        }
    }

    // Runs on the main thread; parses one rtasr message and dispatches results.
    private void MainThreadOnMessage(string message)
    {
        try
        {
            XFResponse response = JsonConvert.DeserializeObject<XFResponse>(message);
            if (0 != response.code)
            {
                // Non-zero code means the server rejected the session/frame.
                Debug.LogWarning("讯飞返回错误 code:" + response.code + ",desc:" + response.desc);
                return;
            }
            if (string.Equals(response.action, "result"))
            {
                var result = ParseXunfeiRecognitionResult(response.data);
                if (result.IsFinal)
                {
                    Debug.Log("Text最终:" + result.Text);
                    resultCallback?.Invoke(result.Text);
                    语音识别完成事件?.Invoke(result.Text); // event was declared but never raised
                }
                else
                {
                    Debug.Log("Text中间:" + result.Text);
                }
            }
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    void OnError(object sender, ErrorEventArgs e)
    {
        UnityEngine.Debug.Log("WebSocket:发生错误:" + e.Message);
    }

    /// <summary>
    /// Parses the rtasr "data" JSON (cn.st.rt[].ws[].cw[].w) into plain text.
    /// cn.st.ed == "0" marks an intermediate result; anything else is final.
    /// </summary>
    public SpeechRecognitionResult ParseXunfeiRecognitionResult(string dataJson)
    {
        StringBuilder builder = new StringBuilder();
        SpeechRecognitionResult res = new SpeechRecognitionResult();
        try
        {
            JsonData data = JsonMapper.ToObject(dataJson);
            JsonData st = data["cn"]["st"];
            res.IsFinal = !st["ed"].ToString().Equals("0");
            // Concatenate every candidate word of every word segment.
            foreach (JsonData rtObject in st["rt"])
            {
                foreach (JsonData wsObject in rtObject["ws"])
                {
                    foreach (JsonData cwObject in wsObject["cw"])
                    {
                        builder.Append(cwObject["w"].ToString());
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
        res.Text = builder.ToString();
        return res;
    }

    // Sends one PCM16 chunk; drops the data silently if the socket is not open.
    void SendData(byte[] voiceData)
    {
        Debug.Log("SendData:" + voiceData.Length + ",time:" + Time.realtimeSinceStartup);
        // Null check must come first — the original read ReadyState before
        // checking speechWebSocket for null.
        if (speechWebSocket == null || speechWebSocket.ReadyState != WebSocketState.Open)
        {
            return;
        }
        try
        {
            if (speechWebSocket.IsAlive)
            {
                speechWebSocket.SendAsync(voiceData, success =>
                {
                    if (success)
                    {
                        UnityEngine.Debug.Log("WebSocket:发送成功:" + voiceData.Length);
                    }
                    else
                    {
                        UnityEngine.Debug.Log("WebSocket:发送失败:");
                    }
                });
            }
        }
        catch (Exception ex)
        {
            // Was an empty catch — at least record the failure.
            Debug.LogError(ex.Message);
        }
    }

    // Sends the {"end": true} marker telling the server the audio stream is done.
    // The callback fires after the async send completes (success or not).
    void SendEndMsg(System.Action callback)
    {
        string endMsg = "{\"end\": true}";
        byte[] data = Encoding.UTF8.GetBytes(endMsg);
        try
        {
            if (speechWebSocket != null && speechWebSocket.IsAlive)
            {
                speechWebSocket.SendAsync(data, success =>
                {
                    if (success)
                    {
                        UnityEngine.Debug.Log("WebSocket:发送END成功:" + data.Length);
                    }
                    else
                    {
                        UnityEngine.Debug.Log("WebSocket:发送END失败:");
                    }
                    callback?.Invoke();
                });
            }
        }
        catch (Exception ex)
        {
            // Was an empty catch — at least record the failure.
            Debug.LogError(ex.Message);
        }
    }

    // Streams recorded samples to the server in ~40 ms PCM16 chunks, then
    // sends the end marker and stops the microphone.
    IEnumerator SendVoiceData()
    {
        yield return new WaitUntil(() => (speechWebSocket.ReadyState == WebSocketState.Open));
        yield return new WaitWhile(() => Microphone.GetPosition(micphoneName) <= 0);
        float t = 0;
        int position = Microphone.GetPosition(micphoneName);
        const float waitTime = 0.04f; // send audio every 40 ms
        int lastPosition = 0;
        const int Maxlength = 640;    // max samples per send (640 samples = 40 ms @ 16 kHz)
        while (position < RecordedClip.samples && speechWebSocket.ReadyState == WebSocketState.Open)
        {
            t += waitTime;
            yield return new WaitForSecondsRealtime(waitTime);
            if (Microphone.IsRecording(micphoneName)) position = Microphone.GetPosition(micphoneName);
            if (position <= lastPosition)
            {
                // Recording stopped and everything captured has been sent.
                Debug.LogWarning("字节流发送完毕!强制结束!");
                break;
            }
            int length = position - lastPosition > Maxlength ? Maxlength : position - lastPosition;
            byte[] data = GetClipData(lastPosition, length, RecordedClip);
            SendData(data);
            lastPosition = lastPosition + length;
        }
        // Wait one more 40 ms interval before the end marker, otherwise the
        // server appears to miss it (see article note 3).
        yield return new WaitForSecondsRealtime(waitTime);
        SendEndMsg(null);
        Microphone.End(micphoneName);
    }

    /// <summary>
    /// Copies <paramref name="length"/> samples starting at sample index
    /// <paramref name="star"/> from the clip and converts them to
    /// little-endian 16-bit PCM bytes (2 bytes per sample).
    /// </summary>
    public byte[] GetClipData(int star, int length, AudioClip recordedClip)
    {
        float[] soundata = new float[length];
        recordedClip.GetData(soundata, star);
        const int rescaleFactor = 32767; // map [-1, 1] floats onto the Int16 range
        byte[] outData = new byte[soundata.Length * 2];
        for (int i = 0; i < soundata.Length; i++)
        {
            short temshort = (short)(soundata[i] * rescaleFactor);
            byte[] temdata = BitConverter.GetBytes(temshort);
            outData[i * 2] = temdata[0];
            outData[i * 2 + 1] = temdata[1];
        }
        return outData;
    }

    // Builds the signed handshake URL.
    // rtasr handshake: signa = Base64(HmacSHA1(MD5(appid + ts), apiKey)),
    // where ts is a Unix timestamp in SECONDS (article note 2).
    private string GetWebSocketUrl()
    {
        string appid = "appid";   // TODO: replace with your real appid
        string ts = GetUnixTimestampSeconds().ToString();
        string baseString = appid + ts;
        string md5 = GetMD5Hash(baseString);
        UnityEngine.Debug.Log("baseString:" + baseString + ",md5:" + md5);
        string signa = CalculateHmacSha1(md5, "appkey"); // TODO: real apiKey
        // Base64 may contain '+', '/' or '=' — it must be URL-escaped or the
        // server will reject the signature when those characters appear.
        string url = string.Format("ws://rtasr.xfyun.cn/v1/ws?appid={0}&ts={1}&signa={2}",
            appid, ts, Uri.EscapeDataString(signa));
        UnityEngine.Debug.Log(url);
        return url;
    }

    // Current Unix timestamp in seconds. (The original was misleadingly named
    // ...Millis while computing seconds; it is private, so renaming is safe.)
    private long GetUnixTimestampSeconds()
    {
        return DateTimeOffset.UtcNow.ToUnixTimeSeconds();
    }

    /// <summary>Lower-case hex MD5 digest of <paramref name="input"/> (UTF-8).</summary>
    public string GetMD5Hash(string input)
    {
        using (MD5 md5Hasher = MD5.Create()) // dispose the provider
        {
            byte[] data = md5Hasher.ComputeHash(Encoding.UTF8.GetBytes(input));
            StringBuilder sBuilder = new StringBuilder();
            for (int i = 0; i < data.Length; i++)
            {
                sBuilder.Append(data[i].ToString("x2"));
            }
            return sBuilder.ToString();
        }
    }

    /// <summary>Base64 HMAC-SHA1 of <paramref name="data"/> keyed by <paramref name="key"/> (UTF-8).</summary>
    public string CalculateHmacSha1(string data, string key)
    {
        using (HMACSHA1 hmac = new HMACSHA1(Encoding.UTF8.GetBytes(key))) // dispose the HMAC
        {
            byte[] hashBytes = hmac.ComputeHash(Encoding.UTF8.GetBytes(data));
            return Convert.ToBase64String(hashBytes);
        }
    }

    // Drains at most one queued server message per frame onto the main thread.
    private void Update()
    {
        string message = null;
        lock (messageLock)
        {
            if (messageQueue.Count > 0)
            {
                message = messageQueue.Dequeue();
            }
        }
        if (message != null)
        {
            MainThreadOnMessage(message);
        }
    }
}
Json解析类。
// Envelope of every message pushed by the rtasr WebSocket server.
// Field names must match the JSON keys — do not rename.
[Serializable]
public struct XFResponse
{
public string action; // message type; "result" carries recognition data
public int code; // 0 on success; any other value is an error and the message is ignored
public string data; // nested JSON payload parsed by ParseXunfeiRecognitionResult when action == "result"
public string desc; // presumably a human-readable status description — TODO confirm against the rtasr docs
public string sid; // presumably the server-assigned session id — TODO confirm against the rtasr docs
}
// One parsed recognition result extracted from an rtasr "result" message.
[Serializable]
public struct SpeechRecognitionResult
{
public string Text; // concatenated "w" word fields from cn.st.rt[].ws[].cw[]
public bool IsFinal; // true when cn.st.ed != "0", i.e. a final (not intermediate) result
}
值得注意的问题。
1、Microphone使用时传默认设备名比传null好使
2、握手阶段时间戳用的是秒(不是毫秒)
3、上传结束标志时,也要间隔40ms,否则讯飞像是没收到一样
4、如果Microphone.devices的长度为0,电脑确实又有麦克风设备,那么可能是麦克风的名字是中文导致的
遗留问题:
yield return new WaitForSecondsRealtime(0.04f)实际间隔时间0.1s左右,导致消息发送得很慢,语音识别慢。
2024.5.24更新第二篇,有效解决消息发送慢,识别慢的问题
2024.6.19更新:取消协程中发送数据,直接在Update中发送。解决消息发送很慢问题
private void Update()
{
    // While a session is active, pull any newly recorded samples and push
    // them to the server immediately (faster than the coroutine approach).
    if (isRunning)
    {
        byte[] chunk = GetVoiveData();
        if (chunk != null)
        {
            SendData(chunk);
        }
    }
    // Hand one queued server message per frame to the main-thread parser.
    if (messageQueue.Count > 0)
    {
        MainThreadOnMessage(messageQueue.Dequeue());
    }
}
private int last_length = -1; // microphone position consumed so far; -1 = nothing read yet
private float[] volumeData = new float[9999]; // reusable buffer for raw float samples from the clip
private short[] intData = new short[9999]; // reusable buffer for the PCM16 conversion
bool isRunning; // true while Update() should keep streaming audio
// Returns the samples recorded since the last call, converted to
// little-endian 16-bit PCM bytes, or null when nothing new is available.
// Fixes vs. the original: the +1 offset arithmetic read one not-yet-written
// sample per chunk (inconsistent with the correct [last, pos) pattern used
// by SendVoiceData); the wrap-around copy left a one-sample gap of stale
// data at index lengthTail; the wrap branch never grew the buffers, risking
// IndexOutOfRangeException on a large wrapped chunk.
private byte[] GetVoiveData()
{
    if (RecordedClip == null)
    {
        return null;
    }
    int newPosition = Microphone.GetPosition(null);
    if (newPosition == last_length)
    {
        // No new samples; if the device disappeared, stop streaming.
        if (Microphone.devices.Length == 0)
        {
            isRunning = false;
        }
        return null;
    }
    // last_length is the first unread sample index (-1 means "start at 0").
    int offset = last_length < 0 ? 0 : last_length;
    int length = newPosition - offset;
    last_length = newPosition;
    if (length == 0)
    {
        return null;
    }
    if (length < 0)
    {
        // The circular recording buffer wrapped: read the tail [offset, samples)
        // followed by the head [0, newPosition).
        float[] temp = new float[RecordedClip.samples];
        RecordedClip.GetData(temp, 0);
        int lengthTail = RecordedClip.samples - offset;
        int lengthHead = newPosition;
        length = lengthTail + lengthHead;
        EnsureBufferCapacity(length);
        try
        {
            Array.Copy(temp, offset, volumeData, 0, lengthTail);
            // Head goes directly after the tail — no gap.
            Array.Copy(temp, 0, volumeData, lengthTail, lengthHead);
        }
        catch (Exception)
        {
            return null;
        }
    }
    else
    {
        EnsureBufferCapacity(length);
        RecordedClip.GetData(volumeData, offset);
    }
    // Convert float samples in [-1, 1] to little-endian Int16 bytes.
    byte[] bytesData = new byte[length * 2];
    const int rescaleFactor = 32767; // to convert float to Int16
    for (int i = 0; i < length; i++)
    {
        intData[i] = (short)(volumeData[i] * rescaleFactor);
        byte[] byteArr = BitConverter.GetBytes(intData[i]);
        byteArr.CopyTo(bytesData, i * 2);
    }
    return bytesData;
}

// Grows the reusable sample buffers when a chunk exceeds their capacity.
private void EnsureBufferCapacity(int length)
{
    if (length > volumeData.Length)
    {
        volumeData = new float[length];
        intData = new short[length];
    }
}
声明
本文内容仅代表作者观点,或转载于其他网站,本站不以此文作为商业用途
如有涉及侵权,请联系本站进行删除
转载本站原创文章,请注明来源及作者。