Unity对接科大讯飞实时语音转写WebAPI(Windows平台)(一)

cheng219101 2024-07-08 16:03:02 阅读 75

科大讯飞官方文档:实时语音转写 API 文档 | 讯飞开放平台文档中心 (xfyun.cn)

参考文章:unity通过WebAPI连接Websocket实现讯飞语音识别与合成。_unity websocket audio-CSDN博客

        要实现语音转文字。首先我们需要从麦克风获取到语音数据,这里用到了Microphone类,Unity自带;其次,需要将语音数据发送给讯飞,这里用到的是WebSocketSharp.WebSocket,用习惯了。然后就是按照文档一步步踩坑了。

        直接贴代码了。代码主要实现握手阶段参数签名,实时通信阶段的数据传输以及结果解析。

<code>using System.Collections;

using System.Collections.Generic;

using UnityEngine;

using System;

using WebSocketSharp;

using System.Text;

using System.Security.Cryptography;

using LitJson;

using Newtonsoft.Json;

public class SpeechHelper : MonoBehaviour

{

public event Action<string> 语音识别完成事件; //语音识别回调事件

public AudioClip RecordedClip;

private string micphoneName = string.Empty;

WebSocket speechWebSocket;

private System.Action<string> resultCallback;

public void InitSpeechHelper(System.Action<string> textCallback)

{

resultCallback = textCallback;

}

public void StartSpeech()

{

if (speechWebSocket != null && speechWebSocket.ReadyState == WebSocketState.Open)

{

Debug.LogWarning("开始语音识别失败!,等待上次识别连接结束");

return;

}

if(Microphone.devices.Length <= 0)

{

Debug.LogWarning("找不到麦克风");

return;

}

messageQueue.Clear();

micphoneName = Microphone.devices[0];

Debug.Log("micphoneName:" + micphoneName);

try

{

RecordedClip = Microphone.Start(micphoneName, false, 60, 16000);

ConnectSpeechWebSocket();

}

catch(Exception ex)

{

Debug.LogError(ex.Message);

}

}

public void StopSpeech()

{

Microphone.End(micphoneName);

Debug.Log("识别结束,停止录音");

}

void ConnectSpeechWebSocket()

{

try

{

speechWebSocket = new WebSocket(GetWebSocketUrl());

}

catch (Exception ex)

{

UnityEngine.Debug.LogError(ex.Message);

return;

}

speechWebSocket.OnOpen += (sender, e) =>

{

Debug.Log("OnOpen");

speechWebSocket.OnClose += OnWebSocketClose;

};

speechWebSocket.OnMessage += OnInitMessage;

speechWebSocket.OnError += OnError;

speechWebSocket.ConnectAsync();

StartCoroutine(SendVoiceData());

}

void OnWebSocketClose(object sender, CloseEventArgs e)

{

Debug.Log("OnWebSocketClose");

}

private static Queue<string> messageQueue = new Queue<string>();

void OnInitMessage(object sender, MessageEventArgs e)

{

UnityEngine.Debug.Log("qqqqqqqqqqqqqWebSocket数据返回:" + e.Data);

messageQueue.Enqueue(e.Data);

}

private void MainThreadOnMessage(string message)

{

try

{

XFResponse response = JsonConvert.DeserializeObject<XFResponse>(message);

if (0 != response.code)

{

return;

}

if (response.action.Equals("result"))

{

var result = ParseXunfeiRecognitionResult(response.data);

if(result.IsFinal)

{

Debug.Log("Text最终:" + result.Text);

resultCallback?.Invoke(result.Text);

}else

{

Debug.Log("Text中间:" + result.Text);

}

}

}

catch (Exception ex)

{

Debug.LogError(ex.Message);

}

}

void OnError(object sender, ErrorEventArgs e)

{

UnityEngine.Debug.Log("WebSoclet:发生错误:" + e.Message);

}

public SpeechRecognitionResult ParseXunfeiRecognitionResult(string dataJson)

{

StringBuilder builder = new StringBuilder();

SpeechRecognitionResult res = new SpeechRecognitionResult();

try

{

JsonData data = JsonMapper.ToObject(dataJson);

JsonData cn = data["cn"];

JsonData st = cn["st"];

if (st["ed"].ToString().Equals("0"))

{

res.IsFinal = false;

}

else

{

res.IsFinal = true;

}

JsonData rtArry = st["rt"];

foreach (JsonData rtObject in rtArry)

{

JsonData wsArr = rtObject["ws"];

foreach (JsonData wsObject in wsArr)

{

JsonData cwArr = wsObject["cw"];

foreach (JsonData cwObject in cwArr)

{

builder.Append(cwObject["w"].ToString());

}

}

}

}catch(Exception ex)

{

Debug.LogError(ex.Message);

}

res.Text = builder.ToString();

return res;

}

void SendData(byte[] voiceData)

{

Debug.Log("SendData:" + voiceData.Length + ",time:" + Time.realtimeSinceStartup);

if (speechWebSocket.ReadyState != WebSocketState.Open)

{

return;

}

try

{

if (speechWebSocket != null && speechWebSocket.IsAlive)

{

speechWebSocket.SendAsync(voiceData, success =>

{

if (success)

{

UnityEngine.Debug.Log("WebSoclet:发送成功:" + voiceData.Length);

}

else

{

UnityEngine.Debug.Log("WebSoclet:发送失败:");

}

});

}

}

catch

{

}

}

void SendEndMsg(System.Action callback)

{

string endMsg = "{\"end\": true}";

byte[] data = Encoding.UTF8.GetBytes(endMsg);

try

{

if (speechWebSocket != null && speechWebSocket.IsAlive)

{

speechWebSocket.SendAsync(data, success =>

{

if (success)

{

UnityEngine.Debug.Log("WebSoclet:发送END成功:" + data.Length);

}

else

{

UnityEngine.Debug.Log("WebSoclet:发送END失败:");

}

callback?.Invoke();

});

}

}

catch

{

}

}

IEnumerator SendVoiceData()

{

yield return new WaitUntil(()=> (speechWebSocket.ReadyState == WebSocketState.Open));

yield return new WaitWhile(() => Microphone.GetPosition(micphoneName) <= 0);

float t = 0;

int position = Microphone.GetPosition(micphoneName);

const float waitTime = 0.04f;//每隔40ms发送音频

int lastPosition = 0;

const int Maxlength = 640;//最大发送长度

//Debug.Log("position:" + position + ",samples:" + RecordedClip.samples);

while (position < RecordedClip.samples && speechWebSocket.ReadyState == WebSocketState.Open)

{

t += waitTime;

yield return new WaitForSecondsRealtime(waitTime);

if (Microphone.IsRecording(micphoneName)) position = Microphone.GetPosition(micphoneName);

//Debug.Log("录音时长:" + t + "position=" + position + ",lastPosition=" + lastPosition);

if (position <= lastPosition)

{

Debug.LogWarning("字节流发送完毕!强制结束!");code>

break;

}

int length = position - lastPosition > Maxlength ? Maxlength : position - lastPosition;

byte[] date = GetClipData(lastPosition, length, RecordedClip);

SendData(date);

lastPosition = lastPosition + length;

}

yield return new WaitForSecondsRealtime(waitTime);

SendEndMsg(null);

Microphone.End(micphoneName);

}

public byte[] GetClipData(int star, int length, AudioClip recordedClip)

{

float[] soundata = new float[length];

recordedClip.GetData(soundata, star);

int rescaleFactor = 32767;

byte[] outData = new byte[soundata.Length * 2];

for (int i = 0; i < soundata.Length; i++)

{

short temshort = (short)(soundata[i] * rescaleFactor);

byte[] temdata = BitConverter.GetBytes(temshort);

outData[i * 2] = temdata[0];

outData[i * 2 + 1] = temdata[1];

}

return outData;

}

private string GetWebSocketUrl()

{

string appid = "appid";

string ts = GetCurrentUnixTimestampMillis().ToString();

string baseString = appid + ts;

string md5 = GetMD5Hash(baseString);

UnityEngine.Debug.Log("baseString:" + baseString + ",md5:" + md5);

string sha1 = CalculateHmacSha1(md5, "appkey");

string signa = sha1;

string url = string.Format("ws://rtasr.xfyun.cn/v1/ws?appid={0}&ts={1}&signa={2}", appid, ts, signa);

UnityEngine.Debug.Log(url);

return url;

}

private long GetCurrentUnixTimestampMillis()

{

DateTime unixStartTime = new DateTime(1970, 1, 1).ToLocalTime();

DateTime now = DateTime.Now;// DateTime.UtcNow;

TimeSpan timeSpan = now - unixStartTime;

long timestamp = (long)timeSpan.TotalSeconds;

return timestamp;

}

public string GetMD5Hash(string input)

{

MD5 md5Hasher = MD5.Create();

byte[] data = md5Hasher.ComputeHash(Encoding.Default.GetBytes(input));

StringBuilder sBuilder = new StringBuilder();

for (int i = 0; i < data.Length; i++)

{

sBuilder.Append(data[i].ToString("x2"));

}

return sBuilder.ToString();

}

public string CalculateHmacSha1(string data, string key)

{

HMACSHA1 hmac = new HMACSHA1(Encoding.UTF8.GetBytes(key));

byte[] hashBytes = hmac.ComputeHash(Encoding.UTF8.GetBytes(data));

return Convert.ToBase64String(hashBytes);

}

private void Update()

{

if(messageQueue.Count > 0)

{

MainThreadOnMessage(messageQueue.Dequeue());

}

}

}

Json解析类。

[Serializable]

public struct XFResponse

{

public string action;

public int code;

public string data;

public string desc;

public string sid;

}

[Serializable]

public struct SpeechRecognitionResult

{

public string Text;

public bool IsFinal;

}

值得注意的问题。

1、Microphone使用时传默认设备名比传null好使

2、握手阶段时间戳用的是秒(不是毫秒)

3、上传结束标志时,也要间隔40ms,否则讯飞像是没收到一样

4、如果Microphone.devices的长度为0,电脑确实又有麦克风设备,那么可能是麦克风的名字是中文导致的

遗留问题:

yield return new WaitForSecondsRealtime(0.04f)实际间隔时间0.1s左右,导致消息发送得很慢,语音识别慢。

2024.5.24更新第二篇,有效解决消息发送慢,识别慢的问题

2024.6.19更新:取消协程中发送数据,直接在Update中发送。解决消息发送很慢问题

private void Update()

{

if (isRunning)

{

byte[] voiceData = GetVoiveData();

if (voiceData != null)

{

SendData(voiceData);

}

}

if (messageQueue.Count > 0)

{

MainThreadOnMessage(messageQueue.Dequeue());

}

}

private int last_length = -1;

private float[] volumeData = new float[9999];

private short[] intData = new short[9999];

bool isRunning;

private byte[] GetVoiveData()

{

if (RecordedClip == null)

{

return null;

}

int new_length = Microphone.GetPosition(null);

if (new_length == last_length)

{

if (Microphone.devices.Length == 0)

{

isRunning = false;

}

return null;

}

int length = new_length - last_length;

int offset = last_length + 1;

last_length = new_length;

if (offset < 0)

{

return null;

}

if (length < 0)

{

float[] temp = new float[RecordedClip.samples];

RecordedClip.GetData(temp, 0);

int lengthTail = RecordedClip.samples - offset;

int lengthHead = new_length + 1;

try

{

Array.Copy(temp, offset, volumeData, 0, lengthTail);

Array.Copy(temp, 0, volumeData, lengthTail + 1, lengthHead);

length = lengthTail + lengthHead;

}

catch (Exception)

{

return null;

}

}

else

{

if (length > volumeData.Length)

{

volumeData = new float[length];

intData = new short[length];

}

RecordedClip.GetData(volumeData, offset);

}

byte[] bytesData = new byte[length * 2];

int rescaleFactor = 32767; //to convert float to Int16

for (int i = 0; i < length; i++)

{

intData[i] = (short)(volumeData[i] * rescaleFactor);

byte[] byteArr = BitConverter.GetBytes(intData[i]);

byteArr.CopyTo(bytesData, i * 2);

}

return bytesData;

}



声明

本文内容仅代表作者观点,或转载于其他网站,本站不以此文作为商业用途
如有涉及侵权,请联系本站进行删除
转载本站原创文章,请注明来源及作者。