智能客服客戶端程序開發

這個國慶節作死,答應別人在七天內做2個項目。智能客服項目是其中一個,當時是答應給隔壁兄弟團隊做的。他們說實在搞不定了,節後要上線,我就因爲隨口說了一句:“這有什麼難的”,結果禍從口出,這事情就落我頭上了。

錄音識別是智能客服大項目計劃裏面的一部分,簡單說就是客服在跟用戶聊天的時候,實時從聲卡上抓取音頻數據,然後發送給阿里雲-智能語音交互識別成句子文本後,再發送給我們的智能客服助手服務器,生成提示展示給客服人員。聽起來很簡單,我當時也這麼認爲,但做完這個之後,我覺得以後沒啥事還是少去隔壁團隊那兒串門,^^。

這個小程序,步驟上分3步:錄音、識別、提交。

1. 錄音

本來是想着用java來寫,開發能快點,而且java也有相應的AudioSystem音頻處理模塊,應該能輕鬆搞定。但是後來發現AudioSystem只能從麥克風這種音頻輸入設備讀取數據,要想從聲卡抓取音頻數據只能用C++調用WASAPI來獲取。網上也有不少例子,可以拿來借鑑。但這兒當時遇到的最大問題是,我抓取的音頻格式是PCM FLOAT 32位,做音頻處理的時候不方便,而且有些的音頻處理程序不支持(比如: java的AudioSystem),所以我轉換成PCM SIGNED 16位。另外,音頻數據在發送給識別服務之前要按照要求轉換成單軌方式,採樣率也要調整成16000Hz。音頻處理比較頭疼,開始打算用ffmpeg來處理,但是這個庫太重,用起來太複雜,後來就自己上網查資料,自己寫算法轉換的。

另外,網上也有種說法,就是說要在windows系統上進行錄音,需要打開立體聲混音(Stereo Mix)設備。其實是不需要的,除非你想同時進行聲卡和麥克風錄音。僅僅從聲卡錄音,通過WASAPI就足夠了。

2. 識別

將聲卡抓取的音頻數據調整成能被識別的格式之後,就可以調用阿里雲提供的SDK進行發送了。因爲文檔比較齊全,所以這塊兒還是挺順利的。

3. 提交

因爲智能服務接口是restful的,所以我這兒只用libcurl來進行HTTP處理。libcurl當時在編譯成靜態庫後,鏈接時總是報 找不到函數入口。開始以爲是忘記加extern "C",加了之後還是報錯。最後,查了下資料,按照這個網頁(https://blog.csdn.net/libaineu2004/article/details/79736921)的指導操作了下就好了。

4. 參考代碼

其他模塊都沒什麼難度,主要是音頻處理部分,包括,位深轉換,採樣率轉換,單軌調整。主要還是基於網上的代碼做了少量的修改。

Capture.h
#pragma once

#pragma comment(lib,"avrt.lib")
#include <Audioclient.h>
#include <mmdeviceapi.h>
#include<iostream>
#include<avrt.h>
#include <vector>

// RIFF file header chunk of a canonical WAV file.
typedef struct WAVE_HEADER {
	char    fccID[4];       // chunk id, must be "RIFF"
	unsigned long dwSize;   // filled in last: RIFF chunk size = 36 + payload bytes
	char    fccType[4];     // format tag, must be "WAVE"
}WAVE_HEADER;

// "fmt " chunk describing the PCM encoding of the payload.
typedef struct WAVE_FMT {
	char    fccID[4];          // chunk id, must be "fmt "
	unsigned long  dwSize;     // size of the remainder of this chunk; 16 for plain PCM
	unsigned short wFormatTag; // audio format code; 1 = PCM
	unsigned short wChannels;  // channel count: mono = 1, stereo = 2
	unsigned long  dwSamplesPerSec;// sample rate in Hz
	unsigned long  dwAvgBytesPerSec;/* ==dwSamplesPerSec*wChannels*uiBitsPerSample/8 */
	unsigned short wBlockAlign;//==wChannels*uiBitsPerSample/8
	unsigned short uiBitsPerSample;//bits per sample point: 8 or 16
}WAVE_FMT;

// "data" chunk header; the raw PCM bytes follow immediately after it.
typedef struct WAVE_DATA {
	char    fccID[4];       // chunk id, must be "data"
	unsigned long dwSize;   // ==NumSamples*wChannels*uiBitsPerSample/8
}WAVE_DATA;

// Captures the audio currently being rendered to the default output device
// ("what you hear") through the WASAPI loopback interface, converts it to
// 16-bit signed PCM, and can resample/downmix it for speech recognition.
// Typical usage: start() once, cap() repeatedly, then stop().
class Capture
{
public:
	Capture();

	// Opens the default render endpoint in loopback mode and starts the
	// stream. Returns 0 on success, -1 on any COM/WASAPI failure.
	int start();
	// Stops the stream and releases the session's COM objects. Returns 0.
	int stop();
	// Drains the currently available packet into `buffer`, resampled to
	// `rate` Hz and reduced to one channel. Returns the buffer byte count.
	// NOTE(review): `channels` is not consulted by the implementation.
	int cap(std::vector<BYTE> &buffer, int rate, int channels);
	// Wraps the raw PCM in `buffer` with a 44-byte WAV header, in place.
	// Returns the new size of `buffer`.
	int wav(std::vector<BYTE>& buffer, int rate, int channels);
private:
	// Rewrites a float (or extensible-float) mix format to 16-bit PCM.
	bool adjustFormatTo16Bits(WAVEFORMATEX *pwfx);
	// Reads one packet from the capture client and appends it to `buffer`.
	int read(std::vector<BYTE> &buffer);
	// Integer-ratio sample-rate conversion of `buffer` to `rate` Hz.
	int resample(std::vector<BYTE> &buffer, int rate);
	// Keeps only the first channel of the interleaved data in `buffer`.
	int singleChannel(std::vector<BYTE> &buffer);
	

	IAudioCaptureClient * m_pAudioCaptureClient; // packet reader for the stream
	IAudioClient * m_pAudioClient;               // WASAPI client (loopback mode)
	WAVEFORMATEX * m_pwfx;                       // mix format after the 16-bit fixup
	IMMDevice* m_pMMDevice;                      // default render endpoint
	size_t m_FrameSize;                          // bytes per frame = bytes/sample * channels

	int m_SampleRate; // capture sample rate in Hz; -1 before start()
	int m_Channels;   // capture channel count; -1 before start()
};
Capture.cpp
#include "Capture.h"

// Early-exit helpers used by the methods below: on failure they balance the
// CoInitialize() done at the top of the calling method and report -1.
#define RETURN_ON_ERROR(hr) if(FAILED(hr)){CoUninitialize();return -1;}
#define RETURN_ON_NULL(p) if(p==NULL){CoUninitialize();return -1;}
#define RETURN_ON_FALSE(b) if(!b){CoUninitialize();return -1;}

/*
 * Rewrites the shared-mode mix format in place so the stream is captured as
 * 16-bit signed PCM instead of 32-bit float. Handles both the plain
 * WAVE_FORMAT_IEEE_FLOAT layout and the WAVEFORMATEXTENSIBLE wrapper.
 * Returns true when the format was rewritten, false when it was left as-is.
 */
bool Capture::adjustFormatTo16Bits(WAVEFORMATEX *pwfx)
{
	if (pwfx->wFormatTag == WAVE_FORMAT_IEEE_FLOAT)
	{
		pwfx->wFormatTag = WAVE_FORMAT_PCM;
	}
	else if (pwfx->wFormatTag == WAVE_FORMAT_EXTENSIBLE)
	{
		PWAVEFORMATEXTENSIBLE pExtensible = reinterpret_cast<PWAVEFORMATEXTENSIBLE>(pwfx);
		if (!IsEqualGUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, pExtensible->SubFormat))
		{
			return false; // extensible, but not float: leave untouched
		}
		pExtensible->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
		pExtensible->Samples.wValidBitsPerSample = 16;
	}
	else
	{
		return false; // neither plain float nor extensible: leave untouched
	}

	// Common fixup: shrink the sample size and keep the derived fields consistent.
	pwfx->wBitsPerSample = 16;
	pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
	pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
	return true;
}

// Puts the object into the "not started" state: all COM interface pointers
// null, frame size zero, and the format fields marked unknown (-1).
Capture::Capture()
	: m_pAudioCaptureClient(NULL)
	, m_pAudioClient(NULL)
	, m_pwfx(NULL)
	, m_pMMDevice(NULL)
	, m_FrameSize(0)
	, m_SampleRate(-1)
	, m_Channels(-1)
{
}

// Opens the default render endpoint in shared loopback mode, switches the
// mix format to 16-bit PCM and starts capturing.
// Returns 0 on success, -1 on any failure (the RETURN_ON_ERROR macro also
// balances the CoInitialize call before returning).
int Capture::start() {
	CoInitialize(NULL);
	IMMDeviceEnumerator *pMMDeviceEnumerator = NULL;
	HRESULT hr = CoCreateInstance(__uuidof(MMDeviceEnumerator), NULL, CLSCTX_ALL,
		__uuidof(IMMDeviceEnumerator), (void**)&pMMDeviceEnumerator);
	RETURN_ON_ERROR(hr);

	hr = pMMDeviceEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &m_pMMDevice);
	// Release the enumerator BEFORE the error check: the original code
	// returned through the macro on failure and leaked this reference.
	pMMDeviceEnumerator->Release();
	RETURN_ON_ERROR(hr);

	hr = m_pMMDevice->Activate(__uuidof(IAudioClient), CLSCTX_ALL, NULL, (void**)&m_pAudioClient);
	RETURN_ON_ERROR(hr);

	REFERENCE_TIME hnsDefaultDevicePeriod(0);
	hr = m_pAudioClient->GetDevicePeriod(&hnsDefaultDevicePeriod, NULL);
	RETURN_ON_ERROR(hr);

	hr = m_pAudioClient->GetMixFormat(&m_pwfx);
	RETURN_ON_ERROR(hr);

	// Convert the (normally 32-bit float) mix format to signed 16-bit PCM.
	// A false return just means the format was not float to begin with;
	// Initialize() below will reject it if it is genuinely unusable.
	adjustFormatTo16Bits(m_pwfx);

	m_FrameSize = (m_pwfx->wBitsPerSample / 8)*m_pwfx->nChannels;

	// AUDCLNT_STREAMFLAGS_LOOPBACK captures what the device is rendering,
	// i.e. the sound-card output rather than a microphone.
	hr = m_pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK, 0, 0, m_pwfx, 0);
	RETURN_ON_ERROR(hr);

	hr = m_pAudioClient->GetService(__uuidof(IAudioCaptureClient), (void**)&m_pAudioCaptureClient);
	RETURN_ON_ERROR(hr);

	hr = m_pAudioClient->Start();
	RETURN_ON_ERROR(hr);

	CoUninitialize();

	m_Channels = m_pwfx->nChannels;
	m_SampleRate = m_pwfx->nSamplesPerSec;
	return 0;
}

int Capture::stop() {
	if (m_pAudioClient)
	{
		m_pAudioClient->Stop();
		m_pAudioClient->Release();
		m_pAudioClient = NULL;
	}
	if (m_pwfx != NULL)
	{
		CoTaskMemFree(m_pwfx);
		m_pwfx = NULL;
	}
	if (m_pAudioCaptureClient != NULL)
	{
		m_pAudioCaptureClient->Release();
		m_pAudioCaptureClient = NULL;
	}
	return 0;
}

// Grabs whatever audio is currently available, converts it to `rate` Hz
// mono, and appends it to `buffer`. Returns the resulting buffer size.
// (`channels` is accepted for interface symmetry with wav() but is not
// consulted: the output is always reduced to a single channel.)
int Capture::cap(std::vector<BYTE> &buffer, int rate, int channels)
{
	read(buffer);            // pull the next packet off the capture client
	resample(buffer, rate);  // integer-ratio SRC to the requested rate
	singleChannel(buffer);   // drop everything but the first channel
	return (int)buffer.size();
}

// Fetches the next available packet from the WASAPI capture client and
// appends its raw bytes to `buffer`. Returns the number of bytes appended
// (0 when no packet is pending), or -1 on a COM failure.
int Capture::read(std::vector<BYTE> &buffer) {
	DWORD dwWaitResult;          // NOTE(review): unused
	UINT32 nNextPacketSize(0);   // NOTE(review): unused
	BYTE *pData = NULL;          // points into the shared capture buffer
	UINT32 framesAvailable;
	DWORD flags;
	CoInitialize(NULL);

	HRESULT hr = m_pAudioCaptureClient->GetBuffer(&pData, &framesAvailable, &flags, NULL, NULL);
	RETURN_ON_ERROR(hr);

	// NOTE(review): `flags` is never inspected, so a packet flagged as
	// silent is copied verbatim instead of being zero-filled — confirm
	// this is acceptable for the downstream recognizer.
	if (0 != framesAvailable)
	{
		buffer.insert(buffer.end(), pData, pData+framesAvailable * m_FrameSize);
	}
	m_pAudioCaptureClient->ReleaseBuffer(framesAvailable);

	CoUninitialize();
	return framesAvailable * m_FrameSize;
}

// Integer-ratio sample-rate conversion, in place, of the 16-bit samples in
// `buffer` from the capture rate (m_pwfx->nSamplesPerSec) to `rate` Hz.
//  * Downsampling (src > dst): emits the mean of each group of src/dst samples.
//  * Upsampling   (src < dst): linear interpolation between neighbours.
// Returns the new buffer size in bytes, or -1 when no format is available.
//
// NOTE(review): only exact integer ratios are handled — 48000->16000 works,
// but 44100->16000 silently truncates to ratio 2 and yields 22050 Hz data.
// NOTE(review): samples are treated as one flat stream; on interleaved
// stereo input the group-averaging mixes L/R samples together — confirm
// callers intend this given singleChannel() runs afterwards in cap().
int Capture::resample(std::vector<BYTE>& buffer, int rate)
{
	if (m_SampleRate == rate) return buffer.size();
	if (m_pwfx == nullptr) return -1;

	const int bytes = m_pwfx->wBitsPerSample / 8;  // bytes per sample (2 after fixup)
	const int sampleCount = buffer.size() / bytes;
	const int srcRate = m_pwfx->nSamplesPerSec;
	const int dstRate = rate;
	std::vector<BYTE> resultBuffer;

	int ratio = srcRate / dstRate;
	if (ratio == 1) return buffer.size();

	if (ratio > 0) {
		// Downsample. Bug fix: the accumulator is now an int — the original
		// used a short, which overflowed as soon as a group of loud samples
		// summed past 32767 and produced wrapped (crackling) output.
		short sample = 0;
		int sum = 0;
		int inGroup = 0;

		for (int i = 0; i < sampleCount; i++) {
			memcpy(&sample, buffer.data() + i * bytes, bytes);
			sum += sample;
			if (++inGroup == ratio)
			{
				short avg = (short)(sum / ratio);
				resultBuffer.insert(resultBuffer.end(), (BYTE*)&avg, ((BYTE*)&avg) + bytes);
				sum = 0;
				inGroup = 0;
			}
		}
	}
	else {
		// Upsample. Bug fix: reads go through shorts and are sign-extended —
		// the original memcpy'd 2 bytes into uninitialized 4-byte ints,
		// leaving garbage in the high bytes of every interpolated value.
		ratio = dstRate / srcRate;
		short s1 = 0, s2 = 0;

		for (int i = 0; i < (sampleCount - 1); i++) {
			memcpy(&s1, buffer.data() + i * bytes, bytes);
			memcpy(&s2, buffer.data() + i * bytes + bytes, bytes);
			int step = ((int)s2 - (int)s1) / ratio;
			int value = s1;
			for (int k = 0; k < ratio; k++)
			{
				value += step;
				short out = (short)value;
				resultBuffer.insert(resultBuffer.end(), (BYTE*)&out, ((BYTE*)&out) + bytes);
			}
		}
	}
	buffer.swap(resultBuffer);
	return buffer.size();
}

int Capture::singleChannel(std::vector<BYTE>& buffer)
{
	if (m_Channels == 1) return buffer.size();

	size_t len = buffer.size() / 2;
	int bytes = m_pwfx->wBitsPerSample / 8;
	//std::vector<BYTE> singleBuffer(len);
	BYTE *singleBuffer = new BYTE[len];
	//singleBuffer.reserve(len);
	for (int i = 0; i < len/bytes; i++) {
		//singleBuffer.insert(singleBuffer.end(), buffer.data() + i*bytes * 2, buffer.data() + i*bytes * 2 + bytes);
		memcpy(singleBuffer+i*bytes, buffer.data()+i*(2*bytes), bytes);
	}
	
	buffer.assign(singleBuffer, singleBuffer + len);
	delete[] singleBuffer;
	return buffer.size();
}

int Capture::wav(std::vector<BYTE>& buffer, int rate, int channels)
{
	std::vector<BYTE> wavBuffer;

	WAVE_HEADER pcmHEADER;
	WAVE_FMT    pcmFMT;
	WAVE_DATA   pcmDATA;

	unsigned short m_pcmData;
	int dataSize = buffer.size();

	/* WAVE_HEADER */
	memcpy(pcmHEADER.fccID, "RIFF", strlen("RIFF"));
	memcpy(pcmHEADER.fccType, "WAVE", strlen("WAVE"));
	pcmHEADER.dwSize = 36 + dataSize;

	/* WAVE_FMT */
	memcpy(pcmFMT.fccID, "fmt ", strlen("fmt "));
	pcmFMT.dwSize = 16;
	pcmFMT.wFormatTag = 1;
	pcmFMT.wChannels = channels;
	pcmFMT.dwSamplesPerSec = rate;
	pcmFMT.uiBitsPerSample = 16;
	/* ==dwSamplesPerSec*wChannels*uiBitsPerSample/8 */
	pcmFMT.dwAvgBytesPerSec = pcmFMT.dwSamplesPerSec*pcmFMT.wChannels*pcmFMT.uiBitsPerSample / 8;
	/* ==wChannels*uiBitsPerSample/8 */
	pcmFMT.wBlockAlign = pcmFMT.wChannels*pcmFMT.uiBitsPerSample / 8;

	/* WAVE_DATA */
	memcpy(pcmDATA.fccID, "data", strlen("data"));
	pcmDATA.dwSize = dataSize;

	wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmHEADER, ((BYTE*)&pcmHEADER) + sizeof(WAVE_HEADER));
	wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmFMT, ((BYTE*)&pcmFMT) + sizeof(WAVE_FMT));
	wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmDATA, ((BYTE*)&pcmDATA) + sizeof(WAVE_DATA));
	wavBuffer.insert(wavBuffer.end(), buffer.begin(), buffer.end());
	buffer.swap(wavBuffer);
	return buffer.size();
}

 

 

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章