在語音算法中,如果你平常用 C 進行實現,你會發現很累,要自己造很多輪子。現在我們來看看在 C++ 中,算法以類的形式是怎麼實現的。
VAD(語音活動檢測)是一個特別簡單、明瞭的算法,比較適合入門。
https://github.com/robin1001/vad
main函數
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <vector>
#include "parse-option.h"
#include "wav.h"
#include "vad.h"
/**
 * Command-line driver for the energy-based VAD.
 *
 * Reads the first channel of the input wav file, runs Vad::IsSpeech on
 * successive frames, prints "<frame start time in seconds> <0|1>" for each
 * frame, and writes the frames tagged as speech to the output wav file.
 *
 * Usage: vad-test [options] wav_in_file wav_out_file
 *
 * @return 0 on success; the process exits with status 1 when the positional
 *         arguments are wrong or the frame sizes are not positive.
 */
int main(int argc, char *argv[]) {
    const char *usage = "Apply energy vad for input wav file\n"
                        "Usage: vad-test [options] wav_in_file wav_out_file\n";
    ParseOptions po(usage);

    float frame_len = 0.025;  // 25 ms
    po.Register("frame-len", &frame_len, "frame length in seconds");
    float frame_shift = 0.01;  // 10 ms
    po.Register("frame-shift", &frame_shift, "frame shift in seconds");
    float energy_thresh = 1.5e7;
    po.Register("energy-thresh", &energy_thresh,
                "energy threshold for energy based vad");
    int sil_to_speech_trigger = 3;
    po.Register("sil-to-speech-trigger", &sil_to_speech_trigger,
                "num frames for silence to speech trigger");
    int speech_to_sil_trigger = 10;
    po.Register("speech-to-sil-trigger", &speech_to_sil_trigger,
                "num frames for speech to silence trigger");
    po.Read(argc, argv);

    if (po.NumArgs() != 2) {
        po.PrintUsage();
        exit(1);
    }

    std::string wav_in = po.GetArg(1);
    std::string wav_out = po.GetArg(2);

    WavReader reader(wav_in.c_str());
    printf("input file %s info: \n"
           "sample_rate %d \n"
           "channels %d \n"
           "bits_per_sample_ %d \n",
           wav_in.c_str(),
           reader.SampleRate(),
           reader.NumChannel(),
           reader.BitsPerSample());

    const int sample_rate = reader.SampleRate();
    const int num_sample = reader.NumSample();
    const int num_channel = reader.NumChannel();
    const int num_point_per_frame = static_cast<int>(frame_len * sample_rate);
    const int num_point_shift = static_cast<int>(frame_shift * sample_rate);

    // A zero or negative shift would never advance the framing loop, and a
    // non-positive frame length makes every frame empty; reject both early.
    if (num_point_per_frame <= 0 || num_point_shift <= 0) {
        fprintf(stderr,
                "frame-len and frame-shift must each cover at least one sample "
                "(got %d and %d samples)\n",
                num_point_per_frame, num_point_shift);
        exit(1);
    }

    // Copy first channel (samples are indexed as interleaved by channel).
    std::vector<float> data(num_sample > 0 ? static_cast<size_t>(num_sample) : 0);
    for (int i = 0; i < num_sample; i++) {
        data[i] = reader.Data()[i * num_channel];
    }

    Vad vad(energy_thresh, sil_to_speech_trigger, speech_to_sil_trigger);

    // One 0/1 tag per complete frame; a trailing partial frame is dropped.
    std::vector<int> vad_result;
    int num_speech_frames = 0;
    for (int i = 0; i + num_point_per_frame <= num_sample; i += num_point_shift) {
        const int tag = vad.IsSpeech(&data[0] + i, num_point_per_frame) ? 1 : 0;
        vad_result.push_back(tag);
        if (tag == 1) num_speech_frames++;
        printf("%f %d \n", static_cast<float>(i) / sample_rate, tag);
    }

    // Output length for N back-to-back frames is (N - 1) * shift + frame.
    // That formula is only valid for N >= 1; with no speech the output is empty.
    int num_speech_sample = 0;
    if (num_speech_frames > 0) {
        num_speech_sample =
            (num_speech_frames - 1) * num_point_shift + num_point_per_frame;
    } else {
        fprintf(stderr, "no speech frames detected, output will be empty\n");
    }
    std::vector<float> speech_data(static_cast<size_t>(num_speech_sample), 0.0f);

    // Pack speech frames one shift apart. Successive output frames overlap by
    // (frame - shift) samples, so a later frame overwrites the overlapping
    // tail of the one before it.
    int speech_cur = 0;
    for (size_t i = 0; i < vad_result.size(); i++) {
        if (vad_result[i] != 1) continue;
        memcpy(&speech_data[0] + speech_cur * num_point_shift,
               &data[0] + i * num_point_shift,
               num_point_per_frame * sizeof(float));
        speech_cur++;
    }

    // NOTE(review): assumes WavWriter accepts a zero-sample buffer - confirm in wav.h.
    float *speech_ptr = speech_data.empty() ? NULL : &speech_data[0];
    WavWriter writer(speech_ptr, num_speech_sample, 1,
                     reader.SampleRate(), reader.BitsPerSample());
    writer.Write(wav_out.c_str());
    return 0;
}
Vad 類的實現(vad.h)
#include <assert.h>
// States of the detector's two-state machine.
typedef enum {
    kSpeech,
    kSilence
} VadState;

/**
 * Frame-level, energy-based voice activity detector.
 *
 * Each call to IsSpeech() classifies one frame as voiced when the sum of its
 * squared samples is strictly greater than the energy threshold. The reported
 * state only flips after a run of consecutive frames of the opposite kind:
 * silence -> speech after `silence_to_speech_thresh` voiced frames in a row,
 * speech -> silence after `speech_to_sil_thresh` unvoiced frames in a row.
 */
class Vad {
public:
    /**
     * @param energy_thresh            frame energy above which a frame is voiced
     * @param silence_to_speech_thresh consecutive voiced frames needed to enter speech
     * @param speech_to_sil_thresh     consecutive unvoiced frames needed to leave speech
     */
    Vad(float energy_thresh, int silence_to_speech_thresh, int speech_to_sil_thresh):
        energy_thresh_(energy_thresh),
        silence_to_speech_thresh_(silence_to_speech_thresh),
        speech_to_sil_thresh_(speech_to_sil_thresh),
        silence_frame_count_(0), speech_frame_count_(0),
        frame_count_(0), state_(kSilence) {
    }

    /**
     * Default detector. Every member is initialized so IsSpeech() never reads
     * an indeterminate threshold; the values match the defaults of the
     * vad-test command-line tool (1.5e7, 3 frames, 10 frames).
     */
    Vad():
        energy_thresh_(1.5e7f),
        silence_to_speech_thresh_(3),
        speech_to_sil_thresh_(10),
        silence_frame_count_(0), speech_frame_count_(0),
        frame_count_(0), state_(kSilence) {
    }

    // Returns to the initial silence state and clears all run counters.
    // The thresholds are left untouched.
    void Reset() {
        silence_frame_count_ = 0;
        speech_frame_count_ = 0;
        frame_count_ = 0;
        state_ = kSilence;
    }

    /**
     * Feeds one frame and reports the detector state after it.
     *
     * @param data      pointer to `num_point` samples; only read, never written
     * @param num_point number of samples in the frame
     * @return true if the detector is in the speech state after this frame
     */
    bool IsSpeech(const float *data, int num_point) {
        float energy = 0.0;
        for (int i = 0; i < num_point; i++) {
            energy += data[i] * data[i];
        }
        const bool is_voice = energy > energy_thresh_;

        switch (state_) {
            case kSilence:
                if (is_voice) {
                    speech_frame_count_++;
                    if (speech_frame_count_ >= silence_to_speech_thresh_) {
                        state_ = kSpeech;
                        silence_frame_count_ = 0;
                    }
                } else {
                    // A single unvoiced frame breaks the voiced run.
                    speech_frame_count_ = 0;
                }
                break;
            case kSpeech:
                if (!is_voice) {
                    silence_frame_count_++;
                    if (silence_frame_count_ >= speech_to_sil_thresh_) {
                        state_ = kSilence;
                        speech_frame_count_ = 0;
                    }
                } else {
                    // A single voiced frame breaks the unvoiced run.
                    silence_frame_count_ = 0;
                }
                break;
            default:
                assert(0);
        }
        return state_ == kSpeech;
    }

private:
    float energy_thresh_;
    int silence_to_speech_thresh_;
    int speech_to_sil_thresh_;
    int silence_frame_count_;   // consecutive unvoiced frames seen while in speech
    int speech_frame_count_;    // consecutive voiced frames seen while in silence
    int frame_count_;           // only ever zeroed in the code shown here
    VadState state_;
};