以vad爲例在C++中展示語音信號處理的算法以class形式的實現方式

語音算法如果平常用C來實現,你會發現很累,要自己造很多輪子。現在我們來看看在C++中,算法以類(class)的形式是怎麼實現的。

vad是一個特別簡單、明瞭的算法,比較適合入門。

https://github.com/robin1001/vad

 

main函數

 

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>
#include <vector>

#include "parse-option.h"
#include "wav.h"
#include "vad.h"

// Command-line driver for the energy-based VAD.
//
// Reads the wav file named by the first positional argument, runs Vad over
// its first channel frame by frame, prints "<frame start in seconds> <0|1>"
// for every complete frame, and writes the frames tagged as speech to the wav
// file named by the second positional argument. Speech frames are packed one
// frame shift apart, so consecutive speech frames overlap in the output the
// same way they overlap in the input.
//
// Returns 0 on success. Exits with status 1 when the positional arguments are
// wrong, and returns 1 when frame-len or frame-shift is shorter than one
// sample at the input sample rate.
int main(int argc, char *argv[]) {
    const char *usage = "Apply energy vad for input wav file\n"
                        "Usage: vad-test wav_in_file wav_out_file\n";
    ParseOptions po(usage);

    float frame_len = 0.025; // 25 ms
    po.Register("frame-len", &frame_len, "frame length for vad");
    float frame_shift = 0.01; // 10ms
    po.Register("frame-shift", &frame_shift, "frame shift for vad");
    float energy_thresh = 1.5e7;
    po.Register("energy-thresh", &energy_thresh,
            "energy threshold for energy based vad");
    int sil_to_speech_trigger = 3;
    po.Register("sil-to-speech-trigger", &sil_to_speech_trigger,
            "num frames for silence to speech trigger");
    int speech_to_sil_trigger = 10;
    po.Register("speech-to-sil-trigger", &speech_to_sil_trigger,
            "num frames for speech to silence trigger");

    po.Read(argc, argv);

    if (po.NumArgs() != 2) {
        po.PrintUsage();
        exit(1);
    }

    std::string wav_in = po.GetArg(1);
    std::string wav_out = po.GetArg(2);

    WavReader reader(wav_in.c_str());

    printf("input file %s info: \n"
           "sample_rate %d \n"
           "channels %d \n"
           "bits_per_sample_ %d \n",
           wav_in.c_str(),
           reader.SampleRate(),
           reader.NumChannel(),
           reader.BitsPerSample());

    const int sample_rate = reader.SampleRate();
    const int num_channel = reader.NumChannel();
    const int num_sample = reader.NumSample();
    const int num_point_per_frame = static_cast<int>(frame_len * sample_rate);
    const int num_point_shift = static_cast<int>(frame_shift * sample_rate);

    // A zero shift would keep the frame loop from ever advancing, and a zero
    // length frame carries no energy to threshold, so reject both up front.
    if (num_point_per_frame <= 0 || num_point_shift <= 0) {
        fprintf(stderr,
                "frame-len and frame-shift must each cover at least one "
                "sample (got %d and %d samples)\n",
                num_point_per_frame, num_point_shift);
        return 1;
    }

    // Copy the first channel; the index i * num_channel picks the first
    // sample of every multi-channel group returned by reader.Data().
    std::vector<float> data(num_sample > 0 ? num_sample : 0);
    const float *samples = reader.Data();
    for (int i = 0; i < num_sample; i++) {
        data[i] = samples[i * num_channel];
    }

    Vad vad(energy_thresh, sil_to_speech_trigger, speech_to_sil_trigger);

    std::vector<int> vad_result;
    int num_speech_frames = 0;

    // Only complete frames are classified; a trailing partial frame is dropped.
    for (int i = 0; i + num_point_per_frame <= num_sample;
         i += num_point_shift) {
        const int tag = vad.IsSpeech(data.data() + i, num_point_per_frame) ? 1 : 0;
        vad_result.push_back(tag);
        if (tag == 1) num_speech_frames++;
        printf("%f %d \n", float(i) / sample_rate, tag);
    }

    // N overlapping frames span (N - 1) shifts plus one full frame. With no
    // speech frames the output is empty; the formula must not be applied to
    // N == 0, where it would yield frame_len - shift samples.
    const int num_speech_sample = num_speech_frames > 0
            ? (num_speech_frames - 1) * num_point_shift + num_point_per_frame
            : 0;
    std::vector<float> speech_data(num_speech_sample);  // zero-filled

    int speech_cur = 0;
    for (size_t i = 0; i < vad_result.size(); i++) {
        if (vad_result[i] != 1) continue;
        memcpy(speech_data.data() + speech_cur * num_point_shift,
               data.data() + i * num_point_shift,
               num_point_per_frame * sizeof(float));
        speech_cur++;
    }

    WavWriter writer(speech_data.data(), num_speech_sample, 1,
                     reader.SampleRate(), reader.BitsPerSample());

    writer.Write(wav_out.c_str());
    return 0;
}

 

 

vad類的實現(vad.h)

#include <assert.h>

// Decision state of the detector.
enum VadState {
    kSpeech,
    kSilence
};

// Energy-based voice activity detector with hysteresis.
//
// Each call to IsSpeech() classifies one frame as voiced when its energy
// (sum of squared samples) exceeds energy_thresh. The reported state only
// flips after enough consecutive frames disagree with it:
//   silence -> speech after silence_to_speech_thresh consecutive voiced frames
//   speech -> silence after speech_to_sil_thresh consecutive unvoiced frames
class Vad {
public:
    // energy_thresh:            a frame is voiced when its energy is strictly
    //                           greater than this value.
    // silence_to_speech_thresh: consecutive voiced frames needed to enter the
    //                           speech state.
    // speech_to_sil_thresh:     consecutive unvoiced frames needed to return
    //                           to the silence state.
    Vad(float energy_thresh, int silence_to_speech_thresh, int speech_to_sil_thresh):
        energy_thresh_(energy_thresh),
        silence_to_speech_thresh_(silence_to_speech_thresh),
        speech_to_sil_thresh_(speech_to_sil_thresh),
        silence_frame_count_(0), speech_frame_count_(0),
        frame_count_(0), state_(kSilence) {
    }

    // Default detector: energy threshold 1.5e7, 3 frames to enter speech,
    // 10 frames to leave it. Every member is initialized so that IsSpeech()
    // never reads an indeterminate threshold.
    Vad():
        energy_thresh_(1.5e7f),
        silence_to_speech_thresh_(3),
        speech_to_sil_thresh_(10),
        silence_frame_count_(0), speech_frame_count_(0),
        frame_count_(0), state_(kSilence) {
    }

    // Returns the detector to the silence state and clears all counters.
    // The thresholds are left unchanged.
    void Reset() {
        silence_frame_count_ = 0;
        speech_frame_count_ = 0;
        frame_count_ = 0;
        state_ = kSilence;
    }

    // Feeds one frame of num_point samples starting at data and returns true
    // when the detector is in the speech state after processing it. The
    // samples are only read, never modified.
    bool IsSpeech(const float *data, int num_point) {
        float energy = 0.0;
        for (int i = 0; i < num_point; i++) {
            energy += data[i] * data[i];
        }
        const bool is_voice = energy > energy_thresh_;

        switch (state_) {
            case kSilence:
                if (is_voice) {
                    speech_frame_count_++;
                    if (speech_frame_count_ >= silence_to_speech_thresh_) {
                        state_ = kSpeech;
                        silence_frame_count_ = 0;
                    }
                } else {
                    // The voiced run was interrupted; start counting again.
                    speech_frame_count_ = 0;
                }
                break;
            case kSpeech:
                if (!is_voice) {
                    silence_frame_count_++;
                    if (silence_frame_count_ >= speech_to_sil_thresh_) {
                        state_ = kSilence;
                        speech_frame_count_ = 0;
                    }
                } else {
                    // The unvoiced run was interrupted; start counting again.
                    silence_frame_count_ = 0;
                }
                break;
            default:
                assert(0);
        }
        return state_ == kSpeech;
    }
private:
    float energy_thresh_;           // voiced when frame energy > this
    int silence_to_speech_thresh_;  // voiced frames needed to enter speech
    int speech_to_sil_thresh_;      // unvoiced frames needed to leave speech
    int silence_frame_count_;       // consecutive unvoiced frames while in speech
    int speech_frame_count_;        // consecutive voiced frames while in silence
    int frame_count_;               // cleared by Reset(); not updated by IsSpeech()
    VadState state_;                // current decision state
};

 

 

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章