以vad爲例在C++中展示語音信號處理的算法以class形式的實現方式

原創

baye_DOA

2020-04-26 06:04

在語音算法中，如果你平常用C來實現，會發現很累，因爲要自己造很多輪子。下面我們來看看在C++中，算法是怎樣以類（class）的形式實現的。

vad是一個特別簡單明瞭的算法，比較適合入門。

https://github.com/robin1001/vad

main函數

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>
#include <vector>

#include "parse-option.h"
#include "wav.h"
#include "vad.h"

// Command-line driver for the energy-based VAD.
//
// Reads the input wav file, runs the Vad state machine over the first
// channel frame by frame, prints "<frame start time in seconds> <0|1>" for
// every frame, and writes the frames tagged as speech (overlap-added at the
// frame shift) to the output wav file.
//
// Positional arguments: wav_in_file wav_out_file.
// Returns 0 on success, 1 on bad usage or an invalid frame configuration.
int main(int argc, char *argv[]) {
    const char *usage = "Apply energy vad for input wav file\n"
                        "Usage: vad-test wav_in_file wav_out_file\n";
    ParseOptions po(usage);

    float frame_len = 0.025; // 25 ms
    po.Register("frame-len", &frame_len, "frame length for vad");
    float frame_shift = 0.01; // 10ms
    po.Register("frame-shift", &frame_shift, "frame shift for vad");
    float energy_thresh = 1.5e7;
    po.Register("energy-thresh", &energy_thresh, 
            "energy threshold for energy based vad");
    int sil_to_speech_trigger = 3;
    po.Register("sil-to-speech-trigger", &sil_to_speech_trigger,
            "num frames for silence to speech trigger");
    int speech_to_sil_trigger = 10;
    po.Register("speech-to-sil-trigger", &speech_to_sil_trigger,
            "num frames for speech to silence trigger");

    po.Read(argc, argv);

    if (po.NumArgs() != 2) {
        po.PrintUsage();
        exit(1);
    }

    std::string wav_in = po.GetArg(1), 
         wav_out = po.GetArg(2);

    WavReader reader(wav_in.c_str());

    printf("input file %s info: \n"
           "sample_rate %d \n"
           "channels %d \n"
           "bits_per_sample_ %d \n",
           wav_in.c_str(),
           reader.SampleRate(), 
           reader.NumChannel(),
           reader.BitsPerSample());
    
    int sample_rate = reader.SampleRate();
    int num_channel = reader.NumChannel();
    int num_sample = reader.NumSample();
    int num_point_per_frame = (int)(frame_len * sample_rate);
    int num_point_shift = (int)(frame_shift * sample_rate);

    // A non-positive shift would make the framing loop below never advance,
    // and a non-positive frame length carries no samples to classify.
    if (num_point_per_frame <= 0 || num_point_shift <= 0) {
        fprintf(stderr,
                "invalid framing: %d samples per frame, %d samples shift "
                "(frame-len %f s, frame-shift %f s, sample rate %d)\n",
                num_point_per_frame, num_point_shift,
                frame_len, frame_shift, sample_rate);
        return 1;
    }
    if (num_sample < 0) num_sample = 0;

    // Copy first channel (samples are read with a stride of num_channel).
    std::vector<float> data(num_sample);
    for (int i = 0; i < num_sample; i++) {
        data[i] = reader.Data()[i * num_channel];
    }

    Vad vad(energy_thresh, sil_to_speech_trigger, speech_to_sil_trigger);

    std::vector<int> vad_result;
    int num_speech_frames = 0;

    // Only complete frames are classified; a trailing partial frame is dropped.
    for (int i = 0; i + num_point_per_frame <= num_sample;
         i += num_point_shift) {
        int tags = vad.IsSpeech(data.data() + i, num_point_per_frame) ? 1 : 0;
        vad_result.push_back(tags);
        if (tags == 1) num_speech_frames++;
        printf("%f %d \n", float(i) / sample_rate, tags);
    }

    // Speech frames are laid out back to back at the frame shift, so n frames
    // span (n - 1) * shift + frame_len samples. With no speech frames the
    // output is empty rather than a run of zeros (or a negative length).
    int num_speech_sample = 0;
    if (num_speech_frames > 0) {
        num_speech_sample =
            (num_speech_frames - 1) * num_point_shift + num_point_per_frame;
    }
    std::vector<float> speech_data(num_speech_sample);
    
    int speech_cur = 0;
    for (size_t i = 0; i < vad_result.size(); i++) {
        // speech
        if (vad_result[i] == 1) {
            memcpy(speech_data.data() + speech_cur * num_point_shift,
                   data.data() + i * num_point_shift, 
                   num_point_per_frame * sizeof(float));
            speech_cur++;
        }
    }

    // NOTE(review): assumes WavWriter accepts a zero-length buffer when no
    // speech was detected -- confirm against wav.h.
    WavWriter writer(speech_data.data(), num_speech_sample, 1, 
                        reader.SampleRate(), reader.BitsPerSample());

    writer.Write(wav_out.c_str());
    return 0;
}

vad的子函數

#include <assert.h>

// State of the VAD state machine.
typedef enum {
    kSpeech,   // currently inside a speech segment
    kSilence   // currently inside a silence segment
} VadState;

// Energy-based voice activity detector with hysteresis.
//
// Each frame is classified as voiced when the sum of its squared samples
// exceeds energy_thresh. The detector only switches from silence to speech
// after silence_to_speech_thresh consecutive voiced frames, and from speech
// back to silence after speech_to_sil_thresh consecutive unvoiced frames.
class Vad {
public:
    // energy_thresh: frame energy (sum of squares) above which a frame is
    //     considered voiced.
    // silence_to_speech_thresh: consecutive voiced frames needed to enter
    //     the speech state.
    // speech_to_sil_thresh: consecutive unvoiced frames needed to return to
    //     the silence state.
    Vad(float energy_thresh, int silence_to_speech_thresh, int speech_to_sil_thresh): 
        energy_thresh_(energy_thresh), 
        silence_to_speech_thresh_(silence_to_speech_thresh),
        speech_to_sil_thresh_(speech_to_sil_thresh),
        silence_frame_count_(0), speech_frame_count_(0), 
        frame_count_(0), state_(kSilence) {
    }

    // Default configuration: energy threshold 1.5e7, 3 frames to enter
    // speech, 10 frames to leave it. Every member is initialized here so
    // that IsSpeech never reads an indeterminate threshold.
    Vad():
        energy_thresh_(1.5e7f),
        silence_to_speech_thresh_(3),
        speech_to_sil_thresh_(10),
        silence_frame_count_(0), speech_frame_count_(0),
        frame_count_(0), state_(kSilence) {
    }

    // Clears the counters and returns to the silence state. The thresholds
    // are left unchanged.
    void Reset() {
        silence_frame_count_ = 0; 
        speech_frame_count_ = 0;
        frame_count_ = 0; 
        state_ = kSilence; 
    }

    // Feeds one frame of num_point samples and advances the state machine.
    // Returns true if the detector is in the speech state after this frame.
    // The buffer is only read, never modified.
    bool IsSpeech(const float *data, int num_point) {
        float energy = 0.0; 
        for (int i = 0; i < num_point; i++) {
            energy += data[i] * data[i];
        }
        const bool is_voice = energy > energy_thresh_;
        switch (state_) {
            case kSilence:
                if (is_voice) {
                    speech_frame_count_++;
                    if (speech_frame_count_ >= silence_to_speech_thresh_) {
                        state_ = kSpeech;
                        silence_frame_count_ = 0;
                    }
                } else {
                    // The run of voiced frames must be consecutive.
                    speech_frame_count_ = 0;
                }
                break;
            case kSpeech:
                if (!is_voice) {
                    silence_frame_count_++;
                    if (silence_frame_count_ >= speech_to_sil_thresh_) {
                        state_ = kSilence;
                        speech_frame_count_ = 0;
                    }
                } else {
                    // The run of unvoiced frames must be consecutive.
                    silence_frame_count_ = 0;
                }
                break;
            default:
                assert(0);
        }
        return state_ == kSpeech;
    }
private:
    float energy_thresh_;           // voiced/unvoiced energy threshold
    int silence_to_speech_thresh_;  // voiced frames needed to enter speech
    int speech_to_sil_thresh_;      // unvoiced frames needed to leave speech
    int silence_frame_count_;       // consecutive unvoiced frames while in speech
    int speech_frame_count_;        // consecutive voiced frames while in silence
    int frame_count_;               // only ever reset to 0; not otherwise used
    VadState state_;                // current state of the detector
};

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

以vad爲例在C++中展示語音信號處理的算法以class形式的實現方式

再談23種設計模式（3）：行爲型模式（學習筆記）

Power Automate Desktop 安裝完，登錄後老是提示one driver 錯誤

微前端學習筆記(4):從微前端到微模塊之EMP與hel-micro方案探索

微前端學習筆記（1）：微前端總體架構概述，從微服務發微

985 碩士程序員，空窗 4 個月沒有 Offer！

一文搞懂 Spring 循環依賴

賽博鬥地主——使用大語言模型扮演Agent智能體玩牌類遊戲。

VScode右鍵打開(添加到右鍵)

記一次 .NET某工控視覺自動化系統卡死分析

WindowsServer--SQL Server搭建主從同步實現讀寫分離 - 事務性分發

buttord數字濾波器-matlab

音頻文件：wav轉pcm

讀取wav文件對應的label python

讀取多個音頻及其label python

dataset的設置

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結