MFCC的核心部分主要是:
1)預加重:爲了消除發聲過程中,聲帶和嘴脣造成的效應,來補償語音信號受到發音系統所壓抑的高頻部分。並且能突顯高頻的共振峯。 簡單理解就是在頻域上面都乘以一個係數,這個係數跟頻率成正相關,所以高頻的幅值會有所提升。相當於高通濾波;
2)加窗:用於平滑信號,使用漢明窗加以平滑的話,相比於矩形窗函數,會減弱FFT以後旁瓣大小以及頻譜泄露。
3)快速傅里葉變換(FFT):將時域信號轉化到頻域進行後續的頻率分析。主要包括幅度譜和功率譜。
4)梅爾濾波(Mel):因爲頻域信號有很多冗餘,濾波器組可以對頻域的幅值進行精簡,每一個頻段用一個值來表示。
5)離散餘弦變換(DCT):按照倒譜的定義,該步需要進行反傅里葉變換然後通過低通濾波器獲得最後的低頻信號。這裏使用DCT直接就可以獲取頻率譜的低頻信息。 由於濾波器之間是有重疊的,所以前面的獲得的能量值之間是具有相關性的,DCT還可以對數據進行降維壓縮和抽象,獲得最後的特徵參數。相比於傅里葉變換,離散餘弦變換的結果沒有虛部,更好計算。
6)差分:由於語音信號是時域連續的,分幀提取的特徵信息只反應了本幀語音的特性,爲了使特徵更能體現時域連續性,可以在特徵維度增加前後幀信息的維度。常用的是一階差分和二階差分。
本人的C++代碼實現:
#ifndef MFCC_H
#define MFCC_H
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "wav.h"
#include <iostream>
#include <vector>
#include <fstream>
#include <sstream>
using namespace std;
//DCT的具體實現
// In-place orthonormal DCT over the first `length` elements of X.
// direction == 1 : forward DCT-II, X[k] = sqrt(2/N) * c_k * sum_n x[n] cos(pi*(n+0.5)*k/N)
// direction == -1: inverse (DCT-III), X[k] = sqrt(2/N) * sum_n c_n x[n] cos(pi*n*(k+0.5)/N)
// where c_0 = sqrt(0.5) and c_i = 1 otherwise. Direct O(n^2) evaluation.
void Discrete_Cosine_Transform(int direction, int length, std::vector<double> &X)
{
const double pi = 3.14159265358979323846;
// Normalization is the same for every coefficient -- hoisted out of the loops.
const double scale = sqrt(2.0 / length);
// Work on a snapshot so every output coefficient reads the original input.
std::vector<double> x(X.begin(), X.begin() + length);
for (int k = 0; k < length; k++)
{
double sum = 0;
if (direction == 1)
{
// Forward: the k == 0 weight is constant for the whole inner loop,
// so apply it once after summing instead of per term.
for (int n = 0; n < length; n++)
{
sum += x[n] * cos(pi * (n + 0.5) * k / length);
}
if (k == 0)
{
sum *= sqrt(0.5);
}
}
else if (direction == -1)
{
// Inverse: only the n == 0 term carries the sqrt(0.5) weight.
for (int n = 0; n < length; n++)
{
sum += ((n == 0) ? sqrt(0.5) : 1.0) * x[n] * cos(pi * n * (k + 0.5) / length);
}
}
X[k] = sum * scale;
}
}
// Validating front end for the in-place DCT.
// direction: 1 = forward transform, -1 = inverse transform; anything else is a no-op.
void DCT(int direction, int length, std::vector<double> &X)
{
if (direction != 1 && direction != -1)
{
//fprintf(stderr, "[DCT], [direction = {-1 (inversed transform), 1 (forward transform)}\n");
return;
}
Discrete_Cosine_Transform(direction, length, X);
}
//FFT的具體實現
// In-place radix-2 Cooley-Tukey FFT on the complex signal (Xr, Xi).
// direction == 1 computes the forward DFT; direction == -1 computes the
// inverse and divides every bin by `length`. `length` must be a power of two.
void Fast_Fourier_Transform(int direction, int length, std::vector<double> &Xr, std::vector<double> &Xi)
{
const double kPi = 3.14159265358979323846;
const int stages = (int)(log((double)length) / log(2.0));
// Reorder the input into bit-reversed index order.
for (int idx = 0; idx < length; idx++)
{
int rev = 0;
for (int b = 0; b < stages; b++)
{
rev = (rev << 1) | ((idx >> b) & 1);
}
if (rev < idx)
{
double tmp;
tmp = Xr[idx]; Xr[idx] = Xr[rev]; Xr[rev] = tmp;
tmp = Xi[idx]; Xi[idx] = Xi[rev]; Xi[rev] = tmp;
}
}
// Butterfly passes: combine blocks of size `half` into blocks of 2*half.
for (int s = 0; s < stages; s++)
{
const int half = 1 << s;
for (int base = 0; base < length - 1; base += 2 * half)
{
for (int off = 0; off < half; off++)
{
// Twiddle factor w = exp(-i * pi * off / half * direction).
const double ang = direction * -kPi * off / half;
const double wr = cos(ang);
const double wi = sin(ang);
const int hi = base + off + half;
const int lo = base + off;
const double tr = Xr[hi] * wr - Xi[hi] * wi;
const double ti = Xr[hi] * wi + Xi[hi] * wr;
Xr[hi] = Xr[lo] - tr;
Xi[hi] = Xi[lo] - ti;
Xr[lo] = Xr[lo] + tr;
Xi[lo] = Xi[lo] + ti;
}
}
}
// The inverse transform carries the 1/N normalization.
if (direction == -1)
{
for (int k = 0; k < length; k++)
{
Xr[k] /= length;
Xi[k] /= length;
}
}
}
// Validating front end for the radix-2 FFT.
// direction: 1 = forward, -1 = inverse. `length` must be a positive power of two;
// invalid arguments leave Xr/Xi untouched.
void FFT(int direction, int length, std::vector<double> &Xr, std::vector<double> &Xi)
{
if (direction != 1 && direction != -1)
{
//fprintf(stderr, "[FFT], [direction = {-1 (inversed transform), 1 (forward transform)}\n");
return;
}
// BUGFIX: the original derived log2(length) via log()/log(2.0) and truncated
// to int; floating-point round-off can make that land just below the true
// exponent and wrongly reject a valid power of two. An exact bit test cannot.
if (length <= 0 || (length & (length - 1)) != 0)
{
//fprintf(stderr, "[FFT], [length must be a power of 2]\n");
return;
}
Fast_Fourier_Transform(direction, length, Xr, Xi);
}
//梅爾頻率範圍
// Converts between linear frequency (Hz) and the mel scale.
// direction == 1 : Hz -> mel, mel = 1125 * ln(1 + f / 700)
// direction == -1: mel -> Hz, the exact inverse mapping
// Any other direction returns 0.
double Mel_Scale(int direction, double x)
{
if (direction == 1)
{
return 1125.0 * log(1 + x / 700.0);
}
if (direction == -1)
{
return 700.0 * (exp(x / 1125.0) - 1);
}
//fprintf(stderr, "[Mel_Scale], [direction = {-1 (inversed transform), 1 (forward transform)}\n");
return 0;
}
//獲取完整的MFCC特徵(無能量值),包括FFT、取絕對值、Mel濾波、取對數、DCT,最後返回feature_vector[]一幀的特徵向量
// Computes one frame's MFCC coefficients (no energy term): zero-pad to
// length_DFT, FFT, periodogram power, overlapping triangular mel filterbank,
// log compression, DCT. The first `number_coefficients` cepstra are written
// into feature_vector[0 .. number_coefficients-1]; the caller must have sized
// feature_vector to at least that many elements.
void MFCC(int length_frame, int length_DFT, int number_coefficients, int number_filterbanks, int sample_rate, std::vector<double> frame, std::vector<double> &feature_vector)
{
// Analysis band on the mel scale: 300 Hz up to the Nyquist frequency.
double max_Mels_frequency = Mel_Scale(1, sample_rate / 2.0);
double min_Mels_frequency = Mel_Scale(1, 300);
double interval = (max_Mels_frequency - min_Mels_frequency) / (number_filterbanks + 1);
std::vector<double> filterbank(number_filterbanks, 0);
std::vector<double> Xr(length_DFT, 0);
std::vector<double> Xi(length_DFT, 0);
// Copy the frame and zero-pad up to the FFT size.
for (int i = 0; i < length_DFT; i++)
{
Xr[i] = (i < length_frame) ? (frame[i]) : (0);
Xi[i] = 0;
}
//FFT
FFT(1, length_DFT, Xr, Xi);
for (int i = 0; i < length_DFT / 2 + 1; i++)
{
// BUGFIX: the original computed this with all-integer arithmetic, so every
// bin's center frequency was truncated; 2.0 forces double math throughout.
double frequency = (sample_rate / 2.0) * i / (length_DFT / 2);
double Mel_frequency = Mel_Scale(1, frequency);
// Periodogram power estimate for this bin.
double power = (Xr[i] * Xr[i] + Xi[i] * Xi[i]) / length_frame;
// Accumulate the bin into every triangular mel filter covering it
// (adjacent filters overlap by half a band).
for (int j = 0; j < number_filterbanks; j++)
{
double frequency_boundary[] = { min_Mels_frequency + interval * (j + 0), min_Mels_frequency + interval * (j + 1), min_Mels_frequency + interval * (j + 2) };
if (frequency_boundary[0] <= Mel_frequency && Mel_frequency <= frequency_boundary[1])
{
// Rising edge of filter j.
double lower_frequency = Mel_Scale(-1, frequency_boundary[0]);
double upper_frequency = Mel_Scale(-1, frequency_boundary[1]);
filterbank[j] += power * (frequency - lower_frequency) / (upper_frequency - lower_frequency);
}
else if (frequency_boundary[1] <= Mel_frequency && Mel_frequency <= frequency_boundary[2])
{
// Falling edge of filter j.
double lower_frequency = Mel_Scale(-1, frequency_boundary[1]);
double upper_frequency = Mel_Scale(-1, frequency_boundary[2]);
filterbank[j] += power * (upper_frequency - frequency) / (upper_frequency - lower_frequency);
}
}
}
// Log compression. BUGFIX: flooring the energy keeps a filter that collected
// no power from producing log(0) = -inf and poisoning the DCT output.
for (int i = 0; i < number_filterbanks; i++)
{
filterbank[i] = log(filterbank[i] > 1e-12 ? filterbank[i] : 1e-12);
}
//DCT decorrelates the overlapping filter energies and keeps the low quefrency part.
DCT(1, number_filterbanks, filterbank);
// Keep the first number_coefficients cepstral coefficients.
for (int i = 0; i < number_coefficients; i++)
{
feature_vector[i] = filterbank[i];
}
}
// Batch driver: for 0.wav .. 31.wav under dir_path, extract 39-dimensional
// per-frame MFCC features (13 cepstra + 13 deltas + 13 delta-deltas) and
// write them to a matching .txt file, one frame per line.
int main()
{
int stride = 256;               // hop between successive frames (samples)
int length_frame = 512;         // analysis frame length (samples)
int length_DFT = 512;           // FFT size
int number_coefficients = 13;   // cepstra kept; final feature is 3*13 dims
int number_filterbanks = 26;    // number of mel filters
int number_feature_vectors;     // frames in the current .wav
int nSamplesPerSec;             // sample rate of the current .wav
double pi = 3.14159265358979323846;
int wi;
for (wi = 0; wi <= 31; wi++)
{
string dir_path = "D://Profession_Projects//Lenovo_Projects//Abnormal_Sound//Materials//newWav//";
stringstream strs;
strs << wi;                 // int -> string for the file name
string strg = strs.str();
string addr = dir_path + strg + ".wav";
Wav wav(addr.c_str());
wav.WavToBuffer();
if (wav.thislabel == 1)     // unsupported format: stop the batch
break;
nSamplesPerSec = wav.waveformatex.SampleRate;
// BUGFIX: a clip shorter than one frame made the count below negative and
// crashed the vector constructor; skip such files instead.
if (wav.length_buffer < length_frame)
continue;
number_feature_vectors = (wav.length_buffer - length_frame) / stride + 1;
cout << number_feature_vectors << endl;
vector<vector<double> > feature_vector(number_feature_vectors, vector<double>(3 * number_coefficients, 0));
// Static MFCC coefficients, frame by frame.
for (int i = 0; i <= wav.length_buffer - length_frame; i += stride)
{
vector<double> frame(length_frame, 0);
// Pre-emphasis high-pass: s'[n] = s[n] - 0.95 * s[n-1]
// (Get_Buffer returns 0 for the out-of-range index -1 at n == 0).
for (int j = 0; j < length_frame; j++)
{
if (i + j < wav.length_buffer)
{
frame[j] = wav.Get_Buffer(i + j) - 0.95 * wav.Get_Buffer(i + j - 1);
}
else
{
frame[j] = 0;
}
}
// Hamming window to reduce spectral leakage.
for (int j = 0; j < length_frame; j++)
{
frame[j] *= 0.54 - 0.46 * cos(2 * pi * j / (length_frame - 1));
}
// Fills the first 13 dims of frame number i/stride.
MFCC(length_frame, length_DFT, number_coefficients, number_filterbanks, nSamplesPerSec, frame, feature_vector[i / stride]);
}
// First-order deltas into dims [13, 26).
for (int i = 0; i < number_feature_vectors; i++)
{
int prev = (i == 0) ? (0) : (i - 1);
int next = (i == number_feature_vectors - 1) ? (number_feature_vectors - 1) : (i + 1);
for (int j = 0; j < number_coefficients; j++)
{
feature_vector[i][number_coefficients + j] = (feature_vector[next][j] - feature_vector[prev][j]) / 2;
}
}
// Second-order deltas (deltas of the deltas) into dims [26, 39).
for (int i = 0; i < number_feature_vectors; i++)
{
int prev = (i == 0) ? (0) : (i - 1);
int next = (i == number_feature_vectors - 1) ? (number_feature_vectors - 1) : (i + 1);
for (int j = number_coefficients; j < 2 * number_coefficients; j++)
{
feature_vector[i][number_coefficients + j] = (feature_vector[next][j] - feature_vector[prev][j]) / 2;
}
}
string waddr = dir_path + strg + ".txt";
FILE *file = fopen(waddr.c_str(), "wt");
// BUGFIX: the original wrote through the stream without checking fopen's result.
if (file == NULL)
{
continue;
}
// One line per frame, 39 values per line.
for (int i = 0; i < number_feature_vectors; i++)
{
for (int j = 0; j < 3 * number_coefficients; j++)
{
fprintf(file, "%lf ", feature_vector[i][j]);
}
fprintf(file, "\n");
}
fclose(file);
}
return 0;
}
#endif // MFCC_H
MFCC主要是用於提取語音信息的特徵,那我們怎麼先將語音信號處理成常用的數據類型呢?
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <iostream>
using namespace std;
// On-disk layout of the canonical WAV header through the fmt chunk body
// (36 bytes); the "data" chunk id and size are read separately by Load.
typedef struct wave_tag
{
char ChunkID[4];                  // "RIFF" marker
unsigned int ChunkSize;           // file size minus the first 8 bytes
char Format[4];                   // "WAVE" marker
char SubChunk1ID[4];              // "fmt " marker
unsigned int SubChunk1Size;       // size of the fmt chunk body (16 for PCM)
unsigned short int AudioFormat;   // audio format tag (1 = uncompressed PCM)
unsigned short int NumChannels;   // 1 = mono, 2 = stereo
unsigned int SampleRate;          // samples per second, per channel
unsigned int ByteRate;            // SampleRate * NumChannels * BitsPerSample / 8
unsigned short int BlockAlign;    // NumChannels * BitsPerSample / 8
unsigned short int BitsPerSample; // bits per sample of one channel
//char SubChunk2ID[4];            // "data" marker (located by scanning in Load)
//unsigned int SubChunk2Size;     // byte length of the audio payload
} waveft;
// Loads a PCM WAV file and exposes its samples normalized to roughly [-1, 1].
class Wav
{
private:
// Decoded sample views over `data`. BUGFIX: allocated with malloc/realloc
// throughout -- the original allocated with new[] and then realloc'ed those
// pointers, which is undefined behavior; it also never freed anything.
char *buffer8;
short *buffer16;
float *buffer32;
// Copying would double-free the buffers; non-copyable (declared, not defined).
Wav(const Wav &);
Wav &operator=(const Wav &);
public:
unsigned char *data;   // raw bytes of the "data" chunk payload
waveft waveformatex;   // parsed header fields
int length_buffer;     // number of decoded samples in the active buffer
int length_wav;        // payload length in bytes
int thislabel;         // 0 = 16-bit / stereo / 22050 Hz; 1 = anything else or load failure
/************************************************************************/
// Opens `path` and parses it immediately; check `thislabel` afterwards.
Wav(const char *path)
{
memset(&waveformatex, 0, sizeof(waveformatex));
length_buffer = 0;
length_wav = 0;
buffer8 = (char *)malloc(1);
buffer16 = (short *)malloc(sizeof(short));
buffer32 = (float *)malloc(sizeof(float));
data = (unsigned char *)malloc(1);
thislabel = Load(path);
}
// BUGFIX: the original leaked every allocation.
~Wav()
{
free(buffer8);
free(buffer16);
free(buffer32);
free(data);
}
/************************************************************************/
// Decodes `data` into the sample buffer matching BitsPerSample.
void WavToBuffer()
{
switch (waveformatex.BitsPerSample)
{
case 8:
length_buffer = length_wav;
buffer8 = (char *)realloc(buffer8, length_buffer > 0 ? length_buffer : 1);
for (int i = 0; i < length_buffer; i++)
{
buffer8[i] = (char)data[i];
}
break;
case 16:
length_buffer = length_wav / 2;
buffer16 = (short *)realloc(buffer16, sizeof(short) * (length_buffer > 0 ? length_buffer : 1));
for (int i = 0; i < length_buffer; i++)
{
// samples are stored little-endian
buffer16[i] = (short)((data[2 * i + 1] << 8) | data[2 * i]);
}
break;
case 32:
length_buffer = length_wav / 4;
buffer32 = (float *)realloc(buffer32, sizeof(float) * (length_buffer > 0 ? length_buffer : 1));
for (int i = 0; i < length_buffer; i++)
{
// memcpy instead of a pointer cast: avoids strict-aliasing UB
memcpy(&buffer32[i], &data[4 * i], sizeof(float));
}
break;
}
}
/************************************************************************/
// Reads the header, scans forward for the "data" chunk, and loads the
// payload into `data`. Returns 0 when the file is 16-bit stereo 22050 Hz,
// 1 otherwise (including any open/read failure).
int Load(const char *path)
{
FILE *file = fopen(path, "rb");
if (file == NULL)
{
fprintf(stderr, "[Load] [%s not found]\n", path);
return 1;
}
if (fread(&waveformatex, sizeof(struct wave_tag), 1, file) != 1)
{
fclose(file);
return 1;
}
std::cout << waveformatex.BitsPerSample << std::endl;
std::cout << waveformatex.NumChannels << std::endl;
std::cout << waveformatex.SampleRate << std::endl;
// Slide a 4-byte window one byte at a time until it spells "data"
// (0x61746164 little-endian). BUGFIX: the original compared fread's element
// count against EOF -- fread never returns EOF -- so a file without a
// "data" chunk spun forever; it also stepped 4 bytes at a time and could
// jump over an unaligned chunk id.
unsigned int chunk = 0;
if (fread(&chunk, 4, 1, file) != 1)
{
fclose(file);
return 1;
}
while (chunk != 0x61746164)
{
int c = fgetc(file);
if (c == EOF)
{
fclose(file);
return 1;
}
chunk = (chunk >> 8) | ((unsigned int)c << 24);
}
if (fread(&length_wav, sizeof(int), 1, file) != 1 || length_wav < 0)
{
fclose(file);
return 1;
}
data = (unsigned char *)realloc(data, length_wav > 0 ? length_wav : 1);
std::cout << length_wav << std::endl;
if (length_wav > 0)
{
fread(data, length_wav, 1, file);
}
fclose(file);
if (waveformatex.BitsPerSample != 16 || waveformatex.NumChannels != 2 || waveformatex.SampleRate != 22050)
return 1;
else
return 0;
}
/************************************************************************/
// Returns sample `index` scaled to roughly [-1, 1]; 0 when out of range
// or for an unsupported bit depth.
double Get_Buffer(int index)
{
if (0 <= index && index < length_buffer)
{
switch (waveformatex.BitsPerSample)
{
case 8:
return (buffer8[index] + 0.5) / 127.5;
case 16:
return (buffer16[index] + 0.5) / 32767.5;
case 32:
return buffer32[index];
}
}
return 0;
}
};
參考文獻:
https://blog.csdn.net/liuliqun520/article/details/80538259,
https://blog.csdn.net/xueyingxue001/article/details/53183757