有純淨語音,有噪聲,怎樣合成不同信噪比下的混合語音呢?!
function mixwav_IBM(S,db,nois)
numChan = 64; %通道數
FS = 8000; %數據的採樣頻率
gamma = gen_gammaton(FS,numChan);
ft_env = fir1(1024,[70,400]/FS*2,'bandpass');
[v,fs] = wavread(['babble.wav']);
v = resample(v,FS,fs);
noi.wav = repmat(v,fix(FS*10/length(v))+1,1);
noi.wav = noi.wav(1:FS*10);
[~,gf] = fgammaton(noi.wav',gamma,FS,numChan);
noi.cch = cochleagram(gf');
[w,fsw] = wavread(['F50A1.WAV']);
w = resample(w,FS,fsw);
[~,gf] = fgammaton(w',gamma,FS,numChan);
scch = cochleagram(gf');
v = noi.wav;
nLen = min(length(w),length(v));
w = w(1:nLen); %w是純淨的語音
v = v(1:nLen); %v是噪聲
db = 0;%混合語音的信噪比是0db
[m,v] = wavmix(w,v,db);%m是混合語音的波形
winsize = 256;
shiftsize = 80;
window = hamming(winsize);
winc_w = getframe(w,winsize,shiftsize,window);%純淨語音分幀
winc_v = getframe(v,winsize,shiftsize,window); %噪聲分幀
winc_m = getframe(m,winsize,shiftsize,window); %混合語音分幀
w_fft = abs(fft(winc_w));%純淨語音的fft
v_fft = abs(fft(winc_v));%噪聲語音的fft
m_fft = abs(fft(winc_m));%混合語音的fft
amp = zeros(size(w_fft));%掩碼的size和原始頻譜圖的size一樣大
amp(w_fft>v_fft) = 1;
wavwrite(m,FS,'F50A1_babble.WAV');
figure;imagesc(w_fft);title('純淨語音')
figure;imagesc(v_fft);title('噪聲')
figure;imagesc(m_fft);title('混合語音')
figure;imagesc(amp);title('IBM')
figure;imagesc(m_fft.*amp);title('IBM蓋在混合語音語譜圖上得到降噪之後的語音語譜')
close(figure);
%如果正確,那麼將IBM蓋在混合語音的語譜圖上 則會得到和純淨語音一樣的語譜圖
%純淨語音的語譜圖與IBM之後的語譜還是不太一樣
end
function [m,v] = wavmix(w,v,db)
amp = sqrt(sum(w.^2)/sum(v.^2)/10^(db/10));
m = w + v*amp;
v = v*amp;
end
function winc = getframe(wav,winsize,shiftsize,window)
slen = length(wav);
numframe = ceil(slen/shiftsize);
winc = zeros(winsize,numframe);
for n = 1:numframe
st = (n-1)*shiftsize;
ed = min(st+winsize,slen);
winc(1:ed-st,n) = wav(st+1:ed);
winc(:,n) = winc(:,n).*window;
end
end
function gt = gen_gammaton(sampFreq, numChannel)
% Create gammatone filterbank
% Written by Yang Shao, and adapted by Xiaojia Zhao in Oct'11
if ~exist('sampFreq', 'var')
sampFreq = 8000; % sampling frequency, default is 8000
end
if ~exist('numChannel', 'var')
numChannel = 64; % number of channels, default is 64
end
stepBound=200000;
filterOrder=4; % filter order
phase(1:numChannel)=zeros(numChannel,1); % original phase
erb_b=hz2erb([50,sampFreq/2]); % upper and lower bound of ERB
erb=[erb_b(1):diff(erb_b)/(numChannel-1):erb_b(2)]; % ERB segment
cf=erb2hz(erb); % center frequency
b=1.019*24.7*(4.37*cf/1000+1); % rate of decay
gt=zeros(numChannel,stepBound);
tmp_t=[1:stepBound]/sampFreq;
for i=1:numChannel
%gain=10^((loudness(cf(i))-60)/20)/3*(2*pi*b(i)/sampFreq).^4;
gain = (2*pi*b(i)/sampFreq).^4/3;
gt(i,:)=gain*sampFreq^3*tmp_t.^(filterOrder-1).*exp(-2*pi*b(i)*tmp_t).*cos(2*pi*cf(i)*tmp_t+phase(i));
end
end
function erb=hz2erb(hz)
% Convert normal frequency scale in hz to ERB-rate scale.
% Units are number of Hz and number of ERBs.
% ERB stands for Equivalent Rectangular Bandwidth.
% Written by ZZ Jin, and adapted by DLW in Jan'07
erb=21.4*log10(4.37e-3*hz+1);
end
function hz=erb2hz(erb)
% Convert ERB-rate scale to normal frequency scale.
% Units are number of ERBs and number of Hz.
% ERB stands for Equivalent Rectangular Bandwidth.
% Written by ZZ Jin, and adapted by DLW in Jan'07
hz=(10.^(erb/21.4)-1)/4.37e-3;
end
function [f, tmp]=fgammaton(sig, gamma, sampFreq, numChannel)
% Filter input signal with a gammatone filterbank and generate GF features
% sig: input signal
% gamma: gammtone filter impulse responses
% sampFreq: sampling frequency
% numChannel: number of channels
% Written by Yang Shao, and adapted by Xiaojia Zhao in Oct'11
[ignore,stepBound]=size(sig);
d = 2^ceil(log2(length(sig)));
tmp =ifft((fft((ones(numChannel,1)*sig)',d).*fft(gamma(:,1:stepBound)',d)));
tmp = tmp(1:stepBound,:);
f=(abs(resample(abs(tmp),100,sampFreq)')).^(1/3);
end
function a = cochleagram(r, winLength,winShift)
% Generate a cochleagram from responses of a Gammatone filterbank.
% It gives the log energy of T-F units
% The first variable is required.
% winLength: window (frame) length in samples
% Written by ZZ Jin, and adapted by DLW in Jan'07
if nargin < 2
winLength = 320; % default window length in sample points which is 20 ms for 16 KHz sampling frequency
end
[numChan,sigLength] = size(r); % number of channels and input signal length
if nargin < 3
winShift = winLength/2; % frame shift (default is half frame)
end
increment = winLength/winShift; % special treatment for first increment-1 frames
M = floor(sigLength/winShift); % number of time frames
coswin = (1 + cos(2*pi*(0:winLength-1)/winLength - pi))/2; % calculate a raised cosine window
% calculate energy for each frame in each channel
a = zeros(numChan,M);
for m = 1:M
for i = 1:numChan
if m < increment % shorter frame lengths for beginning frames
a(i,m) = r(i,1:m*winShift)*r(i,1:m*winShift)';
else
startpoint = (m-increment)*winShift;
a(i,m) = (r(i,startpoint+1:startpoint+winLength)).*coswin*((r(i,startpoint+1:startpoint+winLength)).*coswin)';
% a(i,m) = sum(abs((r(i,startpoint+1:startpoint+winLength))).*coswin);%*((r(i,startpoint+1:startpoint+winLength)))';
end
end
end
end
經過掩碼掩蓋之後,那麼怎樣從頻譜圖合成降噪之後的語音呢?!
function wav = syn_mask(n,tt)
% n 噪聲譜,n = stft(wav)
% tt 爲估計的mask,語音部分爲1,其餘爲0,大小與n相同
if nargin < 2
tt = ones(size(n));
end
if size(tt,1)~=129
tt = tt';
end
if size(n,1)~=129
n = n';
end
wav = istft(n.*tt);
% ang = angle(n);
% wav = istft(exp(tt').*(cos(ang)+1i*sin(ang)))';
function wav = syn_spec(n,tt)
% n 噪聲譜,n = stft(wav)
% tt 爲估計的幅度譜,大小與n相同
if size(tt,1)==129
tt = tt';
end
%wav = istft(n.*tt);
ang = angle(n);
wav = istft((tt').*(cos(ang')+1i*sin(ang')))';`這裏寫代碼片`
function d = stft(x)
s = length(x);
f = 256;
win=[0.0800
0.0801
0.0806
0.0813
0.0822
0.0835
0.0850
0.0868
0.0889
0.0913
0.0939
0.0968
0.1000
0.1034
0.1071
0.1111
0.1153
0.1198
0.1245
0.1295
0.1347
0.1402
0.1459
0.1519
0.1581
0.1645
0.1712
0.1781
0.1852
0.1925
0.2001
0.2078
0.2157
0.2239
0.2322
0.2407
0.2494
0.2583
0.2673
0.2765
0.2859
0.2954
0.3051
0.3149
0.3249
0.3350
0.3452
0.3555
0.3659
0.3765
0.3871
0.3979
0.4087
0.4196
0.4305
0.4416
0.4527
0.4638
0.4750
0.4863
0.4976
0.5089
0.5202
0.5315
0.5428
0.5542
0.5655
0.5768
0.5881
0.5993
0.6106
0.6217
0.6329
0.6439
0.6549
0.6659
0.6767
0.6875
0.6982
0.7088
0.7193
0.7297
0.7400
0.7501
0.7601
0.7700
0.7797
0.7893
0.7988
0.8081
0.8172
0.8262
0.8350
0.8436
0.8520
0.8602
0.8683
0.8761
0.8837
0.8912
0.8984
0.9054
0.9121
0.9187
0.9250
0.9311
0.9369
0.9426
0.9479
0.9530
0.9579
0.9625
0.9669
0.9710
0.9748
0.9784
0.9817
0.9847
0.9875
0.9899
0.9922
0.9941
0.9958
0.9972
0.9983
0.9991
0.9997
1.0000
1.0000
0.9997
0.9991
0.9983
0.9972
0.9958
0.9941
0.9922
0.9899
0.9875
0.9847
0.9817
0.9784
0.9748
0.9710
0.9669
0.9625
0.9579
0.9530
0.9479
0.9426
0.9369
0.9311
0.9250
0.9187
0.9121
0.9054
0.8984
0.8912
0.8837
0.8761
0.8683
0.8602
0.8520
0.8436
0.8350
0.8262
0.8172
0.8081
0.7988
0.7893
0.7797
0.7700
0.7601
0.7501
0.7400
0.7297
0.7193
0.7088
0.6982
0.6875
0.6767
0.6659
0.6549
0.6439
0.6329
0.6217
0.6106
0.5993
0.5881
0.5768
0.5655
0.5542
0.5428
0.5315
0.5202
0.5089
0.4976
0.4863
0.4750
0.4638
0.4527
0.4416
0.4305
0.4196
0.4087
0.3979
0.3871
0.3765
0.3659
0.3555
0.3452
0.3350
0.3249
0.3149
0.3051
0.2954
0.2859
0.2765
0.2673
0.2583
0.2494
0.2407
0.2322
0.2239
0.2157
0.2078
0.2001
0.1925
0.1852
0.1781
0.1712
0.1645
0.1581
0.1519
0.1459
0.1402
0.1347
0.1295
0.1245
0.1198
0.1153
0.1111
0.1071
0.1034
0.1000
0.0968
0.0939
0.0913
0.0889
0.0868
0.0850
0.0835
0.0822
0.0813
0.0806
0.0801
0.0800];
c = 1;
h = 80;
% pre-allocate output array
d = complex(zeros((1+f/2),1+fix((s-f)/h)));
for b = 0:h:(s-f)
u = win.*x((b+1):(b+f));
t = fft(u);
d(:,c) = t(1:(1+f/2))';
c = c+1;
end;
function x = istft(d)
% X = istft(D, F, W, H) Inverse short-time Fourier transform.
% Performs overlap-add resynthesis from the short-time Fourier transform
% data in D. Each column of D is taken as the result of an F-point
% fft; each successive frame was offset by H points. Data is
% hamm-windowed at W pts.
% W = 0 gives a rectangular window; W as a vector uses that as window.
% dpwe 1994may24. Uses built-in 'ifft' etc.
% $Header: /homes/dpwe/public_html/resources/matlab/pvoc/RCS/istft.m,v 1.4 2009/01/07 04:20:00 dpwe Exp $
ftsize = 256;
h = 80;
s = size(d);
cols = s(2);
xlen = ftsize + (cols-1)*h;
x = zeros(1,xlen);
win=[0.0800
0.0801
0.0806
0.0813
0.0822
0.0835
0.0850
0.0868
0.0889
0.0913
0.0939
0.0968
0.1000
0.1034
0.1071
0.1111
0.1153
0.1198
0.1245
0.1295
0.1347
0.1402
0.1459
0.1519
0.1581
0.1645
0.1712
0.1781
0.1852
0.1925
0.2001
0.2078
0.2157
0.2239
0.2322
0.2407
0.2494
0.2583
0.2673
0.2765
0.2859
0.2954
0.3051
0.3149
0.3249
0.3350
0.3452
0.3555
0.3659
0.3765
0.3871
0.3979
0.4087
0.4196
0.4305
0.4416
0.4527
0.4638
0.4750
0.4863
0.4976
0.5089
0.5202
0.5315
0.5428
0.5542
0.5655
0.5768
0.5881
0.5993
0.6106
0.6217
0.6329
0.6439
0.6549
0.6659
0.6767
0.6875
0.6982
0.7088
0.7193
0.7297
0.7400
0.7501
0.7601
0.7700
0.7797
0.7893
0.7988
0.8081
0.8172
0.8262
0.8350
0.8436
0.8520
0.8602
0.8683
0.8761
0.8837
0.8912
0.8984
0.9054
0.9121
0.9187
0.9250
0.9311
0.9369
0.9426
0.9479
0.9530
0.9579
0.9625
0.9669
0.9710
0.9748
0.9784
0.9817
0.9847
0.9875
0.9899
0.9922
0.9941
0.9958
0.9972
0.9983
0.9991
0.9997
1.0000
1.0000
0.9997
0.9991
0.9983
0.9972
0.9958
0.9941
0.9922
0.9899
0.9875
0.9847
0.9817
0.9784
0.9748
0.9710
0.9669
0.9625
0.9579
0.9530
0.9479
0.9426
0.9369
0.9311
0.9250
0.9187
0.9121
0.9054
0.8984
0.8912
0.8837
0.8761
0.8683
0.8602
0.8520
0.8436
0.8350
0.8262
0.8172
0.8081
0.7988
0.7893
0.7797
0.7700
0.7601
0.7501
0.7400
0.7297
0.7193
0.7088
0.6982
0.6875
0.6767
0.6659
0.6549
0.6439
0.6329
0.6217
0.6106
0.5993
0.5881
0.5768
0.5655
0.5542
0.5428
0.5315
0.5202
0.5089
0.4976
0.4863
0.4750
0.4638
0.4527
0.4416
0.4305
0.4196
0.4087
0.3979
0.3871
0.3765
0.3659
0.3555
0.3452
0.3350
0.3249
0.3149
0.3051
0.2954
0.2859
0.2765
0.2673
0.2583
0.2494
0.2407
0.2322
0.2239
0.2157
0.2078
0.2001
0.1925
0.1852
0.1781
0.1712
0.1645
0.1581
0.1519
0.1459
0.1402
0.1347
0.1295
0.1245
0.1198
0.1153
0.1111
0.1071
0.1034
0.1000
0.0968
0.0939
0.0913
0.0889
0.0868
0.0850
0.0835
0.0822
0.0813
0.0806
0.0801
0.0800]';
for b = 0:h:(h*(cols-1))
ft = d(:,1+b/h)';
ft = [ft, conj(ft([((ftsize/2)):-1:2]))];
px = real(ifft(ft));
x((b+1):(b+ftsize)) = x((b+1):(b+ftsize))+px.*win;
end;