這兩週主要把DeepLearnToolbox-master裏的源碼研讀了下,大致框架基本理解,但是細節實現處還不是很懂。經過與同學的討論,基本理解了。然後嘗試實現一個卷積神經網絡,參考了UFLDL教程與習題代碼和網上的一些資源,嘗試寫了個一層的網絡,即輸入層+卷積層+池化層+全連接層。調試了很久,還有點問題。
由於在 MNIST 數字識別中,有 10 個不同的類別,所以採用 softmax 迴歸。其代價函數爲:
$$J(\theta) = -\frac{1}{m}\left[\sum_{i=1}^{m}\sum_{j=1}^{k} 1\{y^{(i)}=j\}\log\frac{e^{\theta_j^{T}x^{(i)}}}{\sum_{l=1}^{k}e^{\theta_l^{T}x^{(i)}}}\right] + \frac{\lambda}{2}\sum_{i,j}\theta_{ij}^{2}$$
詳細的可參考ufldl softmax迴歸
下面爲實現的代碼:
my_cnn.m:
%% my_cnn.m - one-layer CNN for MNIST: input -> conv -> mean pool -> softmax
%% STEP 0: load the data set and initialize parameters
imageDim = 28;        % image size (MNIST digits are 28x28)
numClasses = 10;      % number of classes (MNIST has 10 digits)
filterDim = 9;        % convolution filter size
numFilters = 20;      % number of filters, i.e. number of feature maps
poolDim = 2;          % pooling region size
lambda = 1e-4;        % weight decay strength (was undefined in the original script)

% Load the data sets
% Training set
addpath ./common/;                                   % fixed typo: was './commom/'
images = loadMNISTImages('./common/train-images');   % fixed typo: was 'trian-images'
images = reshape(images,imageDim,imageDim,[]);
labels = loadMNISTLabels('./common/train-labels');
% Test set
testImages = loadMNISTImages('./common/t10k-images');
testImages = reshape(testImages,imageDim,imageDim,[]);
testLabels = loadMNISTLabels('./common/t10k-labels');

% Initialize parameters.
% Initialization matters: weights are usually drawn from a zero-mean Gaussian
% with a small standard deviation (~0.1); biases can simply start at zero.
% Convolution layer weights and biases
Wc = 1e-1*randn(filterDim,filterDim,numFilters);
bc = zeros(numFilters, 1);
% Output size of the convolution layer
outDim = imageDim - filterDim + 1;
% Output size of the pooling layer (assumes outDim is divisible by poolDim)
outDim = outDim/poolDim;
% Size of the flattened hidden representation fed to the softmax layer
hiddenSize = outDim^2*numFilters;
% Initialize the fully-connected (softmax) layer weights and biases
r = sqrt(6) / sqrt(numClasses+hiddenSize+1);
Wd = rand(numClasses, hiddenSize) * 2 * r - r;
bd = zeros(numClasses, 1);

%% STEP 1: learn parameters with minibatch SGD + momentum
epochs = 3;           % number of full passes over the training set
                      % (original comment wrongly said "split the set into 3 batches")
alpha = 1e-1;         % learning rate, halved after each epoch
minibatch = 256;      % minibatch size
% Momentum schedule ('mom' was used but never defined in the original script).
% Start with a small momentum and raise it after a short warm-up, following
% the UFLDL cnnTrain.m reference implementation.
mom = 0.5;
momentum = 0.95;
momIncrease = 20;
% Velocity terms for the momentum updates
Wc_velocity = zeros(size(Wc));
Wd_velocity = zeros(size(Wd));
bc_velocity = zeros(size(bc));
bd_velocity = zeros(size(bd));
m = length(labels);

% Loop over the training images
it = 0;
for e = 1:epochs
    % Shuffle the indices so each minibatch is a random sample
    rp = randperm(m);
    for s = 1:minibatch:(m-minibatch+1)
        it = it + 1;
        % Raise the momentum once the warm-up period is over
        if it == momIncrease
            mom = momentum;
        end
        % Pick the next minibatch
        mb_data = images(:,:,rp(s:s+minibatch-1));
        mb_labels = labels(rp(s:s+minibatch-1));

        numImages = length(mb_labels);
        convDim = imageDim-filterDim+1;   % dimension of convolved output
        outputDim = (convDim)/poolDim;    % dimension of subsampled output

        % Forward propagation
        activations = cnnConvolve(filterDim, numFilters, mb_data, Wc, bc); % sigmoid(w*x+b)
        activationsPooled = cnnPool(poolDim, activations);
        activationsPooled = reshape(activationsPooled,[],numImages);
        % Softmax: h holds the exponentiated class scores, probs the probabilities
        h = exp(bsxfun(@plus,Wd * activationsPooled,bd));
        probs = bsxfun(@rdivide,h,sum(h,1));

        % Compute the cost: cross-entropy plus L2 weight decay
        logp = log(probs);
        index = sub2ind(size(logp),mb_labels',1:size(probs,2));
        ceCost = -sum(logp(index));
        wCost = lambda/2 * (sum(Wd(:).^2)+sum(Wc(:).^2));
        cost = ceCost/numImages + wCost;

        % Back-propagation
        output = zeros(size(probs));
        output(index) = 1;
        DeltaSoftmax = probs - output;    % sensitivity (delta) of the output layer
        DeltaPool = reshape(Wd' * DeltaSoftmax,outputDim,outputDim,numFilters,numImages);
        DeltaUnpool = zeros(convDim,convDim,numFilters,numImages);
        % Pooling-layer sensitivity: upsample with kron, spreading the error
        % evenly over each poolDim x poolDim region
        for imNum = 1:numImages
            for filterNum = 1:numFilters
                unpool = DeltaPool(:,:,filterNum,imNum);
                DeltaUnpool(:,:,filterNum,imNum) = kron(unpool,ones(poolDim))./(poolDim ^ 2);
            end
        end
        % Convolution-layer sensitivity (sigmoid derivative: a .* (1-a))
        DeltaConv = DeltaUnpool .* activations .* (1 - activations);

        % Compute the gradients
        Wd_grad = (1./numImages) .* DeltaSoftmax*activationsPooled'+lambda*Wd;
        bd_grad = (1./numImages) .* sum(DeltaSoftmax,2);
        bc_grad = zeros(size(bc));
        Wc_grad = zeros(filterDim,filterDim,numFilters);
        for filterNum = 1:numFilters
            delta = DeltaConv(:,:,filterNum,:);   % renamed from 'error' (shadows MATLAB builtin)
            bc_grad(filterNum) = (1./numImages) .* sum(delta(:));
        end
        % Rotate each sensitivity map 180 degrees so conv2 below computes the
        % cross-correlation needed for the weight gradient
        for filterNum = 1:numFilters
            for imNum = 1:numImages
                delta = DeltaConv(:,:,filterNum,imNum);
                DeltaConv(:,:,filterNum,imNum) = rot90(delta,2);
            end
        end
        for filterNum = 1:numFilters
            for imNum = 1:numImages
                Wc_grad(:,:,filterNum) = Wc_grad(:,:,filterNum) + conv2(mb_data(:,:,imNum),DeltaConv(:,:,filterNum,imNum),'valid');
            end
        end
        Wc_grad = (1./numImages) .* Wc_grad + lambda*Wc;

        %% STEP 2: update the parameters (SGD with momentum)
        Wc_velocity = mom*Wc_velocity+alpha*Wc_grad;
        Wc = Wc - Wc_velocity;
        Wd_velocity = mom*Wd_velocity+alpha*Wd_grad;
        Wd = Wd - Wd_velocity;
        bc_velocity = mom*bc_velocity+alpha*bc_grad;
        bc = bc - bc_velocity;
        bd_velocity = mom*bd_velocity+alpha*bd_grad;
        bd = bd - bd_velocity;
        fprintf('Epoch %d: Cost on iteration %d is %f\n',e,it,cost);
    end;

    % Anneal the learning rate after each epoch
    alpha = alpha/2.0;

    % Evaluate accuracy on the test set at the end of each epoch
    activations = cnnConvolve(filterDim, numFilters, testImages, Wc, bc); % sigmoid(w*x+b)
    activationsPooled = cnnPool(poolDim, activations);
    activationsPooled = reshape(activationsPooled,[],length(testLabels));
    h = exp(bsxfun(@plus,Wd * activationsPooled,bd));
    probs = bsxfun(@rdivide,h,sum(h,1));
    [~,preds] = max(probs,[],1);
    preds = preds';
    acc = sum(preds==testLabels)/length(preds);
    fprintf('Accuracy is %f\n',acc);
end;
cnnConvolve.m
function convolvedFeatures = cnnConvolve(filterDim, numFilters, images, W, b)
%CNNCONVOLVE Convolve every filter with every image and apply a sigmoid.
%
%   Parameters:
%     filterDim  - side length of each (square) filter
%     numFilters - number of filters / feature maps
%     images     - image stack, indexed as images(row, col, imageNum)
%     W          - filter bank of size filterDim x filterDim x numFilters
%     b          - per-filter bias vector of size numFilters x 1
%
%   Returns:
%     convolvedFeatures - activations of size
%       (imageDim-filterDim+1) x (imageDim-filterDim+1) x numFilters x numImages,
%       where each slice is sigmoid(conv2(image, filter, 'valid') + bias).

imageDim = size(images, 1);
numImages = size(images, 3);
convDim = imageDim - filterDim + 1;   % output side length of a 'valid' convolution

convolvedFeatures = zeros(convDim, convDim, numFilters, numImages);

for filterNum = 1:numFilters
    % conv2 implements true convolution (it flips its kernel), so flip the
    % filter once up front; the net effect is cross-correlation with W.
    kernel = rot90(squeeze(W(:, :, filterNum)), 2);
    bias = b(filterNum);
    for imageNum = 1:numImages
        im = squeeze(images(:, :, imageNum));
        % 'valid' keeps only fully-overlapping positions, then shift by the bias
        response = conv2(im, kernel, 'valid') + bias;
        % Sigmoid nonlinearity gives the hidden activation
        convolvedFeatures(:, :, filterNum, imageNum) = 1 ./ (1 + exp(-response));
    end
end
end
cnnPool.m
function pooledFeatures = cnnPool(poolDim, convolvedFeatures)
%CNNPOOL Mean-pool convolved features over poolDim x poolDim regions.
%
%   Parameters:
%     poolDim           - side length of each (square, non-overlapping) pooling region
%     convolvedFeatures - activations from cnnConvolve, indexed as
%                         convolvedFeatures(row, col, featureNum, imageNum)
%
%   Returns:
%     pooledFeatures - mean-pooled features of size
%       (convolvedDim/poolDim) x (convolvedDim/poolDim) x numFilters x numImages

convolvedDim = size(convolvedFeatures, 1);
numFilters = size(convolvedFeatures, 3);
numImages = size(convolvedFeatures, 4);
poolsPerSide = convolvedDim / poolDim;

pooledFeatures = zeros(poolsPerSide, poolsPerSide, numFilters, numImages);

% Averaging kernel: convolving with it yields the mean of every
% poolDim x poolDim window.
avgKernel = ones(poolDim) / poolDim^2;

for featureNum = 1:numFilters
    for imageNum = 1:numImages
        fmap = squeeze(convolvedFeatures(:, :, featureNum, imageNum));
        % Mean of every window, then keep only the non-overlapping ones
        % by sampling every poolDim-th entry.
        windowMeans = conv2(fmap, avgKernel, 'valid');
        pooledFeatures(:, :, featureNum, imageNum) = windowMeans(1:poolDim:end, 1:poolDim:end);
    end
end
end
另外,附上知乎裏看見的CNN調參經驗:
著作權歸作者所有。
商業轉載請聯繫作者獲得授權,非商業轉載請註明出處。
作者:匿名用戶
鏈接:http://www.zhihu.com/question/27962483/answer/39038229
來源:知乎
新:看了前面有人推薦的博客,寫得灰常好,博主就是我推薦論文的二作。閒來無事,乾脆把要點用中文列在這。注意,販賣二手貨且夾帶有私貨,言語概括而誇張,進階學習請讀原文,讀代碼。
讓CNN跑起來,以下是調參的所有祕密
收集高質量標註數據。
輸入輸出數據做好歸一化,以防出現數值問題。方法就是主成分分析啥的。
參數初始化很重要。太小了,參數根本走不動。一般權重參數0.01均方差,0均值的高斯分佈是萬能的,不行就試更大的。偏差參數全0即可。
用SGD ,minibatch size 128。或者更小size ,但那樣吞吐量變小,計算效率變低。
用帶momentum的SGD,二階方法不用也罷。
梯度更新的步長很重要。一般0.1是個萬能數值。調參可改進結果,具體做法是人肉監督:用另外的驗證集觀察測試錯誤率,一旦不降了,步長減半甚至更多。
梯度歸一化:除以minibatch size ,這樣就不顯式依賴minibatch size
限制權重參數的最大值防止跑飛。一般最大行範數不超過2或者4,否則同比收縮到這個值。
梯度大致應該總是隻改變參數的千分之一,偏離這個數字太遠的,調之。
dropout一定要用
relu一定要用
用過這些了還不行,只好反省人品了…
原答案分割線
授人以魚不如授人與漁。cnn調參,最好的參考論文就是那篇nips2012用cnn做imagenet的,沒有之一。dropout那篇文章可作爲最佳補充。
樓上還有推薦trick of trade那書的…真心沒用,那書不涉及pre training的章節,除了lecun寫的第一篇,其它看完呵呵就好。