Implementing a Single-Layer Convolutional Neural Network

Over the past two weeks I worked through the source code of DeepLearnToolbox-master. I now understand the overall framework, and after discussing the implementation details with classmates, those are mostly clear as well. I then tried to implement a convolutional neural network myself, drawing on the UFLDL tutorial, its exercise code, and other online resources. The result is a one-layer network: input layer + convolutional layer + pooling layer + fully connected layer. After a lot of debugging it still has some issues.
Since MNIST digit recognition has 10 different classes, softmax regression is used for the output. Its cost function is:

$$J(\theta) = -\frac{1}{m}\left[\sum_{i=1}^{m}\sum_{j=1}^{k} 1\{y^{(i)}=j\}\log\frac{e^{\theta_j^{T}x^{(i)}}}{\sum_{l=1}^{k}e^{\theta_l^{T}x^{(i)}}}\right] + \frac{\lambda}{2}\sum_{i,j}\theta_{ij}^{2}$$

For details, see the UFLDL notes on softmax regression.
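
As an aside, the script below computes h and probs by exponentiating Wd*x + bd directly, which can overflow for large activations. A common fix (not in the original script) is to subtract the per-column maximum before exponentiating, which leaves probs unchanged; a minimal sketch, using the same variable names as the script:

scores = bsxfun(@plus, Wd * activationsPooled, bd);   % class scores, numClasses x numExamples
scores = bsxfun(@minus, scores, max(scores, [], 1));  % shift by the column max; probs are unchanged
h = exp(scores);
probs = bsxfun(@rdivide, h, sum(h, 1));               % numerically stable softmax probabilities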

The implementation code follows:

my_cnn.m:

%% STEP 0: Load the data set and initialize parameters

imageDim=28;      %image size
numClasses=10;  %number of label classes; MNIST has 10
filterDim=9;    %filter (kernel) size
numFilters=20;  %number of filters, i.e. number of feature maps
poolDim=2;      %pooling region size

%Load the data sets
%Training set
addpath ./common/;
images=loadMNISTImages('./common/train-images');
images=reshape(images,imageDim,imageDim,[]);
labels = loadMNISTLabels('./common/train-labels');
labels(labels==0) = 10;  %remap digit 0 to class 10 so labels can index rows 1..10 (UFLDL convention)

%Test set
testImages = loadMNISTImages('./common/t10k-images');
testImages = reshape(testImages,imageDim,imageDim,[]);
testLabels = loadMNISTLabels('./common/t10k-labels');
testLabels(testLabels==0) = 10;  %same remapping as for the training labels

%Initialize parameters
%Initialization matters: weights are usually drawn from a zero-mean Gaussian with standard deviation 0.1, and biases can simply start at zero
%Convolutional layer weights and biases
Wc=1e-1*randn(filterDim,filterDim,numFilters);
bc = zeros(numFilters, 1);

%Size of the convolutional layer's output maps
outDim = imageDim - filterDim + 1;
%Size of the pooling layer's output maps
outDim=outDim/poolDim;
%Size of the hidden (pooled) layer
hiddenSize=outDim^2*numFilters;
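%With the values above: outDim = (28-9+1)/2 = 10, so hiddenSize = 10^2*20 = 2000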

%Initialize the fully connected layer's weights and biases
r  = sqrt(6) / sqrt(numClasses+hiddenSize+1);
Wd = rand(numClasses, hiddenSize) * 2 * r - r;
bd = zeros(numClasses, 1);

%% STEP 1: Learn the parameters
epochs = 3;       %number of full passes over the training set
alpha = 1e-1;     %learning rate
minibatch = 256;
lambda = 1e-4;    %weight-decay coefficient (assumed value: lambda is used below but was never set in the original script)
mom = 0.9;        %momentum coefficient (assumed value: mom is used below but was never set in the original script)

% Momentum state for gradient descent

Wc_velocity = zeros(size(Wc));
Wd_velocity = zeros(size(Wd));
bc_velocity = zeros(size(bc));
bd_velocity = zeros(size(bd));

m = length(labels);

%Loop over the training images
it = 0;
for e = 1:epochs

    % Shuffle the example order so minibatches can be sampled quickly
    rp = randperm(m);

    for s=1:minibatch:(m-minibatch+1)
        it = it + 1;

        % Take the next (shuffled) minibatch
        mb_data = images(:,:,rp(s:s+minibatch-1));
        mb_labels = labels(rp(s:s+minibatch-1));

        % Dimensions of the convolved and pooled outputs
        numImages = length(mb_labels);
        convDim = imageDim-filterDim+1; % dimension of convolved output
        outputDim = (convDim)/poolDim; % dimension of subsampled output

        %Forward pass
        activations = cnnConvolve(filterDim, numFilters, mb_data, Wc, bc);%sigmoid(Wx+b)
        activationsPooled = cnnPool(poolDim, activations);
        activationsPooled = reshape(activationsPooled,[],numImages);
        h = exp(bsxfun(@plus,Wd * activationsPooled,bd));     %unnormalized softmax scores
        probs = bsxfun(@rdivide,h,sum(h,1));                  %class probabilities in the softmax cost

        %Compute the cost function
        logp = log(probs);
        index = sub2ind(size(logp),mb_labels',1:size(probs,2));
        ceCost = -sum(logp(index));
        wCost = lambda/2 * (sum(Wd(:).^2)+sum(Wc(:).^2));
        cost = ceCost/numImages + wCost;

        %Backpropagation
        output = zeros(size(probs));
        output(index) = 1;
        DeltaSoftmax = probs - output; %sensitivity (delta) of the output layer

        DeltaPool = reshape(Wd' * DeltaSoftmax,outputDim,outputDim,numFilters,numImages);
        DeltaUnpool = zeros(convDim,convDim,numFilters,numImages);

        %Propagate the sensitivities through the pooling layer, using kron to upsample
        for imNum = 1:numImages
            for FilterNum = 1:numFilters
                unpool = DeltaPool(:,:,FilterNum,imNum);
                DeltaUnpool(:,:,FilterNum,imNum) = kron(unpool,ones(poolDim))./(poolDim ^ 2);
            end
        end
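        %Example: kron([1 2; 3 4], ones(2)) expands each entry into a 2x2 block,
        %so dividing by poolDim^2 spreads each pooled delta evenly over its region.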

        DeltaConv = DeltaUnpool .* activations .* (1 - activations); %sensitivity of the convolutional layer (sigmoid derivative)

        %Compute the gradients
        Wd_grad = (1./numImages) .* DeltaSoftmax*activationsPooled'+lambda*Wd;
        bd_grad = (1./numImages) .* sum(DeltaSoftmax,2);

        bc_grad = zeros(size(bc));
        Wc_grad = zeros(filterDim,filterDim,numFilters);

        for filterNum = 1:numFilters
            error = DeltaConv(:,:,filterNum,:);
            bc_grad(filterNum) = (1./numImages) .* sum(error(:));
        end

        %Flip each delta map so the conv2 below computes the correlation needed for the weight gradients
        for filterNum = 1:numFilters
            for imNum = 1:numImages
                error = DeltaConv(:,:,filterNum,imNum);
                DeltaConv(:,:,filterNum,imNum) = rot90(error,2);
            end
        end

        for filterNum = 1:numFilters
            for imNum = 1:numImages
                Wc_grad(:,:,filterNum) = Wc_grad(:,:,filterNum) + conv2(mb_data(:,:,imNum),DeltaConv(:,:,filterNum,imNum),'valid');
            end
        end
        Wc_grad = (1./numImages) .* Wc_grad + lambda*Wc;

        %% STEP 2: Update the parameters with momentum SGD
        Wc_velocity = mom*Wc_velocity+alpha*Wc_grad;
        Wc = Wc - Wc_velocity;
        Wd_velocity = mom*Wd_velocity+alpha*Wd_grad;
        Wd = Wd - Wd_velocity;
        bc_velocity = mom*bc_velocity+alpha*bc_grad;
        bc = bc - bc_velocity;
        bd_velocity = mom*bd_velocity+alpha*bd_grad;
        bd = bd - bd_velocity;
        fprintf('Epoch %d: Cost on iteration %d is %f\n',e,it,cost);
    end

    % Halve the learning rate after each epoch
    alpha = alpha/2.0;

    %test at each epoch
    activations = cnnConvolve(filterDim, numFilters, testImages, Wc, bc);   %sigmoid(wx+b)
    activationsPooled = cnnPool(poolDim, activations);
    activationsPooled = reshape(activationsPooled,[],length(testLabels));
    h = exp(bsxfun(@plus,Wd * activationsPooled,bd));
    probs = bsxfun(@rdivide,h,sum(h,1));
    [~,preds] = max(probs,[],1);
    preds = preds';
    acc = sum(preds==testLabels)/length(preds);
    fprintf('Accuracy is %f\n',acc);
end
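

Since the script still misbehaves, the standard UFLDL debugging step is a numerical gradient check on a tiny configuration (e.g. 2 filters and a handful of images). A minimal sketch, assuming the forward/cost computation above is wrapped in a hypothetical helper cnnCost(Wc, bc, Wd, bd, data, labels) that returns only the cost:

% Numerical gradient check for Wd (the same pattern applies to Wc, bc, bd).
epsilon = 1e-4;
numGradWd = zeros(size(Wd));
for i = 1:numel(Wd)
    oldVal = Wd(i);
    Wd(i) = oldVal + epsilon;
    costPlus = cnnCost(Wc, bc, Wd, bd, mb_data, mb_labels);
    Wd(i) = oldVal - epsilon;
    costMinus = cnnCost(Wc, bc, Wd, bd, mb_data, mb_labels);
    Wd(i) = oldVal;
    numGradWd(i) = (costPlus - costMinus) / (2*epsilon);  % central difference
end
% For a correct analytic gradient the difference should be tiny (around 1e-9).
fprintf('max |analytic - numeric| = %g\n', max(abs(Wd_grad(:) - numGradWd(:))));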



cnnConvolve.m:

function convolvedFeatures = cnnConvolve(filterDim, numFilters, images, W, b)
%cnnConvolve Returns the convolution of the features given by W and b with
%the given images
%
% Parameters:
%  filterDim - filter (feature) dimension
%  numFilters - number of feature maps
%  images - large images to convolve with, matrix in the form
%           images(r, c, image number)
%  W, b - W, b for features from the sparse autoencoder
%         W is of shape (filterDim,filterDim,numFilters)
%         b is of shape (numFilters,1)
%
% Returns:
%  convolvedFeatures - matrix of convolved features in the form
%                      convolvedFeatures(imageRow, imageCol, featureNum, imageNum)

numImages = size(images, 3);
imageDim = size(images, 1);
convDim = imageDim - filterDim + 1;

convolvedFeatures = zeros(convDim, convDim, numFilters, numImages);

% Instructions:
%   Convolve every filter with every image here to produce the 
%   (imageDim - filterDim + 1) x (imageDim - filterDim + 1) x numFeatures x numImages
%   matrix convolvedFeatures, such that 
%   convolvedFeatures(imageRow, imageCol, featureNum, imageNum) is the
%   value of the convolved featureNum feature for the imageNum image over
%   the region (imageRow, imageCol) to (imageRow + filterDim - 1, imageCol + filterDim - 1)
%
% Expected running times: 
%   Convolving with 100 images should take less than 30 seconds 
%   Convolving with 5000 images should take around 2 minutes
%   (So to save time when testing, you should convolve with less images, as
%   described earlier)


for imageNum = 1:numImages
  for filterNum = 1:numFilters

    % convolution of image with feature matrix
    convolvedImage = zeros(convDim, convDim);

    % Obtain the feature (filterDim x filterDim) needed during the convolution

    %%% YOUR CODE HERE %%%
    filter = squeeze(W(:,:,filterNum));
    % Flip the feature matrix because of the definition of convolution, as explained later
    filter = rot90(squeeze(filter),2);

    % Obtain the image
    im = squeeze(images(:, :, imageNum));

    % Convolve "filter" with "im", adding the result to convolvedImage
    % be sure to do a 'valid' convolution

    %%% YOUR CODE HERE %%%
    convolvedImage = conv2(im,filter,'valid');
    % Add the bias unit
    % Then, apply the sigmoid function to get the hidden activation

    %%% YOUR CODE HERE %%%
    convolvedImage = bsxfun(@plus,convolvedImage,b(filterNum));
    convolvedImage = 1 ./ (1+exp(-convolvedImage));

    convolvedFeatures(:, :, filterNum, imageNum) = convolvedImage;
  end
end


end
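
To verify cnnConvolve, the UFLDL exercise spot-checks random entries of its output against a direct computation of sigmoid(sum(patch .* filter) + bias); the snippet below follows that recipe (the 1e-9 tolerance and the choice of 8 test images are the exercise's defaults):

% Spot-check cnnConvolve against a direct per-patch computation.
convImages = images(:, :, 1:8);
convolvedFeatures = cnnConvolve(filterDim, numFilters, convImages, Wc, bc);
for i = 1:1000
    filterNum = randi(numFilters);
    imageNum  = randi(8);
    imageRow  = randi(imageDim - filterDim + 1);
    imageCol  = randi(imageDim - filterDim + 1);
    patch = convImages(imageRow:imageRow + filterDim - 1, ...
                       imageCol:imageCol + filterDim - 1, imageNum);
    feature = sum(sum(patch .* Wc(:, :, filterNum))) + bc(filterNum);
    feature = 1 ./ (1 + exp(-feature));          % same sigmoid as cnnConvolve
    if abs(feature - convolvedFeatures(imageRow, imageCol, filterNum, imageNum)) > 1e-9
        error('Convolved feature does not match the direct computation');
    end
end
disp('Convolution code passed the spot check.');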

cnnPool.m:

function pooledFeatures = cnnPool(poolDim, convolvedFeatures)
%cnnPool Pools the given convolved features
%
% Parameters:
%  poolDim - dimension of pooling region
%  convolvedFeatures - convolved features to pool (as given by cnnConvolve)
%                      convolvedFeatures(imageRow, imageCol, featureNum, imageNum)
%
% Returns:
%  pooledFeatures - matrix of pooled features in the form
%                   pooledFeatures(poolRow, poolCol, featureNum, imageNum)
%     

numImages = size(convolvedFeatures, 4);
numFilters = size(convolvedFeatures, 3);
convolvedDim = size(convolvedFeatures, 1);

pooledFeatures = zeros(convolvedDim / poolDim, ...
        convolvedDim / poolDim, numFilters, numImages);

% Instructions:
%   Now pool the convolved features in regions of poolDim x poolDim,
%   to obtain the 
%   (convolvedDim/poolDim) x (convolvedDim/poolDim) x numFeatures x numImages 
%   matrix pooledFeatures, such that
%   pooledFeatures(poolRow, poolCol, featureNum, imageNum) is the 
%   value of the featureNum feature for the imageNum image pooled over the
%   corresponding (poolRow, poolCol) pooling region. 
%   
%   Use mean pooling here.

%%% YOUR CODE HERE %%%
    for imageNum = 1:numImages
        for featureNum = 1:numFilters
            featuremap = squeeze(convolvedFeatures(:,:,featureNum,imageNum));
            pooledFeaturemap = conv2(featuremap,ones(poolDim)/(poolDim^2),'valid');
            pooledFeatures(:,:,featureNum,imageNum) = pooledFeaturemap(1:poolDim:end,1:poolDim:end);
        end
    end
end
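
cnnPool can be sanity-checked the same way the UFLDL exercise does: mean-pool a small matrix whose block averages are easy to verify by hand.

% Mean-pool an 8x8 matrix with 4x4 regions and compare with the block means.
testMatrix = reshape(1:64, 8, 8);
expectedMatrix = [mean(mean(testMatrix(1:4, 1:4))) mean(mean(testMatrix(1:4, 5:8))); ...
                  mean(mean(testMatrix(5:8, 1:4))) mean(mean(testMatrix(5:8, 5:8)))];
testMatrix = reshape(testMatrix, 8, 8, 1, 1);   % same layout as convolvedFeatures
pooledFeatures = squeeze(cnnPool(4, testMatrix));
if ~isequal(pooledFeatures, expectedMatrix)
    error('Pooling incorrect');
end
disp('Pooling code passed the test.');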

Finally, here are some CNN tuning tips I came across on Zhihu:

Copyright belongs to the author.
For commercial reuse, please contact the author for permission; for non-commercial reuse, please credit the source.
Author: anonymous user
Link: http://www.zhihu.com/question/27962483/answer/39038229
Source: Zhihu

Update: the blog post recommended earlier in this thread is excellent; its author is the second author of the paper I recommend below. With some spare time, I have listed the key points here. Bear in mind this is second-hand material with my own opinions mixed in, terse and somewhat exaggerated; for serious study, read the original post and the code.

To get a CNN working, here are all the secrets of tuning:

  • Collect high-quality labeled data.

  • Normalize the inputs and outputs to avoid numerical problems, e.g. with PCA-style preprocessing.

  • Parameter initialization matters a lot. If the weights start too small, learning barely moves. A zero-mean Gaussian with standard deviation 0.01 works for almost everything; if not, try larger values. Biases can all be zero.

  • Use SGD with a minibatch size of 128. Smaller sizes also work, but throughput drops and computation becomes less efficient.

  • Use SGD with momentum; second-order methods can safely be skipped.

  • The gradient step size (learning rate) matters. 0.1 is a near-universal default. Tuning it improves results; the usual recipe is manual supervision: watch the error on a separate validation set, and once it stops falling, halve the step size or cut it even more.

  • Normalize the gradient: divide it by the minibatch size, so the update does not depend explicitly on the minibatch size.

  • Cap the weights so they cannot blow up: keep the maximum row norm below about 2 or 4, and rescale the row proportionally whenever it exceeds that cap (see the sketch after this list).

  • A gradient step should roughly change each parameter by about one part in a thousand; if it deviates far from that, adjust the step size.

  • Always use dropout.

  • Always use ReLU.

If all of this still fails, the only thing left to blame is your luck…
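
A minimal sketch of the max-norm tip above, for some weight matrix W with one row per unit (maxNorm is an assumed hyperparameter; the tip suggests 2 to 4):

% Max-norm constraint: shrink any row of W whose L2 norm exceeds maxNorm.
maxNorm = 4;                                    % assumed cap, per the tip above
rowNorms = sqrt(sum(W.^2, 2));                  % L2 norm of each row
scale = min(1, maxNorm ./ max(rowNorms, eps));  % 1 where the row is already within the cap
W = bsxfun(@times, W, scale);                   % rescale offending rows proportionally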

Original answer below the line

Teaching someone to fish beats giving them a fish. For CNN tuning, the best reference is the NIPS 2012 paper that applied CNNs to ImageNet, bar none. The dropout paper makes the best supplement.

Someone above also recommended the "Tricks of the Trade" book… honestly, it is not useful: among its chapters unrelated to pre-training, apart from the first one written by LeCun, the rest merit little more than a smile.
