Neural Networks code


Marc'Aurelio Ranzato (Google)’s Neural Networks code.


Deep Learning Methods for Vision

CVPR 2012 Tutorial 

http://cs.nyu.edu/~fergus/tutorials/deep_learning_cvpr12/

Contents

% Toy demo about neural nets showing: 1) the input/output mapping
% as a function of the number of layers, 2) the input/output mapping
% as a function of the number of hidden units and 3) a simple example
% of training for regression.
%
% Usage: from Matlab command line type:
%        demo_nnet
%
% Marc'Aurelio Ranzato
% 27 May 2012
% [email protected]

%choice=3; % select which demo to run (1-3)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%if(choice==1)
% DEMO #1
fprintf('This demo shows input output mapping of')
fprintf(' neural nets with one input and one output.\n')
fprintf('We will vary the number of hidden layers.\n')
fprintf('We keep the number of hidden units to 100.\n')
clear
randn('seed',7) % set the random seed for reproducibility
number_of_hidden_layers = [1 2 3];
number_of_hidden_units = 100;
input = -20:.01:20; % 1 dimensional input

figure(1); clf;%Clear current figure window
plot_style = {'-b','-.r','--g',':k'};
for num_hid = 1 : length(number_of_hidden_layers) % each iteration adds one more 100-unit hidden layer
    size_of_layers = number_of_hidden_units * ...
        ones(1,number_of_hidden_layers(num_hid)); % [100], [100 100], [100 100 100] for 1, 2, 3 hidden layers
    size_of_layers = [size_of_layers 1]; % Add size of output
    size_of_layers = [1 size_of_layers]; % Add size of input
    fprintf('Generating parameters at random\n')
    for ll = 1 : length(size_of_layers) - 1 % initialize weights W and biases b
        W{ll} = .2*randn(size_of_layers(ll+1), size_of_layers(ll));
        b{ll} = zeros(size_of_layers(ll+1),1); % with 0 biases, output is symmetric
    end
    fprintf('FPROP: computing the output values\n')
    h{1} = tanhAct(bsxfun(@plus, W{1}*input, b{1}));% Hyperbolic tangent non-linearity.
    for ll = 2 : length(size_of_layers) - 1
        h{ll} = tanhAct(bsxfun(@plus, W{ll}*h{ll-1}, b{ll}));
    end
    figure(1);
    hold on
    plot(input, h{length(size_of_layers)-1},plot_style{num_hid}, ...
        'LineWidth',4)
    hold off
end
hndl = legend('1 hidden layer','2 hidden layers','3 hidden layers', ...
    'Location','SouthEast');
set(hndl,'FontSize',14,'FontWeight','bold')
grid on
xlabel('input','FontSize',16)
ylabel('output','FontSize',16)
saveas(1,'input_output_varying_num_layers.png')

%From this plot we can see that adding hidden layers increases the
%non-linearity of the input/output mapping. It has been proved that, with
%enough capacity, such a network can approximate arbitrary continuous functions.
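%Written out, the mapping computed by the FPROP loop above is
%   h{1} = tanh(W{1}*x + b{1}),   h{l} = tanh(W{l}*h{l-1} + b{l}),  l = 2..L,
%and the last h{l} is what gets plotted as the network output. Because tanh is
%an odd function and the biases are zero here, the plotted curve is
%antisymmetric about the origin.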
%end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%if (choice==2)
% DEMO #2
fprintf('This demo shows input output mapping of')
fprintf(' neural nets with one input and one output.\n')
fprintf('We will vary the number of hidden units.\n')
fprintf('We keep the number of hidden layers to 3.\n')
clear
randn('seed',8)
number_of_hidden_layers = 3;
number_of_hidden_units = [10 100 1000];
input = -20:.01:20; % 1 dimensional input

figure(2); clf;
plot_style = {'-b','-.r','--g',':k'};
for num_hid = 1 : length(number_of_hidden_units)
    size_of_layers = number_of_hidden_units(num_hid) * ...
        ones(1,number_of_hidden_layers);
    size_of_layers = [size_of_layers 1]; % Add size of output
    size_of_layers = [1 size_of_layers]; % Add size of input
    fprintf('Generating parameters at random\n')
    fprintf('We also set biases at random making the mapping')
    fprintf(' almost certainly not (anti-)symmetric\n')
    for ll = 1 : length(size_of_layers) - 1
        W{ll} = .2*randn(size_of_layers(ll+1), size_of_layers(ll));
        b{ll} = .1*randn(size_of_layers(ll+1),1); % biases set at random
    end
    fprintf('FPROP: computing the output values\n')
    h{1} = tanhAct(bsxfun(@plus, W{1}*input, b{1}));
    for ll = 2 : length(size_of_layers) - 1
        h{ll} = tanhAct(bsxfun(@plus, W{ll}*h{ll-1}, b{ll})); % h{length(size_of_layers)-1} is the output of the last layer
    end
    figure(2);
    hold on
    plot(input, h{length(size_of_layers)-1},plot_style{num_hid}, ...
        'LineWidth',4)
    hold off
end
hndl = legend('10 hiddens','100 hiddens','1000 hiddens', ...
    'Location','NorthWest'); % legend in the north-west corner
set(hndl,'FontSize',14,'FontWeight','bold')
grid on
xlabel('input','FontSize',16)
ylabel('output','FontSize',16)
saveas(2,'input_output_varying_num_hiddens.png')
%end
%From this plot we can likewise see that increasing the number of hidden
%units per layer increases the non-linearity of the mapping. It has been
%proved that, with enough hidden units, the network can approximate
%arbitrary continuous functions.
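%More precisely, the classical universal approximation results say that even a
%single hidden layer with enough non-polynomial (e.g. tanh) units can
%approximate any continuous function on a compact set to arbitrary accuracy;
%depth and width trade off against how many units are needed in practice.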
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%if (choice==3)
% DEMO #3
fprintf('This demo shows a simple example of training')
fprintf(' a neural net to perform a desired input/output mapping')
fprintf(', a task known as regression.\n')
fprintf('The output is a piece of a cosine function.\n')
fprintf('The optimizer is stochastic gradient descent.\n')
clear
randn('seed',123)
number_of_hidden_layers = 3;
number_of_hidden_units = 1000;
input = -5:.01:5; % 1 dimensional input
target = cos(input + pi/3); % desired output
fprintf('Shuffling data\n') % shuffle so that each mini-batch is a random sample of input/target pairs
pp = randperm(length(input));
input = input(pp); % Shuffle data.
target = target(pp);
fprintf('Using stochastic gradient descent as optimizer.\n')
learning_rate = 0.001;
number_of_epochs = 100;
mini_batch_size = 100;
num_batches = floor(size(input,2)/mini_batch_size); % number of full mini-batches (rounded down)
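% With 1001 samples (input = -5:.01:5) and mini-batches of 100, this gives 10
% batches per epoch, so the last sample is never used for training.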
fprintf('Number of sweeps over the whole data: %d\n', number_of_epochs)
fprintf('Size of mini-batch: %d\n', mini_batch_size)
figure(3); clf;
plot_style = {'+b','or','dg','^k'};
plot(input, target, plot_style{end}, 'LineWidth',4)
fprintf('Using %d hidden layer neural net with %d hidden units.\n', ...
    number_of_hidden_layers, number_of_hidden_units)
size_of_layers = number_of_hidden_units * ...
    ones(1,number_of_hidden_layers);
size_of_layers = [size_of_layers 1]; % Add size of output
size_of_layers = [1 size_of_layers]; % Add size of input
fprintf('Generating parameters at random\n')
for ll = 1 : length(size_of_layers) - 1
    W{ll} = .2*randn(size_of_layers(ll+1), size_of_layers(ll));
    b{ll} = zeros(size_of_layers(ll+1),1); % biases initialized to zero
    Wgrad{ll} = zeros(size(W{ll})); % Initialize gradient buffers.
    bgrad{ll} = zeros(size(b{ll}));
end
% Plot initial prediction
h{1} = tanhAct(bsxfun(@plus, W{1}*input, b{1}));
for ll = 2 : length(size_of_layers) - 2
    h{ll} = tanhAct(bsxfun(@plus, W{ll}*h{ll-1}, b{ll}));
end
h{ll+1} = bsxfun(@plus, W{ll+1}*h{ll}, b{ll+1}); % prediction: the output layer is linear (no non-linearity); this choice depends on the range of the target output
hold on
plot(input, h{ll+1}, plot_style{1},'LineWidth',4);
hold off
fprintf('Initial error %g\n', sum(sum( (h{ll+1}-input).^2 )) / length(input));
fprintf('Start training!\n')
for ee = 1 : number_of_epochs
    fprintf('Epoch %d: ', ee)
    error = 0;
    tic;
    for bb = 1 : num_batches
        % Get current mini-batch.
        in = input(:, 1 + mini_batch_size * (bb - 1) : mini_batch_size * bb);
        desired = target(:, 1 + mini_batch_size * (bb-1) : mini_batch_size * bb);
        % FPROP
        [h{1} dh{1}] = tanhAct(bsxfun(@plus, W{1}*in, b{1})); % dh{ll} holds the tanh derivative at each pre-activation, needed for BACKPROP
        for ll = 2 : length(size_of_layers) - 2
            [h{ll} dh{ll}] = tanhAct(bsxfun(@plus, W{ll}*h{ll-1}, b{ll}));
        end
        h{ll+1} = bsxfun(@plus, W{ll+1}*h{ll}, b{ll+1}); % prediction

        % COMPUTE ERROR

        difference = h{ll+1} - desired;
        error = error + 0.5 * sum(sum(difference.^2));
        derivative{ll+1} = difference;

        % BACKPROP
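        % The loop below applies the chain rule layer by layer:
        %   Wgrad{ll}        = derivative{ll} * h{ll-1}'             (dE/dW)
        %   bgrad{ll}        = sum(derivative{ll}, 2)                (dE/db)
        %   derivative{ll-1} = (W{ll}' * derivative{ll}) .* dh{ll-1}
        % where derivative{ll} is dE/d(pre-activation) of layer ll and
        % dh{ll-1} = 1 - h{ll-1}.^2 is the tanh derivative returned by tanhAct.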

        for ll = length(size_of_layers) - 1 : -1 : 2
            % Compute derivative w.r.t. parameters
            Wgrad{ll} = derivative{ll} * h{ll-1}';
            bgrad{ll} = sum(derivative{ll},2); % sum over the mini-batch samples (columns)
            % Compute derivative w.r.t. input of this layer
            derivative{ll-1} = (W{ll}' * derivative{ll}) .* dh{ll-1};
        end
        Wgrad{1} = derivative{1} * in';
        bgrad{1} = sum(derivative{1},2);
        % Update parameters: (stochastic) gradient descent step on this mini-batch
        for ll = 1 : length(size_of_layers) - 1
            W{ll} = W{ll} - (learning_rate / mini_batch_size) * Wgrad{ll};
            b{ll} = b{ll} - (learning_rate / mini_batch_size) * bgrad{ll};
        end
    end
    timing(ee) = toc;
    fprintf('Average error %g\n', error / (mini_batch_size * num_batches))
    errors(ee) = error;
    % Plot predictions at first and last epoch.
    if (ee == 1) || (ee == number_of_epochs)
        h{1} = tanhAct(bsxfun(@plus, W{1}*input, b{1}));
        for ll = 2 : length(size_of_layers) - 2
            h{ll} = tanhAct(bsxfun(@plus, W{ll}*h{ll-1}, b{ll}));
        end
        h{ll+1} = bsxfun(@plus, W{ll+1}*h{ll}, b{ll+1}); % prediction
        hold on
        if (ee == 1)
            plot(input, h{ll+1}, plot_style{2},'LineWidth',4);
        else
            plot(input, h{ll+1}, plot_style{3},'LineWidth',4);
        end
        hold off
    end
end
grid on
hndl = legend('Target','Before training', 'After 1 epoch', ...
    'At the end of training', 'Location', 'NorthWest');
set(hndl, 'FontSize', 14, 'FontWeight', 'bold')
grid on
xlabel('input','FontSize',16)
ylabel('output','FontSize',16)
saveas(3,'demo_regression.png')
%end
%With enough hidden layers, enough units per layer, and enough training
%iterations (e.g. of stochastic gradient descent), a neural network can
%approximate the target function to arbitrary accuracy. However, the classical
%ANN model and the back-propagation algorithm have hard-to-overcome drawbacks,
%such as difficulty in training deep architectures and handling
%high-dimensional inputs (see Ng's Deep Learning / UFLDL tutorial for details).
This demo shows input output mapping of neural nets with one input and one output.
We will vary the number of hidden layers.
We keep the number of hidden units to 100.
Generating parameters at random
FPROP: computing the output values
Generating parameters at random
FPROP: computing the output values
Generating parameters at random
FPROP: computing the output values
This demo shows input output mapping of neural nets with one input and one output.
We will vary the number of hidden units.
We keep the number of hidden layers to 3.
Generating parameters at random
We also set biases at random making the mapping almost certainly not (anti-)symmetric
FPROP: computing the output values
Generating parameters at random
We also set biases at random making the mapping almost certainly not (anti-)symmetric
FPROP: computing the output values
Generating parameters at random
We also set biases at random making the mapping almost certainly not (anti-)symmetric
FPROP: computing the output values
This demo shows a simple example of training a neural net to perform a desired input/output mapping, a task known as regression.
The output is a piece of a cosine function.
The optimizer is stochastic gradient descent.
Shuffling data
Using stochastic gradient descent as optimizer.
Number of sweeps over the whole data: 100
Size of mini-batch: 100
Using 3 hidden layer neural net with 1000 hidden units.
Generating parameters at random
Initial error 6.65085
Start training!
Epoch 1: Average error 76.0816
Epoch 2: Average error 0.114674
Epoch 3: Average error 0.0738014
Epoch 4: Average error 0.0520657
Epoch 5: Average error 0.0384725
Epoch 6: Average error 0.0297563
Epoch 7: Average error 0.0240331
Epoch 8: Average error 0.0201614
Epoch 9: Average error 0.0174473
Epoch 10: Average error 0.0154716
Epoch 11: Average error 0.0139804
Epoch 12: Average error 0.0128181
Epoch 13: Average error 0.0118873
Epoch 14: Average error 0.0111246
Epoch 15: Average error 0.0104878
Epoch 16: Average error 0.00994748
Epoch 17: Average error 0.0094828
Epoch 18: Average error 0.0090784
Epoch 19: Average error 0.00872281
Epoch 20: Average error 0.00840724
Epoch 21: Average error 0.00812491
Epoch 22: Average error 0.00787045
Epoch 23: Average error 0.0076396
Epoch 24: Average error 0.00742893
Epoch 25: Average error 0.00723564
Epoch 26: Average error 0.00705746
Epoch 27: Average error 0.00689247
Epoch 28: Average error 0.00673911
Epoch 29: Average error 0.00659603
Epoch 30: Average error 0.00646211
Epoch 31: Average error 0.00633639
Epoch 32: Average error 0.00621804
Epoch 33: Average error 0.00610634
Epoch 34: Average error 0.00600067
Epoch 35: Average error 0.00590048
Epoch 36: Average error 0.00580529
Epoch 37: Average error 0.00571467
Epoch 38: Average error 0.00562825
Epoch 39: Average error 0.00554567
Epoch 40: Average error 0.00546664
Epoch 41: Average error 0.00539089
Epoch 42: Average error 0.00531815
Epoch 43: Average error 0.00524821
Epoch 44: Average error 0.00518086
Epoch 45: Average error 0.00511592
Epoch 46: Average error 0.00505321
Epoch 47: Average error 0.00499258
Epoch 48: Average error 0.00493389
Epoch 49: Average error 0.004877
Epoch 50: Average error 0.0048218
Epoch 51: Average error 0.00476817
Epoch 52: Average error 0.00471603
Epoch 53: Average error 0.00466526
Epoch 54: Average error 0.0046158
Epoch 55: Average error 0.00456755
Epoch 56: Average error 0.00452046
Epoch 57: Average error 0.00447444
Epoch 58: Average error 0.00442944
Epoch 59: Average error 0.0043854
Epoch 60: Average error 0.00434228
Epoch 61: Average error 0.00430001
Epoch 62: Average error 0.00425856
Epoch 63: Average error 0.00421789
Epoch 64: Average error 0.00417795
Epoch 65: Average error 0.00413872
Epoch 66: Average error 0.00410015
Epoch 67: Average error 0.00406221
Epoch 68: Average error 0.00402489
Epoch 69: Average error 0.00398814
Epoch 70: Average error 0.00395195
Epoch 71: Average error 0.00391629
Epoch 72: Average error 0.00388115
Epoch 73: Average error 0.00384649
Epoch 74: Average error 0.00381231
Epoch 75: Average error 0.00377858
Epoch 76: Average error 0.00374529
Epoch 77: Average error 0.00371242
Epoch 78: Average error 0.00367996
Epoch 79: Average error 0.00364789
Epoch 80: Average error 0.0036162
Epoch 81: Average error 0.00358488
Epoch 82: Average error 0.00355391
Epoch 83: Average error 0.00352329
Epoch 84: Average error 0.00349301
Epoch 85: Average error 0.00346304
Epoch 86: Average error 0.0034334
Epoch 87: Average error 0.00340406
Epoch 88: Average error 0.00337501
Epoch 89: Average error 0.00334626
Epoch 90: Average error 0.00331779
Epoch 91: Average error 0.00328959
Epoch 92: Average error 0.00326166
Epoch 93: Average error 0.00323399
Epoch 94: Average error 0.00320657
Epoch 95: Average error 0.00317941
Epoch 96: Average error 0.00315248
Epoch 97: Average error 0.0031258
Epoch 98: Average error 0.00309934
Epoch 99: Average error 0.00307311
Epoch 100: Average error 0.00304711
function [output input_dx] = tanhAct(input)
% Hyperbolic tangent non-linearity.
% input is the input value (it can also be a vector or a matrix).
% output is the output value
% input_dx [optional] are the derivatives of the output w.r.t. input
%
% 1 June 2012
% Marc'Aurelio Ranzato
% [email protected]

output = tanh(input);
if nargout > 1
    input_dx = (1-output) .* (1+output);
end
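
As a quick sanity check (not part of the original demo), the analytic derivative returned by tanhAct can be compared against a central finite difference. A minimal sketch, assuming tanhAct.m is on the MATLAB path:

% Finite-difference check of the derivative returned by tanhAct.
x = randn(3,5);                          % arbitrary test inputs
[y, dy] = tanhAct(x);                    % analytic derivative: 1 - tanh(x).^2
delta = 1e-6;                            % finite-difference step
dy_fd = (tanhAct(x + delta) - tanhAct(x - delta)) / (2*delta); % central difference
fprintf('max abs difference: %g\n', max(abs(dy(:) - dy_fd(:))))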





