線性迴歸

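損失函數：線性迴歸通常使用平方誤差來衡量預測值與真實值之間的差距，並對所有訓練樣本取平均：

$$L(\mathbf{w}, b) = \frac{1}{n}\sum_{i=1}^{n} \frac{1}{2}\left(\mathbf{w}^\top \mathbf{x}^{(i)} + b - y^{(i)}\right)^2$$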

優化函數-梯度下降：

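在求數值解的優化算法中，小批量隨機梯度下降（mini-batch stochastic gradient descent）在深度學習中被廣泛使用。它的算法很簡單：先選取一組模型參數的初始值，如隨機選取；接下來對參數進行多次迭代，使每次迭代都可能降低損失函數的值。在每次迭代中，先隨機均勻採樣一個由固定數目訓練數據樣本所組成的小批量（mini-batch）$\mathcal{B}$，然後求小批量中數據樣本的平均損失有關模型參數的導數（梯度），最後用此結果與預先設定的一個正數（即學習率）的乘積作爲模型參數在本次迭代的減小量：

$$(\mathbf{w}, b) \leftarrow (\mathbf{w}, b) - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \partial_{(\mathbf{w}, b)} l^{(i)}(\mathbf{w}, b)$$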

學習率: $\eta$ 代表在每次優化中，能夠學習的步長的大小
批量大小: $|\mathcal{B}|$ 是小批量計算中的批量大小（batch size）

優化函數有以下兩個步驟：

(i)初始化模型參數，一般來說使用隨機初始化；
(ii)我們在數據上迭代多次，通過在負梯度方向移動參數來更新每個參數。

線性迴歸模型使用pytorch的簡潔實現：

import torch
from torch import nn
import numpy as np
import torch.utils.data as Data
from torch.nn import init
import torch.optim as optim

# Seed both RNGs. The synthetic data below is drawn with NumPy, so seeding
# torch alone would still produce a different dataset on every run.
torch.manual_seed(1)
np.random.seed(1)

# Generate the dataset
num_inputs = 2  # number of features per example (x1 and x2)
num_examples = 1000  # number of samples in the dataset

true_w = [2, -3.4]  # ground-truth weights; the learned weights should approach these
true_b = 4.2  # ground-truth bias
# Feature matrix of shape (num_examples, num_inputs): column 0 is x1, column 1 is x2.
features = torch.tensor(np.random.normal(0, 1, (num_examples, num_inputs)), dtype=torch.float)
# Noise-free targets y = w1 * x1 + w2 * x2 + b, a 1-D tensor with num_examples entries.
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
# Perturb the targets with Gaussian noise (mean 0, std 0.01).
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)

# Read the data
batch_size = 10
# Pair every feature row with its label.
dataset = Data.TensorDataset(features, labels)

# Wrap the dataset in a DataLoader that yields shuffled mini-batches.
data_iter = Data.DataLoader(
    dataset=dataset,        # torch TensorDataset
    batch_size=batch_size,  # mini-batch size
    shuffle=True,           # reshuffle the samples every epoch
    # Load in the main process. num_workers > 0 starts worker *processes*
    # (not threads); on platforms that start processes with spawn
    # (Windows, macOS) that requires an `if __name__ == "__main__":` guard,
    # which this flat script does not have. The dataset is a small in-memory
    # tensor, so extra workers would only add start-up overhead anyway.
    num_workers=0,
)

# 定義模型
class LinearNet(nn.Module):
    """Linear regression model: one fully connected layer with a single output.

    Args:
        n_feature: Number of input features per example.
    """

    def __init__(self, n_feature):
        super().__init__()
        # nn.Linear applies an affine map; here it takes n_feature inputs
        # and produces 1 output per example.
        self.linear = nn.Linear(n_feature, 1)

    def forward(self, x):
        """Return the linear layer's prediction for the batch ``x``."""
        return self.linear(x)
    
from collections import OrderedDict

# Instantiate the hand-written LinearNet module.
net = LinearNet(num_inputs)

# Three equivalent ways to assemble the same one-layer network with
# nn.Sequential. Every assignment rebinds `net`, so only the last one is
# the model that gets trained below.

# Option 1: pass the layers positionally (more layers can follow).
net = nn.Sequential(nn.Linear(num_inputs, 1))

# Option 2: start from an empty container and register named layers one by one.
net = nn.Sequential()
net.add_module(name='linear', module=nn.Linear(num_inputs, 1))

# Option 3: pass an ordered mapping of layer name -> layer.
net = nn.Sequential(OrderedDict(linear=nn.Linear(num_inputs, 1)))

 # 初始化模型參數

# Initialise the single linear layer: weights drawn from N(0, 0.01^2), bias
# set to zero (`net[0].bias.data.fill_(0)` would modify the bias directly).
nn.init.normal_(net[0].weight, mean=0.0, std=0.01)
nn.init.constant_(net[0].bias, val=0.0)

# Loss function: mean squared error.
loss = nn.MSELoss()

# Optimizer: built-in stochastic gradient descent.
# Prototype: torch.optim.SGD(params, lr=, momentum=0, dampening=0, weight_decay=0, nesterov=False)
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.03)
print(optimizer)

# Training
num_epochs = 3  # number of passes over the dataset
for epoch in range(1, num_epochs + 1):
    for batch_features, batch_labels in data_iter:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        # Forward pass; the labels are reshaped into a column so they line up
        # with the network output before the loss is computed.
        batch_loss = loss(net(batch_features), batch_labels.view(-1, 1))
        batch_loss.backward()  # back-propagate
        optimizer.step()  # update the parameters
    # Report the loss of the last mini-batch processed in this epoch.
    print('epoch %d, loss: %f' % (epoch, batch_loss.item()))

# Compare the learned parameters with the ground truth.
dense = net[0]
for target, learned in ((true_w, dense.weight.data), (true_b, dense.bias.data)):
    print(target, learned)
# Sample output recorded by the original author:
#   [2, -3.4] tensor([[ 2.0008, -3.4007]])
#   4.2 tensor([4.2006])

softmax和分類模型

softmax基本概念：

softmax迴歸與線性迴歸相同，都是將輸入特徵與權重做線性疊加，其輸出層也是一個全連接層。與線性迴歸的最大不同在於：softmax迴歸的輸出值個數等於標籤中的類別數。

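softmax迴歸對樣本 $i$ 分類的矢量計算表達式爲：

$$\mathbf{o}^{(i)} = \mathbf{x}^{(i)} \mathbf{W} + \mathbf{b}, \qquad \hat{\mathbf{y}}^{(i)} = \mathrm{softmax}\left(\mathbf{o}^{(i)}\right)$$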

總結：

Softmax迴歸適用於分類問題，使用Softmax運算輸出類別的概率分佈；
Softmax迴歸是一個單層神經網絡，輸出個數等於分類問題中的類別個數；
交叉熵用於衡量兩個概率分佈的差異；

softmax迴歸模型使用pytorch的簡潔實現：

# 加載各種包或者模塊
import torch
from torch import nn
from torch.nn import init
import numpy as np
import sys
sys.path.append("/home/kesci/input")
import d2lzh1981 as d2l

# Hyper-parameters and data iterators
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Network dimensions
# 28 * 28 = 784 inputs; presumably one per pixel of a 1x28x28 image, going by
# the shape noted on LinearNet.forward (confirm against the data loader).
# 10 outputs; presumably one per class label (confirm against the dataset).
num_inputs, num_outputs = 28 * 28, 10

class LinearNet(nn.Module):
    """Softmax-regression model: flatten each example, then one linear layer.

    The layer outputs raw scores (logits); no softmax is applied here.

    Args:
        num_inputs: Number of features per example after flattening.
        num_outputs: Number of output scores per example.
    """

    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)

    def forward(self, x):
        """Flatten ``x`` to (batch, -1) and apply the linear layer.

        The original author notes ``x`` has shape (batch, 1, 28, 28).
        """
        # reshape (not view) so non-contiguous inputs, e.g. transposed or
        # permuted batches, are accepted too; for contiguous inputs the
        # result is identical to view.
        return self.linear(x.reshape(x.shape[0], -1))
    
# net = LinearNet(num_inputs, num_outputs)

class FlattenLayer(nn.Module):
    """Layer that flattens every example in a batch into a 1-D vector."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` reshaped from (batch, *, *, ...) to (batch, -1)."""
        # reshape (not view) so non-contiguous inputs, e.g. transposed or
        # permuted batches, are accepted too; for contiguous inputs the
        # result is identical to view.
        return x.reshape(x.shape[0], -1)

from collections import OrderedDict

# Two named stages: flatten each example, then a single linear layer.
# An alternative is nn.Sequential(FlattenLayer(), LinearNet(num_inputs, num_outputs));
# the custom LinearNet could also stand in for nn.Linear below.
net = nn.Sequential(
    OrderedDict(
        flatten=FlattenLayer(),
        linear=nn.Linear(num_inputs, num_outputs),
    )
)

# Initialise the linear layer: weights drawn from N(0, 0.01^2), bias set to zero.
nn.init.normal_(net.linear.weight, mean=0, std=0.01)
nn.init.constant_(net.linear.bias, val=0)

# Loss function.
# Prototype: torch.nn.CrossEntropyLoss(weight=None, size_average=None,
#            ignore_index=-100, reduce=None, reduction='mean')
loss = nn.CrossEntropyLoss()

# Optimizer.
# Prototype: torch.optim.SGD(params, lr=, momentum=0, dampening=0,
#            weight_decay=0, nesterov=False)
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.1)

# Training
num_epochs = 5
# NOTE(review): the two None arguments presumably fill parameter/learning-rate
# slots that are unused when an optimizer is passed; confirm against
# d2lzh1981.train_ch3.
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, optimizer,
)