使用梯度下降的方法進行邏輯迴歸實戰：

問題說明：
這裏將建立一個邏輯迴歸模型來預測一個學生是否被大學錄取。
假設你是一所大學的管理員，想根據兩次考試的成績來決定每位申請人的錄取機會。你手上有以往申請人的歷史數據，可以把它作爲邏輯迴歸的訓練集：每一個樣本都包含申請人兩次考試的成績以及最終的錄取決定。我們要建立一個分類模型，根據考試成績估計錄取概率。

數據鏈接：
鏈接:https://pan.baidu.com/s/1-pjwe1ogk30WpzN4Qg1NZA 密碼:wqmt

完整代碼實現如下：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the data set. The file has no header row, so header=None is passed and
# the column names ('Exam1', 'Exam2', 'Admitted') are supplied through names=.
pdData = pd.read_csv(
    '/Users/hxx/Downloads/LogiReg_data.txt',
    header=None,
    names=['Exam1', 'Exam2', 'Admitted'],
    engine='python',
)
print(pdData.head())  # head() returns the first five rows by default
print(pdData.shape)  # (100, 3) for the author's data file

# Split the samples by label so each class gets its own marker and colour.
positive = pdData[pdData['Admitted'].eq(1)]
negative = pdData[pdData['Admitted'].eq(0)]

plt.figure(figsize=(10, 5))  # set up the canvas
# Each point is one applicant: (Exam1 score, Exam2 score).
plt.scatter(positive['Exam1'], positive['Exam2'],
            c='b', marker='o', label='Admitted')
plt.scatter(negative['Exam1'], negative['Exam2'],
            c='r', marker='x', label='Not Admitted')
plt.legend()  # legend explaining the two marker styles
plt.xlabel('Exam1 Score')  # x-axis label
plt.ylabel('Exam2 Score')  # y-axis label
plt.show()

# Sigmoid (logistic) function
def sigmoid(z):
    """Return 1 / (1 + e^(-z)); works element-wise on NumPy arrays."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator


# Evenly spaced integers from -10 up to (but not including) 10; 1 is arange's default step.
nums = np.arange(-10, 10)
plt.figure(figsize=(12, 4))  # set up the canvas
# Plot the sigmoid curve: nums on the x-axis, sigmoid(nums) on the y-axis.
sigmoid_values = sigmoid(nums)
plt.plot(nums, sigmoid_values, c='r')
plt.show()

# Hypothesis h_theta(x)
def model(X, theta):
    """Return the predicted probabilities sigmoid(X . theta^T).

    theta is stored as a row vector (created as shape (1, 3) in this script),
    so it is transposed before the product; with X of shape (m, 3) the result
    has shape (m, 1).
    """
    linear_part = np.dot(X, theta.T)
    return sigmoid(linear_part)

# Build the design matrix and the label vector
# Prepend a constant column named 'ones' (value 1) at position 0; it is the
# x_0 feature that multiplies the intercept theta_0.
pdData.insert(0, 'ones', 1)
# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0;
# .values returns the same NumPy array and works on old and new versions.
orig_data = pdData.values
# shape[0] is the number of rows, shape[1] the number of columns (4 here).
cols = orig_data.shape[1]
# All rows, columns 0 .. cols-2: the features (ones, Exam1, Exam2).
X = orig_data[:, 0:cols-1]
# All rows, last column only, kept 2-D: the Admitted label.
y = orig_data[:, cols-1:cols]
# One parameter per feature column, all zeros, as a row vector (1 x 3 here).
theta = np.zeros([1, X.shape[1]])
print(X)
print(y)
print(theta)

# Loss function
# This is the logistic-regression loss; for the derivation see part 4 of
# https://blog.csdn.net/hxx123520/article/details/104313032
def cost(X, y, theta):
    """Return J(theta): the mean cross-entropy loss over the rows of X.

    J = sum(-y * log(h) - (1 - y) * log(1 - h)) / len(X), with h = model(X, theta).
    """
    predicted = model(X, theta)
    # np.multiply multiplies element by element.
    positive_term = np.multiply(-y, np.log(predicted))
    negative_term = np.multiply(1 - y, np.log(1 - predicted))
    total = np.sum(positive_term - negative_term)
    return total / len(X)
# Loss at the starting point. theta is all zeros, so every prediction is
# sigmoid(0) = 0.5 and the value printed is ln(2), about 0.693.
print(cost(X,y,theta))

# Gradient of the loss
# The gradient is the partial derivative of the loss with respect to each theta_j.
def gradient(X, y, theta):
    """Return dJ/dtheta as an array with the same shape as theta.

    Component j is sum((h - y) * x_j) / len(X), where h = model(X, theta) and
    x_j is column j of X.
    """
    grad = np.zeros(theta.shape)  # same layout as theta (one row in this script)
    # Prediction minus label, flattened from a column into a 1-D vector.
    residual = (model(X, theta) - y).ravel()
    sample_count = len(X)
    for j in range(theta.size):
        # X[:, j] is feature j for every sample in the batch.
        weighted = residual * X[:, j]
        grad[0, j] = np.sum(weighted) / sample_count
    return grad
# NumPy's ravel(), flatten() and squeeze() can all reduce an array to 1-D; they differ as follows:
# ravel():   does not copy the source data unless it has to. Above, it turns the
#            (m, 1) error column into a length-m vector.
# flatten(): always returns a copy of the source data.
# squeeze(): only removes axes whose length is 1.

# Three gradient-descent variants and three stopping strategies

# The three stopping strategies
STOP_ITER = 0  # stop after a given number of iterations (one parameter update = one iteration)
STOP_COST = 1  # stop when the loss barely changes between two consecutive iterations
STOP_GRAD = 2  # stop when the gradient has become very small


# Decide whether to stop, according to the selected strategy
def stopCriterion(type, value, threshold):
    """Return a truthy value when the chosen stopping condition is met.

    type:      one of STOP_ITER, STOP_COST, STOP_GRAD.
    value:     the iteration count (STOP_ITER), the sequence of losses so far
               (STOP_COST, needs at least two entries) or the gradient (STOP_GRAD).
    threshold: iteration limit, minimum loss change, or gradient-norm bound.

    Returns None when ``type`` is none of the three constants.
    """
    if type == STOP_ITER:
        # threshold is the requested number of iterations.
        return value > threshold
    if type == STOP_COST:
        # Stop once the last two losses differ by less than threshold.
        latest, previous = value[-1], value[-2]
        return abs(latest - previous) < threshold
    if type == STOP_GRAD:
        # np.linalg.norm reduces the gradient vector/matrix to a single scalar.
        gradient_norm = np.linalg.norm(value)
        return gradient_norm < threshold
    return None

# Shuffle the samples (the author's stated aim: better generalisation)
def shuffleData(data):
    """Shuffle the rows of ``data`` IN PLACE and split off the last column.

    Returns (X, y): X is every column except the last, y is the last column
    kept as a 2-D column. Both are slices of the shuffled ``data``.
    """
    np.random.shuffle(data)  # reorders along the first axis, i.e. the rows
    features = data[:, :-1]
    labels = data[:, -1:]
    return features, labels

import time
def descent(data, theta, batchsize, stoptype, thresh, alpha):
    """Minimise the logistic loss by (mini-batch) gradient descent.

    data:      2-D array whose last column is the label; NOTE it is shuffled
               in place by shuffleData.
    theta:     initial parameters (row vector); the caller's array is not modified.
    batchsize: number of rows used for each gradient evaluation.
    stoptype:  STOP_ITER, STOP_COST or STOP_GRAD.
    thresh:    threshold that belongs to ``stoptype`` (see stopCriterion).
    alpha:     learning rate.

    Returns (theta, i - 1, costs, grad, seconds): the final parameters, the
    update counter minus one, the loss before the first update followed by the
    loss after every update, the last gradient, and the elapsed wall time.

    Raises ValueError for an unknown ``stoptype``.
    """
    if stoptype not in (STOP_ITER, STOP_COST, STOP_GRAD):
        # Previously this surfaced only after the first update, as an unbound
        # local ``value``.
        raise ValueError("unknown stoptype: {!r}".format(stoptype))

    init_time = time.time()  # used to compare the speed of the descent variants
    # Size of one epoch. Taken from the data itself instead of the module-level
    # global ``n``, so the function no longer depends on a name defined later.
    n_samples = len(data)
    i = 0  # number of parameter updates performed
    k = 0  # start row of the current batch
    X, y = shuffleData(data)
    costs = [cost(X, y, theta)]  # loss before any update

    while True:
        # Gradient on the current batch (same shape as theta).
        grad = gradient(X[k:k + batchsize], y[k:k + batchsize], theta)
        k += batchsize  # advance to the next batch
        if k >= n_samples:
            # All rows have been used: start a new epoch on reshuffled data.
            k = 0
            X, y = shuffleData(data)
        theta = theta - alpha * grad  # parameter update
        costs.append(cost(X, y, theta))  # loss on the full data set after the update
        i += 1

        if stoptype == STOP_ITER:
            value = i
        elif stoptype == STOP_COST:
            value = costs
        else:  # STOP_GRAD
            value = grad

        if stopCriterion(stoptype, value, thresh):
            break

    return theta, i - 1, costs, grad, time.time() - init_time

def runExpe(data, theta, batchsize, stoptype, thresh, alpha):
    """Run one descent experiment, print a summary line and plot the loss curve.

    Parameters are passed straight to ``descent``. Returns the fitted theta.
    """
    theta, n_iter, costs, grad, dur = descent(data, theta, batchsize, stoptype, thresh, alpha)

    # Heuristic label: more than one value above 2 in column 1 is taken to mean
    # the raw exam scores are in use, otherwise the standardised ones.
    if (data[:, 1] > 2).sum() > 1:
        name = "Original"
    else:
        name = "Scaled"

    name = name + " data - learning rate: {} -".format(alpha)

    # Compare against the actual number of rows rather than the module-level
    # global ``n`` so the label is right for any data set.
    if batchsize == len(data):
        strDescType = "Gradient"  # every sample per update: batch gradient descent
    elif batchsize == 1:
        strDescType = "Stochastic"  # one sample per update: stochastic gradient descent
    else:
        strDescType = "Mini-batch({})".format(batchsize)  # mini-batch gradient descent

    name = name + strDescType + " descent - stop :"

    if stoptype == STOP_ITER:
        strstop = "{} iterations".format(thresh)
    elif stoptype == STOP_COST:
        strstop = "cost change < {}".format(thresh)
    else:
        strstop = "gradient norm < {}".format(thresh)

    name = name + strstop
    print("*{}\nTheta: {} - Iter: {} - Last cost: {:03.2f} - Duration: {:03.2f}s ".format(name, theta, n_iter, costs[-1], dur))

    plt.subplots(figsize=(12, 4))
    plt.plot(np.arange(len(costs)), costs, 'r')
    plt.xlabel('Iterations')
    plt.ylabel('costs')
    plt.title(name.upper() + ' - Error vs. Iteration')  # upper() upper-cases the title
    plt.show()
    return theta

# Samples per gradient evaluation; 100 means all of them (the data file has 100 rows).
n = 100

# --- Compare the stopping strategies (batch gradient descent) ---
# Stop after 5000 iterations.
runExpe(orig_data, theta, batchsize=n, stoptype=STOP_ITER, thresh=5000, alpha=0.000001)

# No iteration limit: stop when the loss changes by less than the threshold.
runExpe(orig_data, theta, batchsize=n, stoptype=STOP_COST, thresh=0.000001, alpha=0.001)
# Stop when the gradient norm falls below the threshold.
runExpe(orig_data, theta, batchsize=n, stoptype=STOP_GRAD, thresh=0.05, alpha=0.001)

# --- Compare the gradient-descent variants ---
# Stochastic gradient descent: large fluctuations, poor stability.
runExpe(orig_data, theta, batchsize=1, stoptype=STOP_ITER, thresh=5000, alpha=0.001)
runExpe(orig_data, theta, batchsize=1, stoptype=STOP_ITER, thresh=15000, alpha=0.000002)

# Mini-batch gradient descent.
runExpe(orig_data, theta, batchsize=16, stoptype=STOP_ITER, thresh=15000, alpha=0.001)

#數據標準化問題，當數據發生浮動，先在數據層面上處理，先處理數據，再處理模型
from sklearn import preprocessing as pp

# Work on a copy so orig_data keeps the raw scores.
scaled_data = orig_data.copy()
# Standardise the two exam columns (columns 1 and 2) to zero mean and unit
# variance. This rescales the values; it does not change the shape of their distribution.
exam_columns = orig_data[:, 1:3]
scaled_data[:, 1:3] = pp.scale(exam_columns)

# The trailing numbers are the final losses reported by the author.
runExpe(scaled_data, theta, batchsize=n, stoptype=STOP_ITER, thresh=5000, alpha=0.001)  # 0.38
runExpe(scaled_data, theta, batchsize=n, stoptype=STOP_GRAD, thresh=0.02, alpha=0.001)  # 0.22
runExpe(scaled_data, theta, batchsize=1, stoptype=STOP_GRAD, thresh=0.002 / 5, alpha=0.001)  # 0.22
# Keep the parameters from the mini-batch run; they are used for the accuracy check.
theta = runExpe(scaled_data, theta, batchsize=16, stoptype=STOP_GRAD, thresh=0.002 * 2, alpha=0.001)

# From here on: measure the accuracy
def predict(X, theta):
    """Return a list of 0/1 labels: 1 where model(X, theta) is at least 0.5.

    A label of 1 means "admitted" in this script.
    """
    labels = []
    for probability in model(X, theta):
        labels.append(1 if probability >= 0.5 else 0)
    return labels

scaled_X = scaled_data[:, :3]  # feature columns: ones, Exam1, Exam2
y = scaled_data[:, 3]  # true labels
prediction = predict(scaled_X, theta)
# 1 where the predicted label (a) equals the true label (b), else 0.
correct = [1 if a == b else 0 for (a, b) in zip(prediction, y)]
# Percentage of correct predictions. The original used ``%`` (modulo) here,
# which matched the percentage only by coincidence with 100 samples and
# reported 0 when every prediction was right.
accuracy = 100 * sum(correct) / len(correct)
print("accuracy {0}%".format(accuracy))

2.接下來詳細介紹不同停止策略的效果（對應的代碼已在上面給出）

2.1 批量梯度下降爲例（batchsize=n=100）

停止條件爲迭代次數5000

runExpe(orig_data, theta, n, STOP_ITER, thresh=5000, alpha=0.000001)

*Original data - learning rate: 1e-06 -Gradient descent - stop :5000 iterations
Theta: [[-0.00027127  0.00705232  0.00376711]] - Iter: 5000 - Last cost: 0.63 - Duration: 0.95s

看似損失值已經穩定在最低點0.63

停止條件爲損失值

設定閾值爲0.000001，需要迭代110000次左右

runExpe(orig_data, theta, n, STOP_COST, thresh=0.000001, alpha=0.001)

*Original data - learning rate: 0.001 -Gradient descent - stop :cost change < 1e-06
Theta: [[-5.13364014  0.04771429  0.04072397]] - Iter: 109901 - Last cost: 0.38 - Duration: 20.22s

損失值最低爲0.38，似乎還可以進一步收斂

停止條件爲梯度大小

設定閾值0.05，需要迭代40000次左右

runExpe(orig_data, theta, n, STOP_GRAD, thresh=0.05, alpha=0.001)

*Original data - learning rate: 0.001 -Gradient descent - stop :gradient norm < 0.05
Theta: [[-2.37033409  0.02721692  0.01899456]] - Iter: 40045 - Last cost: 0.49 - Duration: 7.67s

損失值最小爲0.49，似乎還可以進一步收斂
綜上，基於批量梯度下降方法，上述三種停止條件得到的損失函數值分別爲0.63、0.38和0.49，迭代次數分別爲5000次、110000次和40000次。迭代次數越多，損失值越小（注意第一組實驗的學習率爲0.000001，後兩組爲0.001，因此第一組收斂得更慢）

3.對比不同的梯度下降方法

停止策略爲迭代次數

3.1 隨機梯度下降

runExpe(orig_data, theta, 1, STOP_ITER, thresh=5000, alpha=0.001)

*Original data - learning rate: 0.001 -Stochastic descent - stop :5000 iterations
Theta: [[-0.38740891  0.09055956 -0.06431339]] - Iter: 5000 - Last cost: 0.91 - Duration: 0.31s

波動非常大，迭代過程不穩定，這也是隨機梯度下降的主要缺點
嘗試降低學習率爲0.000002，增加迭代次數爲15000

runExpe(orig_data, theta, 1, STOP_ITER, thresh=15000, alpha=0.000002)

*Original data - learning rate: 2e-06 -Stochastic descent - stop :15000 iterations
Theta: [[-0.00202428  0.00981444  0.00076997]] - Iter: 15000 - Last cost: 0.63 - Duration: 0.93s

效果要好一些，損失值似乎穩定在0.63。但根據上面的結果（批量梯度下降可以達到0.38）可知，0.63並不是一個理想的損失值

3.2 小批量梯度下降

#取樣本爲16
runExpe(orig_data, theta, 16, STOP_ITER, thresh=15000, alpha=0.001)

*Original data - learning rate: 0.001 -Mini-batch(16) descent - stop :15000 iterations
Theta: [[-1.03955915  0.01512354  0.00209486]] - Iter: 15000 - Last cost: 0.60 - Duration: 1.18s

上下波動，迭代過程不穩定

對於一些數據，降低學習率之後沒有效果，迭代過程依舊不穩定
因此，可能不是模型本身的問題，而是數據本身的問題，嘗試着對數據做一些變換，此處對數據進行標準化，用標準化後的數據求解

4.數據標準化

數據標準化問題，當數據發生浮動，先在數據層面上處理，先處理數據，再處理模型

from sklearn import preprocessing as pp

# Work on a copy so orig_data keeps the raw scores.
scaled_data = orig_data.copy()
# scale() standardises columns 1 and 2 to zero mean and unit variance;
# it rescales the values but does not change the shape of their distribution.
scaled_data[:, 1:3] = pp.scale(orig_data[:, 1:3])

接下來再來看看經過標準化處理後的數據得到的損失值圖

runExpe(scaled_data, theta, n, STOP_ITER, thresh=5000, alpha=0.001)  # 0.38

*Scaled data - learning rate: 0.001 -Gradient descent - stop :5000 iterations
Theta: [[0.3080807  0.86494967 0.77367651]] - Iter: 5000 - Last cost: 0.38 - Duration: 0.98s

runExpe(scaled_data, theta, n, STOP_GRAD, thresh=0.02, alpha=0.001)  # 0.22

*Scaled data - learning rate: 0.001 -Gradient descent - stop :gradient norm < 0.02
Theta: [[1.0707921  2.63030842 2.41079787]] - Iter: 59422 - Last cost: 0.22 - Duration: 12.18s

runExpe(scaled_data, theta, 1, STOP_GRAD, thresh=0.002 / 5, alpha=0.001)  # 0.22

*Scaled data - learning rate: 0.001 -Stochastic descent - stop :gradient norm < 0.0004
Theta: [[1.1486333  2.79230152 2.56637779]] - Iter: 72596 - Last cost: 0.22 - Duration: 5.63s

runExpe(scaled_data, theta, 16, STOP_GRAD, thresh=0.002 * 2, alpha=0.001)

*Scaled data - learning rate: 0.001 -Mini-batch(16) descent - stop :gradient norm < 0.004
Theta: [[1.09946538 2.6858268  2.46623512]] - Iter: 63755 - Last cost: 0.22 - Duration: 6.40s

可以發現，數據經過標準化處理後，損失值有了明顯的改善。

機器學習實戰：邏輯迴歸+梯度下降

使用梯度下降的方法進行邏輯迴歸實戰：

2.接下來詳細的介紹一下不同的停止策略在上面代碼中已經給出

2.1 批量梯度下降爲例（batchsize=n=100）

停止條件爲迭代次數5000

停止條件爲損失值

停止條件爲梯度大小

3.對比不同的梯度下降方法

3.1 隨機梯度下降

3.2 小批量梯度下降

4.數據標準化

python gdal 安裝使用（Windows， python 3.6.8）

機器學習之線性迴歸、梯度下降以及引入正則化

機器學習實戰：邏輯迴歸+梯度下降

機器學習中多樣本情況下的正向傳播與反向傳播推導（手寫）

簡單的神經網絡

機器學習實戰：TensorFlow構建簡單卷積神經網絡

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結