介紹
本篇實現了使用 logistic regression 辨別真假鈔票。
本篇使用隨機梯度下降(SGD)算法求解模型的權重。
代碼實現
1. 加載需要的模塊
import numpy as np
import pandas as pd
from numpy import *
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
2. 使用 pandas 加載鈔票數據集的訓練集
def load_BankNodeData(path = r'./data/train.txt'):
    '''
    Load the banknote training set from a header-less CSV file.

    Columns 0-3 are read as the features and column 4 as the class label.
    A label equal to 1 stays 1; every other label value becomes -1.
    The feature array and the converted labels are printed as well.

    :param path: CSV file to read; defaults to the previously hard-coded
                 location ./data/train.txt
    :return: (trainSet, labels) - the feature array with one row per sample
             and four columns, and the label array holding 1 / -1
    '''
    df = pd.read_csv(path, header=None)
    trainSet = df[[0, 1, 2, 3]].values
    print('train set: \n', trainSet)
    # Re-encode the labels as +1 / -1.
    labels = np.where(df[4].values == 1, 1, -1)
    print('lebel values: \n', labels)
    return trainSet, labels
3. 使用隨機梯度下降算法求解
def logisticRegression_SGD(trainSet, labels, eta = 0.01, max_iter = 5000):
    '''
    Fit logistic-regression weights with stochastic gradient descent.

    Each iteration draws one training sample uniformly at random and takes a
    gradient step on the logistic loss log(1 + exp(-y * (w.x + b))) of that
    sample. Every 500 iterations the current weights are appended to the
    history. Before returning, the history is written to
    ./data/weightRecord.txt, one comma-separated line per weight.

    :param trainSet: training features; trainSet[i] is the feature vector of sample i
    :param labels: training targets, one per sample (the update rule is
                   written for labels encoded as +1 / -1)
    :param eta: learning rate (step size)
    :param max_iter: number of SGD updates to perform
    :return: (weights, weightsRecord); weights[0] is the bias and weights[1:]
             the feature coefficients; weightsRecord[i] is the recorded
             history of weights[i], useful for checking convergence
    '''
    sampleSize = len(labels)
    featureSize = len(trainSet[0]) + 1  # +1 for the bias stored at index 0
    weights = np.random.rand(featureSize)
    weightsRecord = [[w] for w in weights]
    print('initial weights: ', weights)
    count = 0
    while count < max_iter:
        # numpy's randint excludes the upper bound, so pass sampleSize itself;
        # "sampleSize - 1" would never select the last sample.
        sample = np.random.randint(0, sampleSize)
        x, y = trainSet[sample], labels[sample]
        update = logisticFunction(-y * (np.dot(weights[1:], x) + weights[0]))
        weights[1:] = weights[1:] - eta * update * (-y * x)
        weights[0] = weights[0] - eta * update * (-y)
        count += 1
        if count % 500 == 0:
            for history, w in zip(weightsRecord, weights):
                history.append(w)
    # "with" guarantees the file is closed even if a write fails.
    with open(r'./data/weightRecord.txt', 'w', encoding='utf-8') as fout:
        for history in weightsRecord:
            fout.write(','.join(str(w) for w in history) + '\n')
    return weights, weightsRecord
def logisticFunction(inputV):
    '''
    Logistic (sigmoid) function 1 / (1 + exp(-inputV)), evaluated without overflow.

    The naive formula calls exp() on a large positive number when inputV is
    very negative, which overflows and emits a RuntimeWarning. Here exp() is
    only ever applied to -|inputV|, and the algebraically equivalent form
    exp(v) / (1 + exp(v)) is used for negative inputs.

    :param inputV: scalar or array-like input of the logistic function
    :return: logistic function value(s); a NumPy scalar for scalar input,
             an array of the same shape for array input
    '''
    z = np.asarray(inputV)
    # exp() of a non-positive number cannot overflow.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] turns a 0-d result back into a NumPy scalar; real arrays pass through.
    return result[()]
4. 可視化權重的變化趨勢,觀察其是否收斂
一個判斷優化算法優劣的可靠方法就是看它是否收斂
def plotWeightTrend():
    '''
    Plot the recorded history of every weight so convergence can be inspected.

    Reads ./data/weightRecord.txt as a header-less CSV, draws one line per
    row labelled w_<row index> against the column index, and shows the figure.

    :return: None
    '''
    history = pd.read_csv(r'./data/weightRecord.txt', header=None)
    n_weights, n_snapshots = history.values.shape
    steps = range(n_snapshots)
    for idx in range(n_weights):
        plt.plot(steps, history.loc[idx], lw = 1.5, label = 'w_' + str(idx))
    plt.legend(loc = 'upper left')
    plt.show()
5. 計算模型在測試集上的表現
def preformence_BankNodeData(weights):
    '''
    Print per-sample predictions and the error rate on the banknote test set.

    Reads ./data/test.txt as a header-less CSV (columns 0-3 features,
    column 4 label), scores each row with weights[0] as the bias and
    weights[1:] as the coefficients, and predicts 1 when the logistic
    output exceeds 0.5, otherwise 0.

    :param weights: model weights
    :return: None
    '''
    frame = pd.read_csv(r'./data/test.txt', header=None)
    features = frame.loc[:][[0, 1, 2, 3]].values
    truth = frame.loc[:][4].values
    scores = np.dot(features, weights[1:]) + weights[0]
    mistakes = 0
    for idx in range(len(scores)):
        # Evaluate the sigmoid once and reuse it for the prediction and the printout.
        prob = logisticFunction(scores[idx])
        guess = np.where(prob > 0.5, 1, 0)
        print('true labels\t:',truth[idx], 'predict\t:', guess, '(', prob, ')')
        mistakes += np.where(guess != truth[idx], 1, 0)
    print('\033[1;32;40m error is', mistakes / len(truth), '\033[0m')
6. 主函數,使用logistic regression辨別真假鈔票
if __name__ == '__main__':
    # Train on the banknote data, plot the weight history, then score the test set.
    trainSet, labels = load_BankNodeData()
    weights, weights_record = logisticRegression_SGD(
        trainSet, labels, eta = 0.1, max_iter = 1500000)
    plotWeightTrend()
    preformence_BankNodeData(weights)
運行程序會得到權重的變化情況如下:
通過下圖可以看到算法在測試集上的error爲
logistic regression 二分類實例
爲了看到
原始的數據是長這樣的:
我們的目的便是使用
def load_data(path = r'./data/testSet.txt'):
    '''
    Load the two-feature demo data set from a header-less, tab-separated file.

    Columns 0-1 are read as the features and column 2 as the class label.
    A label equal to 1 stays 1; every other label value becomes -1.

    :param path: file to read; defaults to the previously hard-coded
                 location ./data/testSet.txt
    :return: (trainSet, labels) - the feature array with one row per sample
             and two columns, and the label array holding 1 / -1
    '''
    df = pd.read_table(path, header=None)
    trainSet = df[[0, 1]].values
    # Re-encode the labels as +1 / -1.
    labels = np.where(df[2].values == 1, 1, -1)
    return trainSet, labels
def fit():
    '''
    Train the model with SGD, then plot the weight history and the decision boundary.

    Loads the data with load_data(), runs logisticRegression_SGD with a
    learning rate of 0.1 for 100000 iterations, and prints the resulting weights.

    :return: None
    '''
    features, targets = load_data()
    weights, _history = logisticRegression_SGD(
        features, targets, eta = 0.1, max_iter = 100000)
    print('\033[1;32;40m weights: ', weights, '\033[0m')
    plotWeightTrend()  # weight update trend
    plot_decision_regions(features, targets, weights)  # decision boundary
def plot_decision_regions(X, y, weights, resolution = 0.02):
    '''
    Draw the predicted class regions and overlay the samples.

    A grid spanning the min/max of the first two columns of X is classified
    with predict() and drawn as a filled contour; the samples are then
    scattered on top with one marker/colour per distinct value in y.

    :param X: feature array; columns 0 and 1 are plotted
    :param y: class label of each row of X
    :param weights: trained model parameters, passed to predict()
    :param resolution: grid step used for the contour plot
    :return: None
    '''
    palette = ['red', 'blue', 'black']
    glyphs = ['o', 'x', '+']
    x1_lo, x1_hi = X[:, 0].min(), X[:, 0].max()
    x2_lo, x2_hi = X[:, 1].min(), X[:, 1].max()
    grid1, grid2 = np.meshgrid(
        np.arange(x1_lo, x1_hi, resolution),
        np.arange(x2_lo, x2_hi, resolution))
    # One row per grid point: (x1, x2).
    grid_points = np.column_stack((grid1.ravel(), grid2.ravel()))
    regions = predict(weights, grid_points).reshape(grid1.shape)
    plt.contourf(grid1, grid2, regions, alpha = 0.5)
    plt.xlim(grid1.min(), grid1.max())
    plt.ylim(grid2.min(), grid2.max())
    y = np.array(y)
    for idx, cls in enumerate(np.unique(y)):
        members = y == cls
        plt.scatter(x = X[members, 0], y = X[members, 1], marker = glyphs[idx], color = palette[idx], s = 30)
    plt.show()
def predict(weights, X):
    '''
    Classify samples with the trained weights.

    :param weights: model weights; weights[0] is used as the bias and
                    weights[1:] as the feature coefficients
    :param X: features, one sample per row
    :return: predicted class per sample: 1 where the logistic output
             exceeds 0.5, otherwise 0
    '''
    coefficients = np.asarray(weights[1:])
    scores = np.dot(X, coefficients.T) + weights[0]
    probabilities = logisticFunction(scores)
    return np.where(probabilities > 0.5, 1, 0)
if __name__ == '__main__':
    # Run the two-class demo: train, plot the weight trend, plot the boundary.
    fit()
運行程序首先得到的是權重的變化趨勢得到如下的結果圖:
然後繪製出使用 logistic regression 訓練得到的分界面: