import numpy as np
class LogisticRegression:
    def __init__(self, learning_rate=0.001, max_iter=100):
        self.learning_rate = learning_rate
        self.max_iter = max_iter

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.row = self.X.shape[0]
        self.col = self.X.shape[1]
        self.theta = np.ones(self.col)  # one theta per feature; no extra constant dimension is needed because the loader already prepends a column of 1.0s to X
        self.lr = np.ones(self.col) * self.learning_rate  # learning rate (one per theta)
        for i in range(self.max_iter):
            self.gradient_descent()

    def sigmoid(self, X):
        return 1.0 / (1 + np.exp(-X))

    def H(self, X=None):
        if X is None:
            X = self.X
        theta_X = np.sum(X * self.theta, axis=1)  # vectorized over samples: returns a vector of self.row values
        return self.sigmoid(theta_X)

    def cal_gradient(self):
        # return np.mean(((self.H(self.X) - self.y) * self.X.T), axis=1)  # returns self.col gradients: the standard (averaged) form, which descends slowly at this learning rate
        return np.sum(((self.H(self.X) - self.y) * self.X.T), axis=1)  # returns self.col gradients: the summed form, equal to the mean times the sample count, so it descends faster

    def gradient_descent(self):
        temp_theta = self.cal_gradient()
        print("cal_gradient:::", temp_theta.shape)
        self.theta -= self.lr * temp_theta
def loadDataSet1(filename):
    dataMat = []
    labelMat = []
    with open(filename) as fr:
        for line in fr.readlines():
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # the leading 1.0 is the bias term
            labelMat.append(int(lineArr[2]))
    return np.array(dataMat), np.array(labelMat)

filename = r'E:\dataset\testSet.txt'  # path to the data file
def loadDataSet():  # reads the data (only two features here)
    dataMat = []
    labelMat = []
    with open(filename) as fr:
        for line in fr.readlines():
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # the leading 1.0 is the constant term: with two features X1, X2 we need three parameters, W1 + W2*X1 + W3*X2
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def plotBestFit(weights):  # plot the data and the final decision boundary
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = np.array(dataMat)
    n = np.shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]  # boundary: w0 + w1*x1 + w2*x2 = 0, solved for x2
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
if __name__ == '__main__':
    filename = r'E:\dataset\testSet.txt'  # path to the data file
    X, y = loadDataSet1(filename=filename)
    print(type(X))
    print(X.shape)
    LRC = LogisticRegression(learning_rate=0.001, max_iter=500)
    LRC.fit(X, y)
    theta = LRC.theta
    print(theta)
    plotBestFit(theta)
The version above uses batch gradient descent: every update is computed from the whole dataset at once.
'''
The difference this time: the computation is vectorized, i.e. written as matrix operations.
'''
import numpy as np
class LogisticRegression:
    def __init__(self, learning_rate=0.001, max_iter=100):
        self.learning_rate = learning_rate
        self.max_iter = max_iter

    def fit(self, X, y):
        self.X = np.mat(X)
        self.y = np.mat(y).T
        self.row = self.X.shape[0]
        self.col = self.X.shape[1]
        self.theta = np.mat(np.ones(self.col)).T  # one theta per feature, as a column matrix; the constant term is handled by the leading column of 1s in X
        for i in range(self.max_iter):
            self.gradient_descent()

    def sigmoid(self, X):
        return 1.0 / (1 + np.exp(-X))

    def H(self, X=None):
        if X is None:
            X = self.X
        theta_X = X.dot(self.theta)
        # theta_X = np.sum(X * self.theta, axis=1)  # the old element-wise form from the previous version
        return self.sigmoid(theta_X)

    def cal_gradient(self):
        # return np.mean(((self.H(self.X) - self.y) * self.X.T), axis=1)  # the standard (averaged) form, which descends slowly at this learning rate
        # return np.sum(((self.H(self.X) - self.y) * self.X.T), axis=1)   # the summed form, which descends faster
        return (self.X.T).dot(self.H(self.X) - self.y)  # the same summed gradient, as a single matrix product

    def gradient_descent(self):
        temp_theta = self.cal_gradient()
        print("cal_gradient:::", temp_theta.shape)
        print("self.theta:::", self.theta.shape)
        self.theta -= self.learning_rate * temp_theta
def loadDataSet1(filename):
    dataMat = []
    labelMat = []
    with open(filename) as fr:
        for line in fr.readlines():
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # the leading 1.0 is the bias term
            labelMat.append(int(lineArr[2]))
    return np.array(dataMat), np.array(labelMat)

filename = r'E:\dataset\testSet.txt'  # path to the data file
def loadDataSet():  # reads the data (only two features here)
    dataMat = []
    labelMat = []
    with open(filename) as fr:
        for line in fr.readlines():
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # the leading 1.0 is the constant term: with two features X1, X2 we need three parameters, W1 + W2*X1 + W3*X2
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def plotBestFit(weights):  # plot the data and the final decision boundary
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = np.array(dataMat)
    n = np.shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]  # boundary: w0 + w1*x1 + w2*x2 = 0, solved for x2
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
if __name__ == '__main__':
    filename = r'E:\dataset\testSet.txt'  # path to the data file
    X, y = loadDataSet1(filename=filename)
    print(type(X))
    print(X.shape)
    LRC = LogisticRegression(learning_rate=0.001, max_iter=500)
    LRC.fit(X, y)
    theta = LRC.theta.getA().ravel()  # convert the (3, 1) matrix back to a flat ndarray
    print(theta)
    plotBestFit(theta)
The plotting part is revised.
'''
The model is unchanged from the vectorized (matrix-operation) version above; this time only the plotting is simplified.
'''
import numpy as np
import matplotlib.pyplot as plt
class LogisticRegression:
    def __init__(self, learning_rate=0.001, max_iter=100):
        self.learning_rate = learning_rate
        self.max_iter = max_iter

    def fit(self, X, y):
        self.X = np.mat(X)
        self.y = np.mat(y).T
        self.row = self.X.shape[0]
        self.col = self.X.shape[1]
        self.theta = np.mat(np.ones(self.col)).T  # one theta per feature, as a column matrix; the constant term is handled by the leading column of 1s in X
        for i in range(self.max_iter):
            self.gradient_descent()

    def sigmoid(self, X):
        return 1.0 / (1 + np.exp(-X))

    def H(self, X=None):
        if X is None:
            X = self.X
        theta_X = X.dot(self.theta)
        return self.sigmoid(theta_X)

    def cal_gradient(self):
        return (self.X.T).dot(self.H(self.X) - self.y)  # summed gradient as a single matrix product

    def gradient_descent(self):
        temp_theta = self.cal_gradient()
        self.theta -= self.learning_rate * temp_theta
def loadDataSet1(filename):
    dataMat = []
    labelMat = []
    with open(filename) as fr:
        for line in fr.readlines():
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # the leading 1.0 is the bias term
            labelMat.append(int(lineArr[2]))
    return np.array(dataMat), np.array(labelMat)
if __name__ == '__main__':
    filename = r'E:\dataset\testSet.txt'  # path to the data file
    X, y = loadDataSet1(filename=filename)
    print(type(X))
    print(X.shape)
    LRC = LogisticRegression(learning_rate=0.001, max_iter=500)
    LRC.fit(X, y)
    theta = LRC.theta.getA().ravel()  # convert the (3, 1) matrix back to a flat ndarray
    print(theta)
    ### plotting does not need to be that elaborate
    plt.scatter(X[:, 1], X[:, 2], c=y)
    a = np.arange(-4.0, 4.0, 0.1)
    b = (-theta[0] - theta[1] * a) / theta[2]  # boundary: w0 + w1*x1 + w2*x2 = 0, solved for x2
    plt.plot(a, b)
    plt.show()