Python:代碼實現邏輯迴歸

import numpy as np
import pandas as pd
import math
from sklearn import datasets

class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Expects X to already contain a leading bias column (the data loaders
    prepend 1.0), so no intercept term is added here.
    """

    def __init__(self, learning_rate=0.001, max_iter=100):
        # learning_rate: gradient-descent step size
        # max_iter: number of full-batch update passes
        self.learning_rate = learning_rate
        self.max_iter = max_iter

    def fit(self, X, y):
        """Learn self.theta from features X (n, d) and binary labels y (n,)."""
        self.X = X
        self.y = y
        self.row = self.X.shape[0]
        self.col = self.X.shape[1]
        # One weight per feature; the bias weight pairs with X's 1.0 column.
        self.theta = np.ones(self.col)
        # Per-feature learning rates (all equal here).
        self.lr = np.ones(self.col) * self.learning_rate
        for _ in range(self.max_iter):
            self.gradient_descent()

    def sigmoid(self, X):
        """Logistic function, applied element-wise."""
        return 1.0 / (1 + np.exp(-X))

    def H(self, X=None):
        """Hypothesis h(x) = sigmoid(X @ theta); defaults to the training X."""
        if X is None:
            X = self.X
        theta_X = np.sum(X * self.theta, axis=1)  # row-wise dot product -> (n,)
        return self.sigmoid(theta_X)

    def cal_gradient(self):
        """Return one gradient per feature (length self.col).

        Uses the sum (not the mean) of per-sample gradients, as the original
        author chose: it effectively scales the step by n_samples, which
        converges faster on this dataset.
        """
        return np.sum((self.H(self.X) - self.y) * self.X.T, axis=1)

    def gradient_descent(self):
        """One batch update: theta <- theta - lr * gradient.

        (The original printed the gradient shape every iteration; that
        debug output has been removed.)
        """
        self.theta -= self.lr * self.cal_gradient()

def loadDataSet1(filename):
    """Load a whitespace-separated file of 'x1 x2 label' rows.

    Returns (X, y): X is an ndarray with a leading bias column of 1.0,
    y is an ndarray of integer labels.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(filename) as fr:
        for line in fr:
            lineArr = line.strip().split()
            # Prepend 1.0 as the constant/bias feature.
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return np.array(dataMat), np.array(labelMat)

filename = r'E:\dataset\testSet.txt'  # default dataset path

def loadDataSet(path=None):   # read the two-feature dataset
    """Load the dataset as plain Python lists.

    path defaults to the module-level ``filename`` so existing callers
    (``loadDataSet()``) keep working; passing an explicit path makes the
    function testable without the hard-coded file.
    """
    if path is None:
        path = filename
    dataMat = []
    labelMat = []
    # 'with' closes the handle (the original leaked it).
    with open(path) as fr:
        for line in fr:
            lineArr = line.strip().split()
            # Leading 1.0 is the constant term: w1 + w2*x1 + w3*x2.
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def plotBestFit(weights):  # plot the two classes and the fitted decision boundary
    """Scatter both classes and draw the line where w0 + w1*x1 + w2*x2 = 0.

    weights: length-3 sequence [w0, w1, w2], bias weight first (matching
    the loaders' leading 1.0 column). Loads data via loadDataSet(),
    which reads the module-level ``filename``. Blocks on plt.show().
    """
    import matplotlib.pyplot as plt
    dataMat,labelMat=loadDataSet()
    dataArr = np.array(dataMat)
    n = np.shape(dataArr)[0]
    xcord1 = []; ycord1 = []   # coordinates of class-1 points
    xcord2 = []; ycord2 = []   # coordinates of class-0 points
    for i in range(n):
        if int(labelMat[i])== 1:
            xcord1.append(dataArr[i,1])
            ycord1.append(dataArr[i,2])
        else:
            xcord2.append(dataArr[i,1])
            ycord2.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-weights[0]-weights[1]*x)/weights[2]  # solve w0 + w1*x + w2*y = 0 for y
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

if __name__ == '__main__':
    filename = r'E:\dataset\testSet.txt'  # dataset path (rebinding the module-level default)
    X,y = loadDataSet1(filename=filename)
    print(type(X))
    print(X.shape)

    LRC= LogisticRegression(learning_rate=0.001,max_iter=500)
    LRC.fit(X,y)
    theta = LRC.theta  # learned weights: [bias, w1, w2]
    print(theta)
    plotBestFit(theta)

使用的是批量梯度下降方法。

 

'''
不同之處是,本次代碼實現了運算的向量化,即矩陣運算
'''
import numpy as np
import pandas as pd
import math
from sklearn import datasets

class LogisticRegression:
    """Binary logistic regression via batch gradient descent.

    Vectorized with np.matrix: X is (n, d), y and theta are column
    matrices, so the full-batch gradient is a single matrix product.
    """

    def __init__(self, learning_rate=0.001, max_iter=100):
        # learning_rate: gradient-descent step size
        # max_iter: number of full-batch update passes
        self.learning_rate = learning_rate
        self.max_iter = max_iter

    def fit(self, X, y):
        """Learn self.theta from features X (n, d) and binary labels y (n,)."""
        self.X = np.mat(X)
        self.y = np.mat(y).T                       # labels as an (n, 1) column
        self.row = self.X.shape[0]
        self.col = self.X.shape[1]
        # (d, 1) weight column; the bias weight pairs with X's 1.0 column.
        self.theta = np.mat(np.ones(self.col)).T
        # Per-feature rates kept for parity with v1; the update uses the scalar rate.
        self.lr = np.ones(self.col) * self.learning_rate
        for _ in range(self.max_iter):
            self.gradient_descent()

    def sigmoid(self, X):
        """Logistic function, applied element-wise."""
        return 1.0 / (1 + np.exp(-X))

    def H(self, X=None):
        """Hypothesis sigmoid(X @ theta); defaults to the training X."""
        if X is None:
            X = self.X
        return self.sigmoid(X.dot(self.theta))

    def cal_gradient(self):
        """Return the (d, 1) batch gradient X^T (h - y).

        Sum (not mean) over samples, matching the author's faster-converging
        variant: the effective step is scaled by n_samples.
        """
        return (self.X.T).dot(self.H(self.X) - self.y)

    def gradient_descent(self):
        """One update: theta <- theta - lr * X^T (h - y).

        (Per-iteration shape-debug prints from the original were removed —
        the later revision of this code comments them out as well.)
        """
        self.theta -= self.learning_rate * self.cal_gradient()

def loadDataSet1(filename):
    """Read 'x1 x2 label' rows into (X, y) ndarrays.

    X gains a leading bias column of 1.0; y holds the integer labels.
    """
    dataMat = []
    labelMat = []
    # Context manager closes the file (the original never closed it).
    with open(filename) as fr:
        for line in fr:
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # 1.0 = bias feature
            labelMat.append(int(lineArr[2]))
    return np.array(dataMat), np.array(labelMat)

filename = r'E:\dataset\testSet.txt'  # default dataset path

def loadDataSet(path=None):   # read the two-feature dataset
    """Load the dataset as plain Python lists.

    path defaults to the module-level ``filename``, keeping the original
    zero-argument call working while allowing an explicit file for tests.
    """
    if path is None:
        path = filename
    dataMat = []
    labelMat = []
    # 'with' closes the handle (the original leaked it).
    with open(path) as fr:
        for line in fr:
            lineArr = line.strip().split()
            # Leading 1.0 is the constant term: w1 + w2*x1 + w3*x2.
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def plotBestFit(weights):
    """Scatter both classes and draw the boundary w0 + w1*x1 + w2*x2 = 0.

    weights: length-3 sequence [w0, w1, w2], bias weight first. Data is
    read through loadDataSet() (module-level ``filename``); blocks on
    plt.show().
    """
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = np.array(dataMat)
    n = np.shape(dataArr)[0]
    # Split sample indices by class instead of accumulating four lists.
    pos = [i for i in range(n) if int(labelMat[i]) == 1]
    neg = [i for i in range(n) if int(labelMat[i]) != 1]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter([dataArr[i, 1] for i in pos], [dataArr[i, 2] for i in pos],
               s=30, c='red', marker='s')
    ax.scatter([dataArr[i, 1] for i in neg], [dataArr[i, 2] for i in neg],
               s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    # Solve w0 + w1*x + w2*y = 0 for y to get the boundary line.
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

if __name__ == '__main__':
    filename = r'E:\dataset\testSet.txt'  # dataset path (rebinding the module-level default)
    X,y = loadDataSet1(filename=filename)
    print(type(X))
    print(X.shape)

    LRC= LogisticRegression(learning_rate=0.001,max_iter=500)
    LRC.fit(X,y)
    theta = LRC.theta.getA()  # np.matrix -> plain ndarray for plotting
    print(theta)
    plotBestFit(theta)

畫圖部分修改

'''
不同之處是,本次代碼實現了運算的向量化,即矩陣運算
'''
import numpy as np
import pandas as pd
import math
from sklearn import datasets
import matplotlib.pyplot as plt

class LogisticRegression:
    """Batch-gradient-descent logistic regression, vectorized via np.matrix.

    X is (n, d) with a bias column supplied by the caller; y and theta are
    kept as column matrices so each update is one matrix product.
    """

    def __init__(self, learning_rate=0.001, max_iter=100):
        # Step size and number of full-batch passes.
        self.learning_rate = learning_rate
        self.max_iter = max_iter

    def fit(self, X, y):
        """Learn self.theta from features X and binary labels y."""
        self.X = np.mat(X)
        self.y = np.mat(y).T                       # (n, 1) label column
        self.row, self.col = self.X.shape
        # Start all weights at 1; the bias weight pairs with X's 1.0 column.
        self.theta = np.mat(np.ones(self.col)).T
        # Per-dimension rates (unused by the update; kept for compatibility).
        self.lr = np.ones(self.col) * self.learning_rate
        step = 0
        while step < self.max_iter:
            self.gradient_descent()
            step += 1

    def sigmoid(self, X):
        """Element-wise logistic transform."""
        return 1.0 / (1 + np.exp(-X))

    def H(self, X=None):
        """sigmoid(X @ theta), falling back to the training matrix."""
        data = self.X if X is None else X
        return self.sigmoid(data.dot(self.theta))

    def cal_gradient(self):
        """Full-batch gradient X^T (h - y), summed (not averaged) over samples."""
        residual = self.H(self.X) - self.y
        return self.X.T.dot(residual)

    def gradient_descent(self):
        """Take one descent step on theta."""
        self.theta = self.theta - self.learning_rate * self.cal_gradient()

def loadDataSet1(filename):
    """Parse 'x1 x2 label' rows into ndarrays (X, y).

    X carries a leading bias column of 1.0; y holds integer labels.
    """
    dataMat = []
    labelMat = []
    # 'with' ensures the file is closed (the original leaked the handle).
    with open(filename) as fr:
        for line in fr:
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # 1.0 = bias feature
            labelMat.append(int(lineArr[2]))
    return np.array(dataMat), np.array(labelMat)

filename = r'E:\dataset\testSet.txt'  # default dataset path

def loadDataSet(path=None):   # read the two-feature dataset
    """Load the dataset as plain Python lists.

    path defaults to the module-level ``filename`` so the original
    zero-argument call still works; an explicit path enables testing.
    """
    if path is None:
        path = filename
    dataMat = []
    labelMat = []
    # 'with' closes the handle (the original leaked it).
    with open(path) as fr:
        for line in fr:
            lineArr = line.strip().split()
            # Leading 1.0 is the constant term: w1 + w2*x1 + w3*x2.
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

if __name__ == '__main__':
    filename = r'E:\dataset\testSet.txt'  # dataset path (rebinding the module-level default)
    X,y = loadDataSet1(filename=filename)
    print(type(X))
    print(X.shape)

    LRC= LogisticRegression(learning_rate=0.001,max_iter=500)
    LRC.fit(X,y)
    theta = LRC.theta.getA()  # np.matrix -> plain ndarray, shape (3, 1)
    print(theta)

    ### plotting directly here is simpler than a dedicated helper
    plt.scatter(X[:,1],X[:,2],c=y)  # color each point by its label
    a = np.arange(-4.0,4.0,0.1)
    # Decision boundary: solve theta0 + theta1*a + theta2*b = 0 for b.
    b = (-theta[0]-theta[1]*a)/theta[2]
    plt.plot(a,b)

    plt.show()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章