import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Logistic Regression (LR) is a classification algorithm built on top of regression; by itself it can only solve binary classification problems. With a reported appearance rate of 63.5% in Kaggle competition solutions, LR has earned the title of "most frequently used algorithm".
Let f(x) be a linear regression model; its output ranges over (-∞, +∞).
The sigmoid function squashes this output into (0, 1), so it can be interpreted as a probability.
The sigmoid function

$$\sigma(t) = \frac{1}{1 + e^{-t}}$$

Its range is (0, 1):
- when t = 0, p = 0.5
- when t > 0, p > 0.5
- when t < 0, p < 0.5
# np.exp returns e**t, where e ≈ 2.71828; sigmoid maps the linear output to a probability between 0 and 1.
def sigmoid(t):
    return 1.0 / (1 + np.exp(-t))
x = np.linspace(-10, 10, 500)
plt.plot(x, sigmoid(x))
plt.show()
Implementing logistic regression
The logistic regression model
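With $x_b$ denoting a sample's feature vector prefixed by a constant 1 (matching the intercept column added in `fit` below), the model is:

$$\hat{p} = \sigma(\theta^T \cdot x_b) = \frac{1}{1 + e^{-\theta^T \cdot x_b}}, \qquad \hat{y} = \begin{cases}1, & \hat{p} \ge 0.5 \\ 0, & \hat{p} < 0.5\end{cases}$$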
Problem: given a sample set X, y, how do we find parameters theta (via gradient descent) that best reproduce the classification output y for the samples X?
Solution: minimize a loss function, using gradient descent to find the corresponding theta.
The loss function
From the logistic regression model we can see:
- when y = 1: the smaller p̂ is, the larger the loss; the larger p̂ is, the smaller the loss.
- when y = 0: the smaller p̂ is, the smaller the loss; the larger p̂ is, the larger the loss.
The following function captures this behavior:

$$\text{cost} = \begin{cases}-\log(\hat{p}), & y = 1 \\ -\log(1 - \hat{p}), & y = 0\end{cases}$$

Merging the two cases into a single expression:

$$\text{cost} = -y\log(\hat{p}) - (1-y)\log(1-\hat{p})$$

Averaging over all m samples gives the full loss:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\big(\sigma(\theta^T \cdot x_b^{(i)})\big) + (1-y^{(i)})\log\big(1-\sigma(\theta^T \cdot x_b^{(i)})\big)\right]$$

with gradient

$$\nabla J(\theta) = \frac{1}{m}\, X_b^T \cdot \big(\sigma(X_b\,\theta) - y\big)$$

This equation has no closed-form solution, so we solve it with gradient descent.
import numpy as np

class LogisticRegression:

    def __init__(self):
        self._thetas = None
        # intercept
        self.intercept = None
        # feature coefficients
        self.coefs = None

    @staticmethod
    def sigmoid(t):
        return 1.0 / (1 + np.exp(-t))

    def j(self, x, y, theta):
        """Loss function J(theta)."""
        try:
            sig = self.sigmoid(x.dot(theta))
            return -np.sum(y.dot(np.log(sig)) + (1 - y).dot(np.log(1 - sig))) / len(x)
        except Exception:
            # If the numbers blow up and an error is raised, return infinity.
            return float('inf')

    def dj_debug(self, x, y, theta, epsilon=0.01):
        """Numerical gradient, for debugging the analytic gradient."""
        res = np.empty(len(theta))
        for i in range(len(theta)):
            theta_1 = theta.copy()
            theta_1[i] += epsilon
            theta_2 = theta.copy()
            theta_2[i] -= epsilon
            res[i] = (self.j(x, y, theta_1) - self.j(x, y, theta_2)) / (2 * epsilon)
        return res

    def dj(self, x, y, thetas):
        """Analytic gradient of J."""
        sig = self.sigmoid(x.dot(thetas))
        return x.T.dot(sig - y) / len(x)

    def gradient_descent(self, dj, x, y, initial_thetas, eta, epsilon, max_iters):
        """Gradient descent."""
        thetas = initial_thetas
        while max_iters > 0:
            gradient = dj(x, y, thetas)
            last_thetas = thetas
            thetas = thetas - eta * gradient
            if abs(self.j(x, y, thetas) - self.j(x, y, last_thetas)) < epsilon:
                break
            max_iters -= 1
        self._thetas = thetas
        self.intercept = thetas[0]
        self.coefs = thetas[1:]

    def fit(self, x_train, y_train, eta=0.01, epsilon=1e-8, max_iters=1e4, debug=False):
        """Train the model."""
        # Prepend a column of ones for the intercept term.
        X_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        initial_thetas = np.zeros(X_b.shape[1])
        if debug:
            self.gradient_descent(self.dj_debug, X_b, y_train, initial_thetas, eta, epsilon, max_iters)
        else:
            self.gradient_descent(self.dj, X_b, y_train, initial_thetas, eta, epsilon, max_iters)

    def predict(self, x_predict):
        X_b = np.hstack([np.ones((len(x_predict), 1)), x_predict])
        p = self.sigmoid(X_b.dot(self._thetas))
        p[p >= 0.5] = 1
        p[p < 0.5] = 0
        return p

    def score(self, x, y):
        """accuracy = number of correct predictions / number of samples"""
        y_predict = self.predict(x)
        return np.sum(y == y_predict) / len(y)
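As a quick sanity check (a sketch with made-up data; `X_chk`, `y_chk`, and `chk` are illustrative names, not from the original notebook), the numerical gradient from dj_debug should closely match the analytic dj:
np.random.seed(0)
X_chk = np.random.normal(size=(50, 2))                  # toy features
y_chk = (X_chk[:, 0] + X_chk[:, 1] > 0).astype(int)     # toy binary labels
chk = LogisticRegression()
X_b_chk = np.hstack([np.ones((len(X_chk), 1)), X_chk])  # add the intercept column, as fit() does
theta_chk = np.random.normal(size=3)
print(chk.dj(X_b_chk, y_chk, theta_chk))        # analytic gradient
print(chk.dj_debug(X_b_chk, y_chk, theta_chk))  # numerical approximation; the two should agree closely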
from sklearn import datasets
# Use only two iris species and the first two features
iris = datasets.load_iris()
X = iris.data
y = iris.target
X = X[y < 2, : 2]
y = y[y < 2]
# Train the model
log_reg = LogisticRegression()
log_reg.fit(X, y)
print('coefs: ', log_reg.coefs)
print('intercept: ', log_reg.intercept)
print('Score: ', log_reg.score(X, y))
coefs: [ 9.91317105 -11.51345535]
intercept: -17.935266908864193
Score: 0.99
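On a two-feature problem, the decision boundary is the line where the linear part of the model equals zero:

$$\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0 \;\Rightarrow\; x_2 = \frac{-\theta_0 - \theta_1 x_1}{\theta_2}$$

which is exactly what the plotting code below evaluates.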
plt.scatter(X[y==0,0], X[y==0,1], color="red")
plt.scatter(X[y==1,0], X[y==1,1], color="blue")
# Plot the separating line (x2 as a function of x1)
# x1 = np.arange(4, 8, 0.01)
x1 = np.linspace(4, 8, 1000)
y1 = x1 * (log_reg.coefs[0]/(-log_reg.coefs[1])) + (log_reg.intercept/-log_reg.coefs[1])
plt.plot(x1, y1)
plt.show()
Plotting decision boundaries
def plot_decision_boundary(model, axis):
    """Plot a 2D decision boundary."""
    # x axis: axis[0]~axis[1]; y axis: axis[2]~axis[3]
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)
    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # Note: contourf ignores a `linewidth` kwarg (it used to trigger a UserWarning), so it is omitted here.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
Decision boundary for two iris classes (two features)
plot_decision_boundary(log_reg, axis=[4, 7.5, 1.5, 4.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
Decision boundary for all three iris classes (two features)
# All iris classes (two features): there are three classes, and this logistic regression only handles binary classification, so we use kNN here instead
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
# Use all three iris species, with the first two features
iris = datasets.load_iris()
X_all = (iris.data)[:,:2]
y_all = iris.target
# Adjust n_neighbors to control the degree of fitting
knn = KNeighborsClassifier(n_neighbors=30)
knn.fit(X_all, y_all)
print('Score: ', knn.score(X_all, y_all))
# Plot predictions and the decision boundary
plot_decision_boundary(knn, axis=[4, 8, 1.5, 4.5])
plt.scatter(X_all[y_all==0,0], X_all[y_all==0,1])
plt.scatter(X_all[y_all==1,0], X_all[y_all==1,1])
plt.scatter(X_all[y_all==2,0], X_all[y_all==2,1])
plt.show()
Score: 0.84
Adding polynomial features to logistic regression
np.random.seed(666)
X = np.random.normal(0, 1, size=(200, 2))
y = np.array((X[:,0]**2+X[:,1]**2)<1.5, dtype='int')
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
Plain logistic regression
As shown below, the true boundary here is roughly a circle, while plain logistic regression can only produce a straight-line boundary; the fit is poor, and the score is correspondingly low.
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)
print('Score', log_reg.score(X, y))
Score 0.605
# Plot predictions and the decision boundary
plot_decision_boundary(log_reg, axis=[-4, 4, -4, 4])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
Adding polynomial features
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

def PolynomialLogisticRegression(degree):
    return Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('log_reg', LogisticRegression())
    ])
poly_log_reg = PolynomialLogisticRegression(degree=2)
poly_log_reg.fit(X, y)
print('score: ', poly_log_reg.score(X, y))
score: 0.975
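To see what the `poly` step adds, apply it to a single made-up point: a degree-2 expansion of $(x_1, x_2)$ produces the six features $1, x_1, x_2, x_1^2, x_1 x_2, x_2^2$, which is what lets the linear model downstream express a circular boundary:
# Degree-2 expansion of the made-up point (2, 3).
print(PolynomialFeatures(degree=2).fit_transform(np.array([[2., 3.]])))
# [[1. 2. 3. 4. 6. 9.]]  i.e. 1, x1, x2, x1^2, x1*x2, x2^2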
plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
Multi-class classification
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target
log_reg = LogisticRegression()
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.scatter(X[y==2,0], X[y==2,1])
plt.show()
OVR (One-vs-Rest)
Train one binary classifier per class (that class vs. all the rest), n classifiers in total, and predict the class whose classifier reports the highest probability.
from sklearn.multiclass import OneVsRestClassifier
ovr = OneVsRestClassifier(log_reg)
ovr.fit(X, y)
print('Score: ', ovr.score(X, y))
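Internally, sklearn's OneVsRestClassifier fits one binary sub-classifier per class and stores them in its `estimators_` attribute:
# Three classes -> three binary sub-classifiers.
print(len(ovr.estimators_))  # 3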
Score: 0.8066666666666666
# Plot predictions and the decision boundary
plot_decision_boundary(ovr, axis=[4, 8, 1.5, 5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.scatter(X[y==2,0], X[y==2,1])
plt.show()
OVO (One-vs-One)
Train one binary classifier for every pair of classes, n(n-1)/2 in total, and predict by majority vote; this is slower than OvR but usually more accurate.
from sklearn.multiclass import OneVsOneClassifier
ovo = OneVsOneClassifier(log_reg)
ovo.fit(X, y)
print('Score: ', ovo.score(X, y))
Score: 0.8333333333333334
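OneVsOneClassifier likewise exposes its pairwise sub-classifiers; with three classes there are C(3, 2) = 3 of them:
# One sub-classifier per class pair: C(3, 2) = 3.
print(len(ovo.estimators_))  # 3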
# Plot predictions and the decision boundary
plot_decision_boundary(ovo, axis=[4, 8, 1.5, 5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.scatter(X[y==2,0], X[y==2,1])
plt.show()