import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Logistic Regression (LR) is a classification algorithm built on top of regression; by itself it can only solve binary classification problems. With a reported appearance rate of 63.5% in Kaggle competition solutions, LR has earned the title of "most frequently used algorithm".
Let f(x) be a linear regression model; its output ranges over (-∞, +∞).
The sigmoid function squashes this output into (0, 1), so it can be interpreted as a probability.
The sigmoid function

$$\sigma(t) = \frac{1}{1 + e^{-t}}$$

Its range is (0, 1):
- when t = 0, p = 0.5
- when t > 0, p > 0.5
- when t < 0, p < 0.5
# np.exp returns e**t, where e ≈ 2.71828; sigmoid maps the linear output to a probability between 0 and 1.
def sigmoid(t):
    return 1.0 / (1 + np.exp(-t))
x = np.linspace(-10, 10, 500)
plt.plot(x, sigmoid(x))
plt.show()
Implementing logistic regression
The logistic regression model
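With $x_b$ denoting a sample's feature vector prefixed by a constant 1 (matching the intercept column added in `fit` below), the model is:

$$\hat{p} = \sigma(\theta^T \cdot x_b) = \frac{1}{1 + e^{-\theta^T \cdot x_b}}, \qquad \hat{y} = \begin{cases}1, & \hat{p} \ge 0.5 \\ 0, & \hat{p} < 0.5\end{cases}$$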
Problem: given a sample set X, y, how do we find parameters theta (via gradient descent) that best reproduce the classification output y for the samples X?
Solution: minimize a loss function, using gradient descent to find the corresponding theta.
The loss function
From the logistic regression model we can see:
- when y = 1: the smaller p̂ is, the larger the loss; the larger p̂ is, the smaller the loss.
- when y = 0: the smaller p̂ is, the smaller the loss; the larger p̂ is, the larger the loss.
The following function captures this behavior:

$$\text{cost} = \begin{cases}-\log(\hat{p}), & y = 1 \\ -\log(1 - \hat{p}), & y = 0\end{cases}$$

Merging the two cases into a single expression:

$$\text{cost} = -y\log(\hat{p}) - (1-y)\log(1-\hat{p})$$

Averaging over all m samples gives the full loss:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\big(\sigma(\theta^T \cdot x_b^{(i)})\big) + (1-y^{(i)})\log\big(1-\sigma(\theta^T \cdot x_b^{(i)})\big)\right]$$

with gradient

$$\nabla J(\theta) = \frac{1}{m}\, X_b^T \cdot \big(\sigma(X_b\,\theta) - y\big)$$

This equation has no closed-form solution, so we solve it with gradient descent.
import numpy as np

class LogisticRegression:

    def __init__(self):
        self._thetas = None
        # intercept
        self.intercept = None
        # feature coefficients
        self.coefs = None

    @staticmethod
    def sigmoid(t):
        return 1.0 / (1 + np.exp(-t))

    def j(self, x, y, theta):
        """Loss function J(theta)."""
        try:
            sig = self.sigmoid(x.dot(theta))
            return -np.sum(y.dot(np.log(sig)) + (1 - y).dot(np.log(1 - sig))) / len(x)
        except Exception:
            # If the numbers blow up and an error is raised, return infinity.
            return float('inf')

    def dj_debug(self, x, y, theta, epsilon=0.01):
        """Numerical gradient, for debugging the analytic gradient."""
        res = np.empty(len(theta))
        for i in range(len(theta)):
            theta_1 = theta.copy()
            theta_1[i] += epsilon
            theta_2 = theta.copy()
            theta_2[i] -= epsilon
            res[i] = (self.j(x, y, theta_1) - self.j(x, y, theta_2)) / (2 * epsilon)
        return res

    def dj(self, x, y, thetas):
        """Analytic gradient of J."""
        sig = self.sigmoid(x.dot(thetas))
        return x.T.dot(sig - y) / len(x)

    def gradient_descent(self, dj, x, y, initial_thetas, eta, epsilon, max_iters):
        """Gradient descent."""
        thetas = initial_thetas
        while max_iters > 0:
            gradient = dj(x, y, thetas)
            last_thetas = thetas
            thetas = thetas - eta * gradient
            if abs(self.j(x, y, thetas) - self.j(x, y, last_thetas)) < epsilon:
                break
            max_iters -= 1
        self._thetas = thetas
        self.intercept = thetas[0]
        self.coefs = thetas[1:]

    def fit(self, x_train, y_train, eta=0.01, epsilon=1e-8, max_iters=1e4, debug=False):
        """Train the model."""
        # Prepend a column of ones for the intercept term.
        X_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        initial_thetas = np.zeros(X_b.shape[1])
        if debug:
            self.gradient_descent(self.dj_debug, X_b, y_train, initial_thetas, eta, epsilon, max_iters)
        else:
            self.gradient_descent(self.dj, X_b, y_train, initial_thetas, eta, epsilon, max_iters)

    def predict(self, x_predict):
        X_b = np.hstack([np.ones((len(x_predict), 1)), x_predict])
        p = self.sigmoid(X_b.dot(self._thetas))
        p[p >= 0.5] = 1
        p[p < 0.5] = 0
        return p

    def score(self, x, y):
        """accuracy = number of correct predictions / number of samples"""
        y_predict = self.predict(x)
        return np.sum(y == y_predict) / len(y)
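As a quick sanity check (a sketch with made-up data; `X_chk`, `y_chk`, and `chk` are illustrative names, not from the original notebook), the numerical gradient from dj_debug should closely match the analytic dj:
np.random.seed(0)
X_chk = np.random.normal(size=(50, 2))                  # toy features
y_chk = (X_chk[:, 0] + X_chk[:, 1] > 0).astype(int)     # toy binary labels
chk = LogisticRegression()
X_b_chk = np.hstack([np.ones((len(X_chk), 1)), X_chk])  # add the intercept column, as fit() does
theta_chk = np.random.normal(size=3)
print(chk.dj(X_b_chk, y_chk, theta_chk))        # analytic gradient
print(chk.dj_debug(X_b_chk, y_chk, theta_chk))  # numerical approximation; the two should agree closely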
from sklearn import datasets
# Use only two iris species and the first two features
iris = datasets.load_iris()
X = iris.data
y = iris.target
X = X[y < 2, : 2]
y = y[y < 2]
# Train the model
log_reg = LogisticRegression()
log_reg.fit(X, y)
print('coefs: ', log_reg.coefs)
print('intercept: ', log_reg.intercept)
print('Score: ', log_reg.score(X, y))
coefs: [ 9.91317105 -11.51345535]
intercept: -17.935266908864193
Score: 0.99
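On a two-feature problem, the decision boundary is the line where the linear part of the model equals zero:

$$\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0 \;\Rightarrow\; x_2 = \frac{-\theta_0 - \theta_1 x_1}{\theta_2}$$

which is exactly what the plotting code below evaluates.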
plt.scatter(X[y==0,0], X[y==0,1], color="red")
plt.scatter(X[y==1,0], X[y==1,1], color="blue")
# Plot the separating line (x2 as a function of x1)
# x1 = np.arange(4, 8, 0.01)
x1 = np.linspace(4, 8, 1000)
y1 = x1 * (log_reg.coefs[0]/(-log_reg.coefs[1])) + (log_reg.intercept/-log_reg.coefs[1])
plt.plot(x1, y1)
plt.show()
Plotting decision boundaries
def plot_decision_boundary(model, axis):
    """Plot a 2D decision boundary."""
    # x axis: axis[0]~axis[1]; y axis: axis[2]~axis[3]
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)
    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # Note: contourf ignores a `linewidth` kwarg (it used to trigger a UserWarning), so it is omitted here.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
Decision boundary for two iris classes (two features)
plot_decision_boundary(log_reg, axis=[4, 7.5, 1.5, 4.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
Decision boundary for all three iris classes (two features)
# All iris classes (two features): there are three classes, and this logistic regression only handles binary classification, so we use kNN here instead
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
# Use all three iris species, with the first two features
iris = datasets.load_iris()
X_all = (iris.data)[:,:2]
y_all = iris.target
# Adjust n_neighbors to control the degree of fitting
knn = KNeighborsClassifier(n_neighbors=30)
knn.fit(X_all, y_all)
print('Score: ', knn.score(X_all, y_all))
# Plot predictions and the decision boundary
plot_decision_boundary(knn, axis=[4, 8, 1.5, 4.5])
plt.scatter(X_all[y_all==0,0], X_all[y_all==0,1])
plt.scatter(X_all[y_all==1,0], X_all[y_all==1,1])
plt.scatter(X_all[y_all==2,0], X_all[y_all==2,1])
plt.show()
Score: 0.84
Adding polynomial features to logistic regression
np.random.seed(666)
X = np.random.normal(0, 1, size=(200, 2))
y = np.array((X[:,0]**2+X[:,1]**2)<1.5, dtype='int')
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
Plain logistic regression
As shown below, the true boundary here is roughly a circle, while plain logistic regression can only produce a straight-line boundary; the fit is poor, and the score is correspondingly low.
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)
print('Score', log_reg.score(X, y))
Score 0.605
# Plot predictions and the decision boundary
plot_decision_boundary(log_reg, axis=[-4, 4, -4, 4])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
Adding polynomial features
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

def PolynomialLogisticRegression(degree):
    return Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('log_reg', LogisticRegression())
    ])
poly_log_reg = PolynomialLogisticRegression(degree=2)
poly_log_reg.fit(X, y)
print('score: ', poly_log_reg.score(X, y))
score: 0.975
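To see what the `poly` step adds, apply it to a single made-up point: a degree-2 expansion of $(x_1, x_2)$ produces the six features $1, x_1, x_2, x_1^2, x_1 x_2, x_2^2$, which is what lets the linear model downstream express a circular boundary:
# Degree-2 expansion of the made-up point (2, 3).
print(PolynomialFeatures(degree=2).fit_transform(np.array([[2., 3.]])))
# [[1. 2. 3. 4. 6. 9.]]  i.e. 1, x1, x2, x1^2, x1*x2, x2^2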
plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
Multi-class classification
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target
log_reg = LogisticRegression()
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.scatter(X[y==2,0], X[y==2,1])
plt.show()
OVR (One-vs-Rest)
Train one binary classifier per class (that class vs. all the rest), n classifiers in total, and predict the class whose classifier reports the highest probability.
from sklearn.multiclass import OneVsRestClassifier
ovr = OneVsRestClassifier(log_reg)
ovr.fit(X, y)
print('Score: ', ovr.score(X, y))
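Internally, sklearn's OneVsRestClassifier fits one binary sub-classifier per class and stores them in its `estimators_` attribute:
# Three classes -> three binary sub-classifiers.
print(len(ovr.estimators_))  # 3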
Score: 0.8066666666666666
# Plot predictions and the decision boundary
plot_decision_boundary(ovr, axis=[4, 8, 1.5, 5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.scatter(X[y==2,0], X[y==2,1])
plt.show()
OVO (One-vs-One)
Train one binary classifier for every pair of classes, n(n-1)/2 in total, and predict by majority vote; this is slower than OvR but usually more accurate.
from sklearn.multiclass import OneVsOneClassifier
ovo = OneVsOneClassifier(log_reg)
ovo.fit(X, y)
print('Score: ', ovo.score(X, y))
Score: 0.8333333333333334
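OneVsOneClassifier likewise exposes its pairwise sub-classifiers; with three classes there are C(3, 2) = 3 of them:
# One sub-classifier per class pair: C(3, 2) = 3.
print(len(ovo.estimators_))  # 3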
# Plot predictions and the decision boundary
plot_decision_boundary(ovo, axis=[4, 8, 1.5, 5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.scatter(X[y==2,0], X[y==2,1])
plt.show()