下例以乳腺癌數據集訓練邏輯迴歸分類模型,並從多個角度評估模型。
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# 1) Load the breast-cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()
# 2) Split into train and test sets. No random_state is given, so the split
#    (and therefore every metric below) can differ between runs.
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target)
# 3) Feature engineering: standardise the features.
transformer = StandardScaler()
x_train = transformer.fit_transform(x_train)
# Reuse the statistics learned from the training set. Re-fitting the scaler on
# the test set would scale it with a different mean/variance than the one the
# model was trained with.
x_test = transformer.transform(x_test)
# 4) Logistic regression classifier.
estimator = LogisticRegression()
estimator.fit(x_train, y_train)
# 5) Model evaluation: accuracy on the held-out test set, then keep the
#    predicted labels for the hand-written metrics further down.
score = estimator.score(x_test, y_test)
print(u"準確率: \n", score)
y_predict = estimator.predict(x_test)
準確率:
0.986013986013986
/Users/cleland/.pyenv/versions/3.7.1/envs/base/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
準確率(accuracy)
accuracy = 預測正確的樣本數 / 樣本總數
def score(y_predict, y_ture):
    """Return the fraction of predictions that match the true labels.

    Args:
        y_predict: Predicted labels; compared element-wise with ``y_ture``.
        y_ture: True labels. Must expose ``.size`` (e.g. a NumPy array).
            The misspelled name is kept so keyword callers keep working.

    Returns:
        Number of matching elements divided by ``y_ture.size``.
    """
    # Compare against the argument, not a module-level ``y_test``, so the
    # function is correct for whatever label array the caller passes in.
    return sum(y_predict == y_ture) / y_ture.size
# Hand-written accuracy on the test-set predictions.
print("準確率: ", score(y_predict, y_test))
# scikit-learn's own accuracy for the same estimator and test set.
print("sklearn 準確率: ", estimator.score(x_test, y_test))
準確率: 0.986013986013986
sklearn 準確率: 0.986013986013986
混淆矩陣
- 0 - Negative(陰性,消極)
- 1 - Positive(陽性,積極)
| | 預測值_0 | 預測值_1 |
|---|---|---|
| 真實值_0 | TN | FP |
| 真實值_1 | FN | TP |
def TN(y_predict, y_true):
    """Count true negatives: positions predicted 0 whose true label is 0.

    Args:
        y_predict: Predicted labels; must support element-wise ``== 0``
            (e.g. a NumPy array).
        y_true: True labels, same length as ``y_predict``.

    Returns:
        The number of positions where both arrays equal 0.
    """
    predicted_negative = y_predict == 0
    actually_negative = y_true == 0
    return sum(predicted_negative & actually_negative)
def FN(y_predict, y_true):
    """Count false negatives: positions predicted 0 whose true label is 1.

    Args:
        y_predict: Predicted labels; must support element-wise ``== 0``
            (e.g. a NumPy array).
        y_true: True labels, same length as ``y_predict``.

    Returns:
        The number of positions where the prediction is 0 and the truth is 1.
    """
    predicted_negative = y_predict == 0
    actually_positive = y_true == 1
    return sum(predicted_negative & actually_positive)
def FP(y_predict, y_true):
    """Count false positives: positions predicted 1 whose true label is 0.

    Args:
        y_predict: Predicted labels; must support element-wise ``== 1``
            (e.g. a NumPy array).
        y_true: True labels, same length as ``y_predict``.

    Returns:
        The number of positions where the prediction is 1 and the truth is 0.
    """
    predicted_positive = y_predict == 1
    actually_negative = y_true == 0
    return sum(predicted_positive & actually_negative)
def TP(y_predict, y_true):
    """Count true positives: positions predicted 1 whose true label is 1.

    Args:
        y_predict: Predicted labels; must support element-wise ``== 1``
            (e.g. a NumPy array).
        y_true: True labels, same length as ``y_predict``.

    Returns:
        The number of positions where both arrays equal 1.
    """
    predicted_positive = y_predict == 1
    actually_positive = y_true == 1
    return sum(predicted_positive & actually_positive)
# Print each hand-computed confusion-matrix cell for the test-set predictions,
# in the same TN, FN, FP, TP order.
for cell_label, cell_counter in (
    ("TN: ", TN),
    ("FN: ", FN),
    ("FP: ", FP),
    ("TP: ", TP),
):
    print(cell_label, cell_counter(y_predict, y_test))
from sklearn.metrics import confusion_matrix
# scikit-learn's matrix for the same labels and predictions, for comparison.
print('sklearn 混淆矩陣: \n', confusion_matrix(y_test, y_predict))
TN: 51
FN: 1
FP: 1
TP: 90
sklearn 混淆矩陣:
[[51 1]
[ 1 90]]
精確度
def precision(y_predict, y_true):
    """Return precision, TP / (TP + FP), for binary 0/1 labels.

    Args:
        y_predict: Predicted labels, passed through to ``TP`` and ``FP``.
        y_true: True labels, same length as ``y_predict``.

    Returns:
        The share of positive predictions that are correct, or ``0.0`` when
        there are no positive predictions at all.
    """
    true_positives = TP(y_predict, y_true)
    predicted_positives = true_positives + FP(y_predict, y_true)
    # Check the denominator explicitly. The counts come from builtin sum()
    # over NumPy boolean masks, and dividing NumPy integers by zero yields nan
    # with a RuntimeWarning instead of raising ZeroDivisionError, so an
    # except clause would never produce the 0.0 fallback.
    if predicted_positives == 0:
        return 0.0
    return true_positives / predicted_positives
# Hand-written precision on the test-set predictions.
print("精確率: \n", precision(y_predict, y_test))
from sklearn.metrics import precision_score
# scikit-learn's precision for the same labels and predictions.
print("sklearn 精確度: \n", precision_score(y_test, y_predict))
精確率:
0.989010989010989
sklearn 精確度:
0.989010989010989
召回率
def recall(y_predict, y_true):
    """Return recall, TP / (TP + FN), for binary 0/1 labels.

    Args:
        y_predict: Predicted labels, passed through to ``TP`` and ``FN``.
        y_true: True labels, same length as ``y_predict``.

    Returns:
        The share of actual positives that were predicted positive, or
        ``0.0`` when there are no actual positives.
    """
    true_positives = TP(y_predict, y_true)
    actual_positives = true_positives + FN(y_predict, y_true)
    # Check the denominator explicitly. The counts come from builtin sum()
    # over NumPy boolean masks, and dividing NumPy integers by zero yields nan
    # with a RuntimeWarning instead of raising ZeroDivisionError, so an
    # except clause would never produce the 0.0 fallback.
    if actual_positives == 0:
        return 0.0
    return true_positives / actual_positives
# Hand-written recall on the test-set predictions.
print("召回率: \n", recall(y_predict, y_test))
from sklearn.metrics import recall_score
# scikit-learn's recall for the same labels and predictions.
print("sklearn 召回率: \n", recall_score(y_test, y_predict))
召回率:
0.989010989010989
sklearn 召回率:
0.989010989010989
F1 Score
def f1(y_predict, y_true):
    """Return the F1 score: the harmonic mean of precision and recall.

    Args:
        y_predict: Predicted labels, passed through to ``precision`` and
            ``recall``.
        y_true: True labels, same length as ``y_predict``.

    Returns:
        ``2 * P * R / (P + R)``, or ``0.0`` when precision and recall are
        both zero.
    """
    # Local names deliberately differ from sklearn's precision_score and
    # recall_score, which this file imports at module level.
    precision_value = precision(y_predict, y_true)
    recall_value = recall(y_predict, y_true)
    denominator = precision_value + recall_value
    # Explicit check: dividing NumPy floats by zero gives nan with a
    # RuntimeWarning rather than raising ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return (2 * precision_value * recall_value) / denominator
# Hand-written F1 score on the test-set predictions.
print("F1 Score: \n", f1(y_predict, y_test))
from sklearn.metrics import f1_score
# scikit-learn's F1 score for the same labels and predictions.
print("sklearn F1 Score: \n", f1_score(y_test, y_predict))
F1 Score:
0.989010989010989
sklearn F1 Score:
0.989010989010989
平衡精確率與召回率
# Per-sample scores from the fitted estimator's decision_function on the test
# set; the code below thresholds these scores at different cut-offs.
decision_scores = estimator.decision_function(x_test)
自己繪製精確率與召回率的關係
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
precisions = []
recalls = []
# Candidate thresholds sweep the observed score range in 0.1 increments
# (np.arange excludes the upper bound).
steps = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)
for step in steps:
    # Use a dedicated name so the estimator's own predictions stored in
    # y_predict are not overwritten by the thresholded labels.
    thresholded_predict = np.array(decision_scores >= step, dtype='int')
    precisions.append(precision_score(y_test, thresholded_predict))
    recalls.append(recall_score(y_test, thresholded_predict))
# Precision and recall as functions of the decision threshold.
plt.plot(steps, precisions, label='precisions')
plt.plot(steps, recalls, label='recalls')
plt.xlabel('threshold')
plt.legend(loc='lower left')
plt.show()
# Precision against recall; the axes are labelled because precision is on
# the x-axis here.
plt.plot(precisions, recalls)
plt.xlabel('precision')
plt.ylabel('recall')
plt.show()
sklearn
from sklearn.metrics import precision_recall_curve
# Let scikit-learn pick the thresholds and compute the curve. The printed
# shapes show how the three returned arrays relate in length.
precisions, recalls, steps = precision_recall_curve(y_test, decision_scores)
for array_name, array_values in (
    ('precisions', precisions),
    ('recalls', recalls),
    ('steps', steps),
):
    print(array_name, array_values.shape)
precisions (93,)
recalls (93,)
steps (92,)
# Drop the last precision/recall value so each curve has the same length as
# the threshold array before plotting against it.
for curve_label, curve_values in (('precisions', precisions), ('recalls', recalls)):
    plt.plot(steps, curve_values[:-1], label=curve_label)
plt.legend(loc='lower left')
plt.show()
ROC
from sklearn.metrics import roc_curve
# ROC curve: false-positive rate on the x-axis, true-positive rate on the
# y-axis, computed from the same decision scores.
fprs, tprs, steps = roc_curve(y_true=y_test, y_score=decision_scores)
plt.plot(fprs, tprs)
plt.show()