隨機森林案例一:宮頸癌預測

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import label_binarize
from sklearn import metrics
# Plot configuration: select the SimHei font (presumably so the Chinese
# labels used in the figures render) and turn off the Unicode minus glyph.
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Column names of the data set: 32 feature columns followed by the
# 4 target columns (Hinselmann, Schiller, Citology, Biopsy).
names = [
    'Age',
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes',
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives',
    'Hormonal Contraceptives (years)',
    'IUD',
    'IUD (years)',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:HIV',
    'STDs:Hepatitis B',
    'STDs:HPV',
    'STDs: Number of diagnosis',
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
    'Dx:Cancer',
    'Dx:CIN',
    'Dx:HPV',
    'Dx',
    'Hinselmann',
    'Schiller',
    'Citology',
    'Biopsy',
]

# Location of the data file.
path = "datas/risk_factors_cervical_cancer.csv"
data = pd.read_csv(path)

# Everything except the last four columns is a feature; the last four are
# the targets (a random forest can handle several target variables at once).
feature_cols, target_cols = names[:-4], names[-4:]
X = data[feature_cols]
Y = data[target_cols]
X.head(1)
Age Number of sexual partners First sexual intercourse Num of pregnancies Smokes Smokes (years) Smokes (packs/year) Hormonal Contraceptives Hormonal Contraceptives (years) IUD ... STDs:HIV STDs:Hepatitis B STDs:HPV STDs: Number of diagnosis STDs: Time since first diagnosis STDs: Time since last diagnosis Dx:Cancer Dx:CIN Dx:HPV Dx
0 18 4.0 15.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0 ? ? 0 0 0 0

1 rows × 32 columns

# Missing-value handling: unknown entries appear as "?" in the data, so they
# are turned into NaN first. np.nan is the canonical spelling; the upper-case
# np.NAN alias is not available in NumPy 2.x.
X = X.replace("?", np.nan)
# SimpleImputer fills the missing entries; with its default strategy each
# NaN is replaced by the mean of its column/feature.
imputer = SimpleImputer(missing_values=np.nan)
X = imputer.fit_transform(X, Y)
# Train/test split: 20% of the rows are held out, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("訓練樣本數量:%d,特徵屬性數目:%d,目標屬性數目:%d" % (x_train.shape[0], x_train.shape[1], y_train.shape[1]))
print("測試樣本數量:%d" % x_test.shape[0])
訓練樣本數量:686,特徵屬性數目:32,目標屬性數目:4
測試樣本數量:172
# Feature scaling. Original author's note: classification models commonly
# use MinMaxScaler normalisation, regression models StandardScaler.
ss = MinMaxScaler()
# Learn the per-feature range on the training split only, then apply the
# same mapping to both splits.
ss.fit(x_train, y_train)
x_train, x_test = ss.transform(x_train), ss.transform(x_test)
x_train.shape
(686, 32)
# Dimensionality reduction: project the scaled features onto 2 principal
# components. The projection is fitted on the training split and reused
# unchanged for the test split.
pca = PCA(n_components=2)
x_train, x_test = pca.fit_transform(x_train), pca.transform(x_test)
# Share of the variance captured by each retained component.
print(pca.explained_variance_ratio_)
[0.24021831 0.2067443 ]
# Random forest model. Original author's note: max_depth should generally
# not be set too large, so that each tree acts as a weak classifier.
forest_params = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=1,
    random_state=0,
)
forest = RandomForestClassifier(**forest_params)
forest.fit(x_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=1, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
# Model evaluation on the held-out split.
score = forest.score(x_test, y_test)
print("準確率:%.2f%%" % (score * 100))

# Predicted class probabilities; indexed below by target position.
forest_y_score = forest.predict_proba(x_test)

# ROC points for each of the four targets. The true labels are one-hot
# encoded against classes (0, 1, 2), the last column is dropped, and both
# the remaining label columns and the probability columns are flattened,
# so each curve pools the two classes together.
# NOTE(review): this is not the usual positive-class-only ROC; confirm the
# pooled form is what is wanted before comparing AUCs with other work.
roc_points = []
for target_idx, target_name in enumerate(names[-4:]):
    onehot = label_binarize(y_test[target_name], classes=(0, 1, 2))
    truth_flat = onehot[:, :-1].ravel()
    proba_flat = forest_y_score[target_idx].ravel()
    fpr, tpr, _ = metrics.roc_curve(truth_flat, proba_flat)
    roc_points.append((fpr, tpr))

(
    (forest_fpr1, forest_tpr1),
    (forest_fpr2, forest_tpr2),
    (forest_fpr3, forest_tpr3),
    (forest_fpr4, forest_tpr4),
) = roc_points

# Area under each curve.
auc1, auc2, auc3, auc4 = (metrics.auc(fpr, tpr) for fpr, tpr in roc_points)

auc_report = (
    ("Hinselmann目標屬性AUC值:", auc1),
    ("Schiller目標屬性AUC值:", auc2),
    ("Citology目標屬性AUC值:", auc3),
    ("Biopsy目標屬性AUC值:", auc4),
)
for caption, value in auc_report:
    print(caption, value)
準確率:89.53%
Hinselmann目標屬性AUC值: 0.9901974040021634
Schiller目標屬性AUC值: 0.9559221200648998
Citology目標屬性AUC值: 0.9637979989183343
Biopsy目標屬性AUC值: 0.9568685776095187
# Exploratory cell: show the probability row predicted for the first test
# sample on the first target. The commented-out lines are alternative
# inspection calls that were left disabled.
# label_binarize(y_test[names[-4]],classes=(0,1,2)).T[0:-1].T
forest_y_score[0][0]
# label_binarize(y_test[names[-4]],classes=(0,1,2))
# # y_test[names[-4]].value_counts()
array([0.95877997, 0.04122003])
# Toy illustration of the encoding used for the ROC inputs: one-hot encode
# against three classes, drop the last class column, then flatten row by row.
demo_onehot = label_binarize(['a', 'a', 'b', 'b'], classes=('a', 'b', 'c'))
demo_onehot[:, :-1].ravel()
array([1, 0, 1, 0, 0, 1, 0, 1])
# Ground truth for the first target: one-hot encode against (0, 1, 2),
# discard the last class column and flatten to one dimension.
first_target_onehot = label_binarize(y_test[names[-4]], classes=(0, 1, 2))
y_true = first_target_onehot[:, :-1].ravel()
# Predictions: take the probabilities of the first target and flatten them
# to a one-dimensional array in the same order.
y_predict = forest_y_score[0].ravel()
# Resulting (fpr, tpr, thresholds) triple.
metrics.roc_curve(y_true, y_predict)
(array([0.        , 0.        , 0.        , 0.        , 0.00581395,
        0.00581395, 0.00581395, 0.00581395, 0.00581395, 0.00581395,
        0.00581395, 0.00581395, 0.01162791, 0.01162791, 0.01162791,
        0.01162791, 0.01162791, 0.01162791, 0.01162791, 0.01744186,
        0.03488372, 0.04651163, 0.0755814 , 0.11046512, 0.14534884,
        0.25      , 0.25581395, 0.26744186, 0.29069767, 0.46511628,
        0.47093023, 0.49418605, 0.59302326, 0.72093023, 0.76162791,
        0.90697674, 1.        ]),
 array([0.        , 0.09302326, 0.23837209, 0.27906977, 0.40697674,
        0.50581395, 0.52906977, 0.53488372, 0.70930233, 0.73255814,
        0.74418605, 0.75      , 0.85465116, 0.88953488, 0.9244186 ,
        0.95348837, 0.96511628, 0.98255814, 0.98837209, 0.98837209,
        0.98837209, 0.98837209, 0.98837209, 0.98837209, 0.98837209,
        0.99418605, 0.99418605, 0.99418605, 0.99418605, 0.99418605,
        0.99418605, 0.99418605, 0.99418605, 1.        , 1.        ,
        1.        , 1.        ]),
 array([1.96018516, 0.96018516, 0.95978206, 0.95913181, 0.95877997,
        0.95824078, 0.95816479, 0.95762665, 0.9576256 , 0.9562422 ,
        0.95518281, 0.95245107, 0.94747819, 0.94710051, 0.92922344,
        0.90681738, 0.89071176, 0.88653306, 0.87486365, 0.12513635,
        0.11346694, 0.10928824, 0.07161788, 0.05409745, 0.05265032,
        0.05252181, 0.04754893, 0.04481719, 0.0437578 , 0.0423744 ,
        0.04237335, 0.04183521, 0.04175922, 0.04122003, 0.04086819,
        0.04021794, 0.03981484]))
# Exploratory cell: look at the first target column (names[-4]) of the test
# split and count how often each label occurs.
y_test[names[-4]] # actual values of the first target attribute
y_test[names[-4]].value_counts()

0    170
1      2
Name: Hinselmann, dtype: int64
# Exploratory cell: check how the predict_proba result is organised.
print(len(forest_y_score)) # a length of 4 corresponds to the number of target attributes
forest_y_score[0][0] # predicted values for the first target attribute (first test row)
# forest_y_score[0].ravel()
4





array([0.95877997, 0.04122003])
# Exploratory cell: one-hot encode the first target against classes
# (0, 1, 2); the result is not stored.
label_binarize(y_test[names[-4]],classes=(0,1,2))
# Show the raw target column for comparison.
y_test[names[-4]]
144    0
774    0
263    0
788    0
847    0
      ..
156    0
597    0
624    0
50     0
823    0
Name: Hinselmann, Length: 172, dtype: int64
## 8. Plot the ROC curves of the four targets in a single figure.
plt.figure(figsize=(8, 6), facecolor='w')
roc_series = (
    (forest_fpr1, forest_tpr1, 'r', u'Hinselmann目標屬性,AUC=%.3f' % auc1),
    (forest_fpr2, forest_tpr2, 'b', u'Schiller目標屬性,AUC=%.3f' % auc2),
    (forest_fpr3, forest_tpr3, 'g', u'Citology目標屬性,AUC=%.3f' % auc3),
    (forest_fpr4, forest_tpr4, 'y', u'Biopsy目標屬性,AUC=%.3f' % auc4),
)
for fpr, tpr, color, caption in roc_series:
    plt.plot(fpr, tpr, c=color, lw=2, label=caption)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot((0, 1), (0, 1), c='#a0a0a0', lw=2, ls='--')
# Slightly widened limits so curves running along the axes stay visible.
plt.xlim(-0.001, 1.001)
plt.ylim(-0.001, 1.001)
plt.xticks(np.arange(0, 1.1, 0.1))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xlabel('False Positive Rate(FPR)', fontsize=16)
plt.ylabel('True Positive Rate(TPR)', fontsize=16)
# The visibility flag is passed positionally: the old `b=` keyword was
# renamed to `visible` and is rejected by current Matplotlib releases.
plt.grid(True, ls=':')
plt.legend(loc='lower right', fancybox=True, framealpha=0.8, fontsize=12)
plt.title(u'隨機森林多目標屬性分類ROC曲線', fontsize=18)
plt.show()

在這裏插入圖片描述

# Compare random-forest accuracy for different tree counts and maximum depths.
# Original author's note: a forest usually starts from 100 trees of depth 1
# and is tuned further only if needed.
x_train2, x_test2, y_train2, y_test2 = train_test_split(X, Y, test_size=0.5, random_state=0)
print("訓練樣本數量%d,測試樣本數量:%d" % (x_train2.shape[0], x_test2.shape[0]))

## Grid of settings to compare
estimators = [1, 50, 100, 500]
depth = [1, 2, 3, 7, 15]
x1, x2 = np.meshgrid(estimators, depth)  # not used again in the visible code
err_list = []  # one row per tree count, one entry per depth
for n_trees in estimators:
    errs_for_count = []
    for max_d in depth:
        model = RandomForestClassifier(
            n_estimators=n_trees,
            criterion='gini',
            max_depth=max_d,
            max_features=None,
            random_state=0,
        )
        accuracy = model.fit(x_train2, y_train2).score(x_test2, y_test2)
        errs_for_count.append(1 - accuracy)
        print("%d決策樹數目,%d最大深度,正確率:%.2f%%" % (n_trees, max_d, accuracy * 100))
    err_list.append(errs_for_count)


## Plot error rate against depth, one line per tree count
plt.figure(facecolor='w')
colors = ['r', 'b', 'g', 'y']
lw = [1, 2, 4, 3]
# Running extremes of the error rate, used for the y-axis limits below.
max_err = 0
min_err = 100
for n_trees, errs, color, width in zip(estimators, err_list, colors, lw):
    plt.plot(depth, errs, c=color, lw=width, label=u'樹數目:%d' % n_trees)
    max_err = max(max(errs), max_err)
    min_err = min(min(errs), min_err)
plt.xlabel(u'樹深度', fontsize=16)
plt.ylabel(u'錯誤率', fontsize=16)
plt.legend(loc='upper left', fancybox=True, framealpha=0.8, fontsize=12)
plt.grid(True)
plt.xlim(min(depth), max(depth))
# Leave a 1% margin below and above the observed error range.
plt.ylim(min_err * 0.99, max_err * 1.01)
plt.title(u'隨機森林中樹數目、深度和錯誤率的關係圖', fontsize=18)
plt.show()
訓練樣本數量429,測試樣本數量:429
1決策樹數目,1最大深度,正確率:86.48%
1決策樹數目,2最大深度,正確率:86.95%
1決策樹數目,3最大深度,正確率:84.62%
1決策樹數目,7最大深度,正確率:82.75%
1決策樹數目,15最大深度,正確率:78.09%
50決策樹數目,1最大深度,正確率:86.71%
50決策樹數目,2最大深度,正確率:86.48%
50決策樹數目,3最大深度,正確率:86.48%
50決策樹數目,7最大深度,正確率:86.25%
50決策樹數目,15最大深度,正確率:84.38%
100決策樹數目,1最大深度,正確率:86.95%
100決策樹數目,2最大深度,正確率:86.25%
100決策樹數目,3最大深度,正確率:86.48%
100決策樹數目,7最大深度,正確率:86.25%
100決策樹數目,15最大深度,正確率:85.08%
500決策樹數目,1最大深度,正確率:86.48%
500決策樹數目,2最大深度,正確率:86.48%
500決策樹數目,3最大深度,正確率:86.48%
500決策樹數目,7最大深度,正確率:86.25%
500決策樹數目,15最大深度,正確率:84.85%

在這裏插入圖片描述

# Draw the random forest.
# Approach 3: generate the output files directly, one PDF per fitted tree.
from sklearn import tree
from IPython.display import Image
import pydotplus

for k, clf in enumerate(forest.estimators_):
    # Export the tree as Graphviz DOT text (kept in memory, no .dot file).
    dot_data = tree.export_graphviz(
        clf,
        out_file=None,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    # Build the graph from the DOT text and write it under an indexed name.
    pydotplus.graph_from_dot_data(dot_data).write_pdf("foress_tree_%d.pdf" % k)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章