上一篇博文中,我們使用決策樹建立模型,預測泰坦尼克號乘客是否倖存,但出現了過擬合現象。現在我們嘗試着優化模型參數,提高預測精度。
首先我們把上面一篇數據處理的代碼複製過來:
import pandas as pd;
def DataAnalyse(path="./titanic/train.csv"):
    """Load the Titanic training CSV and split it into features and labels.

    Args:
        path: Location of the CSV file (anything ``pandas.read_csv``
            accepts). Defaults to the path the script originally hard-coded.

    Returns:
        A ``(X_train, Y_train)`` tuple: the feature DataFrame with the
        ``Survived`` column removed, and the ``Survived`` column itself.
    """
    data = pd.read_csv(path)
    # Drop the columns that are not used as features: the sample id, cabin,
    # ticket number, passenger name and embarkation port (five columns).
    data = data.drop(
        ["PassengerId", "Cabin", "Ticket", "Name", "Embarked"], axis=1
    )
    # Encode sex as an integer: 1 for 'male', 0 for any other value.
    data["Sex"] = (data["Sex"] == "male").astype("int")
    # Every remaining missing value is replaced with 0.
    data = data.fillna(0)
    # pop() removes the label column from the frame and returns it, leaving
    # only the feature columns behind.
    Y_train = data.pop("Survived")
    X_train = data
    return X_train, Y_train
from sklearn.model_selection import train_test_split
def datasplit(X, Y, test_size=0.2, random_state=None):
    """Split features and labels into training and test subsets.

    Args:
        X: Feature table.
        Y: Labels aligned with ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2, the
            value the script originally hard-coded.
        random_state: Forwarded to ``train_test_split``. ``None`` (the
            default) keeps the original un-seeded behaviour; pass an int to
            make the split reproducible between runs.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test
from sklearn.tree import DecisionTreeClassifier
接下來我們選用限制決策樹深度的方式來進行優化。首先我們知道,決策樹至少有兩層,但最大深度不太好確定。我們先以特徵數的兩倍(6*2=12)爲參考,在代碼中嘗試 2 到 14 層(即 range(2,15))。
import numpy as np;
def max_depth(x_train, x_test, y_train, y_test):
    """Sweep the tree depth from 2 to 14 and report the best-scoring depth.

    For every depth a ``DecisionTreeClassifier`` is fitted on the training
    split and scored on both splits. The list of test scores and the best
    depth with its score are printed, then both score curves are plotted.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. Results are printed and shown in a matplotlib window.
    """
    # Imported locally: the file's module-level pyplot import only appears
    # after the first call to this function, which made `plt` undefined here.
    import matplotlib.pyplot as plt

    depth = range(2, 15)
    trscore = []
    tsscore = []
    for i in depth:
        clf = DecisionTreeClassifier(max_depth=i)
        clf.fit(x_train, y_train)
        trscore.append(clf.score(x_train, y_train))
        tsscore.append(clf.score(x_test, y_test))

    # argmax returns the first maximum, so ties go to the shallowest depth.
    best_index = np.argmax(tsscore)
    print(tsscore)
    best_score = tsscore[best_index]
    x = depth[best_index]
    print("最優深度:", x, "得分:", best_score)

    plt.figure()
    plt.plot(depth, tsscore, ".g--", label="測試")
    plt.plot(depth, trscore, ".r--", label="訓練")
    # Without legend() the label= arguments above are never displayed.
    # NOTE(review): the labels are CJK text; confirm the active matplotlib
    # font can render them.
    plt.legend()
    plt.show()
if __name__ == '__main__':
    # Load the data, split it into train/test parts, and run the depth sweep.
    features, labels = DataAnalyse()
    # datasplit returns (x_train, x_test, y_train, y_test), which is exactly
    # the positional order max_depth expects.
    max_depth(*datasplit(features, labels))
運行結果:
[0.776536312849162, 0.8212290502793296, 0.8212290502793296, 0.8547486033519553, 0.8379888268156425, 0.8044692737430168, 0.8044692737430168, 0.7877094972067039, 0.7932960893854749, 0.776536312849162, 0.7877094972067039, 0.7430167597765364, 0.7653631284916201]
最優深度: 5 得分: 0.8547486033519553
比上一篇的結果好一些了。
爲了更直觀的展示一下,我們嘗試着把它圖形化。
圖中紅線是訓練集評分,綠線是測試集評分。可以看出,測試集評分在決策樹深度爲5時達到最高,之後開始下降,所以可以確定最優深度是5。另外需要指明的一點是,我們的訓練集和測試集是隨機劃分的,所以每次運行的結果是不一樣的。
下面我們嘗試修改節點分裂的不純度閾值(min_impurity_split 參數)來優化模型,嘗試提高預測精度。注意:該參數在 scikit-learn 1.0 及之後的版本中已被移除,運行下面的代碼需要使用較舊的版本。
from sklearn.model_selection import train_test_split
def datasplit(X, Y, test_size=0.2, random_state=None):
    """Split features and labels into training and test subsets.

    Args:
        X: Feature table.
        Y: Labels aligned with ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2, the
            value the script originally hard-coded.
        random_state: Forwarded to ``train_test_split``. ``None`` (the
            default) keeps the original un-seeded behaviour; pass an int to
            make the split reproducible between runs.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test
from sklearn.tree import DecisionTreeClassifier
import numpy as np;
import matplotlib.pyplot as plt;
def Value(x_train, x_test, y_train, y_test):
    """Sweep the impurity threshold over [0, 1] and report the best value.

    Twenty evenly spaced thresholds are tried. For each one a gini-based
    ``DecisionTreeClassifier`` is fitted on the training split and scored on
    both splits. The list of test scores and the best threshold with its
    score are printed, then both score curves are plotted.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. Results are printed and shown in a matplotlib window.
    """
    value = np.linspace(0, 1, 20)
    trscore = []
    tsscore = []
    for i in value:
        # NOTE(review): `min_impurity_split` was deprecated in scikit-learn
        # 0.19 and removed in 1.0, so this call needs an older release. The
        # documented successor, `min_impurity_decrease`, has different
        # semantics and is therefore not substituted here.
        clf = DecisionTreeClassifier(criterion="gini", min_impurity_split=i)
        clf.fit(x_train, y_train)
        trscore.append(clf.score(x_train, y_train))
        tsscore.append(clf.score(x_test, y_test))

    # argmax returns the first maximum, so ties go to the smallest threshold.
    best_index = np.argmax(tsscore)
    print(tsscore)
    best_score = tsscore[best_index]
    x = value[best_index]
    print("最優閾值:", x, "得分:", best_score)

    plt.figure()
    plt.plot(value, tsscore, ".g--", label="測試")
    plt.plot(value, trscore, ".r--", label="訓練")
    # Without legend() the label= arguments above are never displayed.
    # NOTE(review): the labels are CJK text; confirm the active matplotlib
    # font can render them.
    plt.legend()
    plt.show()
if __name__ == '__main__':
    # Load the data, split it into train/test parts, and run the threshold sweep.
    features, labels = DataAnalyse()
    # datasplit returns (x_train, x_test, y_train, y_test), which is exactly
    # the positional order Value expects.
    Value(*datasplit(features, labels))
運行結果:
[0.7653631284916201, 0.7597765363128491, 0.7821229050279329, 0.8100558659217877, 0.8379888268156425, 0.8156424581005587, 0.8212290502793296, 0.8212290502793296, 0.7877094972067039, 0.5698324022346368, 0.5698324022346368, 0.5698324022346368, 0.5698324022346368, 0.5698324022346368, 0.5698324022346368, 0.5698324022346368, 0.5698324022346368, 0.5698324022346368, 0.5698324022346368, 0.5698324022346368]
最優閾值: 0.21052631578947367 得分: 0.8379888268156425