首先引入所需的類和函數:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
然後定義函數 tree_titanic():
def tree_titanic(path: str = r"E:\data\titanic.csv") -> None:
    """Train and evaluate a decision tree on the Titanic passenger data.

    Reads the CSV at *path*, uses the ``pclass``, ``age`` and ``sex`` columns
    as features and ``survived`` as the target, fills missing ages with the
    mean age, one-hot encodes the features with ``DictVectorizer``, fits an
    entropy-based ``DecisionTreeClassifier``, prints the predictions and the
    accuracy on the held-out split, and writes the fitted tree to
    ``titanic_tree.dot`` in the current working directory.

    Args:
        path: Location of the Titanic CSV file. The default is a raw string
            so that backslashes in the Windows path are never interpreted as
            escape sequences (``\\t`` would otherwise become a tab).

    Returns:
        None. Results are printed and the tree is written to disk.
    """
    titanic = pd.read_csv(path)
    print("type(titanic):", type(titanic))

    # 1. Select feature columns and the target column.
    # Selecting several columns needs a list inside the indexer (double
    # brackets). The explicit copy makes the later column assignment act on
    # an independent frame instead of a slice of `titanic`.
    x = titanic[["pclass", "age", "sex"]].copy()
    print(x)
    y = titanic["survived"]
    print(y)

    # 2. Feature preparation.
    # (1) Fill missing ages with the mean age. Note `mean()` must be called;
    # assigning the result back (rather than a chained `inplace=True` fill)
    # is what guarantees the NaNs are actually replaced.
    x["age"] = x["age"].fillna(x["age"].mean())
    # (2) Convert each row to a dict so DictVectorizer can consume it.
    x = x.to_dict(orient="records")
    print("x3:", x)

    # 3. Train/test split with a fixed seed.
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

    # 4. Dict feature extraction: fit on the training rows only, then apply
    # the same mapping to the test rows.
    transfer = DictVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 5. Decision tree estimator.
    estimator = DecisionTreeClassifier(criterion="entropy")
    estimator.fit(x_train, y_train)

    # 6. Model evaluation.
    # (1) Compare predictions with the true labels element by element.
    y_predict = estimator.predict(x_test)
    print("y_predict:\n", y_predict)
    print("直接比對真實值和預測值:\n", y_test == y_predict)
    # (2) Accuracy on the test split.
    score = estimator.score(x_test, y_test)
    print("準確率爲:\n", score)

    # 7. Export the tree for visualisation.
    # `get_feature_names()` was removed from recent scikit-learn releases in
    # favour of `get_feature_names_out()`; fall back to the old name only
    # when the new one does not exist.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = list(transfer.get_feature_names())
    export_graphviz(
        estimator,
        out_file="titanic_tree.dot",
        feature_names=feature_names,
    )
    return None
結果爲: