# python機器學習之用決策樹處理泰坦尼克號數據

import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

def tree_titanic(path=r"E:\data\titanic.csv", out_file="titanic_tree.dot"):
    """Train, evaluate and export a decision tree on the Titanic data set.

    The CSV at ``path`` is loaded with pandas. The ``pclass``, ``age`` and
    ``sex`` columns are used as features and ``survived`` as the target.
    Missing ages are replaced by the mean age, the features are one-hot
    encoded with ``DictVectorizer``, and an entropy-based
    ``DecisionTreeClassifier`` is fitted on the training split. Predictions
    and accuracy on the test split are printed, and the fitted tree is
    written in Graphviz DOT format.

    Args:
        path: Location of the Titanic CSV file. A raw string is used for the
            default so that the Windows backslashes are not treated as
            escape sequences.
        out_file: File name the Graphviz DOT description is written to.

    Returns:
        None. Results are printed and the tree is written to ``out_file``.
    """
    titanic = pd.read_csv(path)
    print("type(titanic):", type(titanic))

    # 1. Select the feature columns and the target column.
    # Selecting several columns needs a list inside the indexer; the copy
    # makes the later age assignment act on an independent frame instead of
    # a slice of `titanic`.
    x = titanic[["pclass", "age", "sex"]].copy()
    print(x)
    y = titanic["survived"]
    print(y)

    # 2. Feature preprocessing.
    # (1) Fill missing ages with the mean age. Assign the result back rather
    # than using inplace=True on a column selection, which is chained
    # assignment and does not reliably modify the frame.
    x["age"] = x["age"].fillna(x["age"].mean())
    # (2) Convert each row to a dict so DictVectorizer can encode it.
    x = x.to_dict(orient="records")
    print("x3:", x)

    # 3. Split into training and test sets (fixed seed for repeatability).
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

    # 4. Dictionary feature extraction: fit on the training data only, then
    # apply the same mapping to the test data.
    transfer = DictVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 5. Decision tree estimator.
    estimator = DecisionTreeClassifier(criterion="entropy")
    estimator.fit(x_train, y_train)

    # 6. Model evaluation.
    # (1) Compare the true values with the predictions directly.
    y_predict = estimator.predict(x_test)
    print("y_predict:\n", y_predict)
    print("直接比對真實值和預測值：\n", y_test == y_predict)
    # (2) Compute the accuracy.
    score = estimator.score(x_test, y_test)
    print("準確率爲：\n", score)

    # 7. Visualise the decision tree.
    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for older versions.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    export_graphviz(estimator, out_file=out_file, feature_names=feature_names)
    return None