首先导入所需的类和函数:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
然后编写函数 tree_titanic():
def tree_titanic(path=r"E:\data\titanic.csv", out_file="titanic_tree.dot"):
    """Train and evaluate a decision tree on the Titanic passenger data.

    Reads the CSV at ``path``, uses the ``pclass``, ``age`` and ``sex``
    columns as features and ``survived`` as the target, fills missing ages
    with the mean age, one-hot encodes the features with ``DictVectorizer``,
    fits an entropy-based ``DecisionTreeClassifier``, prints the predictions
    and the accuracy on the held-out split, and exports the fitted tree in
    Graphviz ``.dot`` format.

    Args:
        path: Location of the Titanic CSV file. A raw string is used for the
            default so that backslashes in the Windows path are not treated
            as escape sequences.
        out_file: Destination of the exported Graphviz description.

    Returns:
        None. Results are reported through ``print`` and the ``.dot`` file.
    """
    titanic = pd.read_csv(path)
    print("type(titanic):", type(titanic))

    # 1. Select feature columns and the target column.
    # Selecting several columns needs a list inside the indexer. The explicit
    # copy makes the frame independent of `titanic`, so filling it below
    # neither raises SettingWithCopyWarning nor is silently lost.
    x = titanic[["pclass", "age", "sex"]].copy()
    print(x)
    y = titanic["survived"]
    print(y)

    # 2. Feature preprocessing.
    # (1) Replace missing ages with the mean age. Assign the result back
    # instead of using inplace=True on a column selection, which is not
    # guaranteed to modify the parent frame.
    x["age"] = x["age"].fillna(x["age"].mean())
    # (2) Convert every row to a dict so DictVectorizer can consume it.
    x = x.to_dict(orient="records")
    print("x3:", x)

    # 3. Split into training and test sets (fixed seed for reproducibility).
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

    # 4. Dict feature extraction: fit on the training data only, then apply
    # the same mapping to the test data.
    transfer = DictVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 5. Decision tree estimator using information gain (entropy).
    estimator = DecisionTreeClassifier(criterion="entropy")
    estimator.fit(x_train, y_train)

    # 6. Model evaluation.
    # (1) Compare predictions with the true labels element by element.
    y_predict = estimator.predict(x_test)
    print("y_predict:\n", y_predict)
    print("直接比对真实值和预测值:\n", y_test == y_predict)
    # (2) Compute the accuracy on the test set.
    score = estimator.score(x_test, y_test)
    print("准确率为:\n", score)

    # 7. Visualise the tree.
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); prefer the new method and fall back
    # to the old one on older installations.
    names_getter = getattr(transfer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = transfer.get_feature_names
    export_graphviz(
        estimator,
        out_file=out_file,
        feature_names=list(names_getter()),
    )
    return None
结果为: