# DT (decision tree) sample

import matplotlib.pyplot as plt
from sklearn import tree as te
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

# Load the data
def createDataSet():
    """Return a small hard-coded sample data set and its feature names.

    Returns:
        tuple: ``(dataSet, labels)``. ``dataSet`` is a list of 15 rows; each
        row holds four feature values followed by the class value
        (``'refuse'`` or ``'agree'``). ``labels`` lists the names of the four
        feature columns, in column order.
    """
    # NOTE(review): the credit_situation column mixes the int 1 with the
    # strings '2' and '3'. The values are reproduced exactly so existing
    # callers see the same data; confirm whether the mix is intentional.
    dataSet = [
        ['youth', 'no', 'no', 1, 'refuse'],
        ['youth', 'no', 'no', '2', 'refuse'],
        ['youth', 'yes', 'no', '2', 'agree'],
        ['youth', 'yes', 'yes', 1, 'agree'],
        ['youth', 'no', 'no', 1, 'refuse'],
        ['mid', 'no', 'no', 1, 'refuse'],
        ['mid', 'no', 'no', '2', 'refuse'],
        ['mid', 'yes', 'yes', '2', 'agree'],
        ['mid', 'no', 'yes', '3', 'agree'],
        ['mid', 'no', 'yes', '3', 'agree'],
        ['elder', 'no', 'yes', '3', 'agree'],
        ['elder', 'no', 'yes', '2', 'agree'],
        ['elder', 'yes', 'no', '2', 'agree'],
        ['elder', 'yes', 'no', '3', 'agree'],
        ['elder', 'no', 'no', 1, 'refuse'],
    ]
    labels = ['age', 'working?', 'house?', 'credit_situation']
    return dataSet, labels
# Test code

def createDataSet2():
    """Return a second hard-coded sample data set and its feature names.

    Returns:
        tuple: ``(dataSet, labels)``. ``dataSet`` is a list of 12 rows; each
        row holds six feature values followed by the class value
        (``'Walk'`` or ``'Drive'``). ``labels`` lists the names of the six
        feature columns, in column order.
    """
    # NOTE(review): the eleventh row has an empty string in the 'Shopping'
    # column. It is kept as-is; confirm whether it is a deliberate
    # missing-value sample.
    dataSet = [
        [1, 'Rain', 'Formal', 'Yes', 'Yes', 'No', 'Walk'],
        [4, 'Snow', 'casual', 'No', 'No', 'Yes', 'Drive'],
        [7, 'Good', 'casual', 'No', 'No', 'No', 'Walk'],
        [10, 'Rain', 'Formal', 'Yes', 'Yes', 'No', 'Walk'],
        [5, 'Good', 'casual', 'Yes', 'No', 'Yes', 'Walk'],
        [6, 'Good', 'Formal', 'No', 'No', 'No', 'Drive'],
        [8, 'Snow', 'Formal', 'No', 'Yes', 'Yes', 'Drive'],
        [1, 'Rain', 'Formal', 'Yes', 'No', 'Yes', 'Walk'],
        [4, 'Snow', 'casual', 'Yes', 'Yes', 'Yes', 'Drive'],
        [7, 'Good', 'Formal', 'No', 'No', 'Yes', 'Drive'],
        [10, 'Good', 'casual', '', 'No', 'Yes', 'Drive'],
        [10, 'Good', 'casual', 'No', 'Yes', 'Yes', 'Drive'],
    ]
    labels = ['number', 'Weather', 'Cloth', 'Shopping', 'Weekend', 'Temp>90']
    return dataSet, labels


def createDataSet3(matrix_path="./MatrixEventAndLable/rawTFVector.txt",
                   label_path="./MatrixEventAndLable/mlabel.txt"):
    """Load the event matrix and labels from disk as one list-of-rows data set.

    Both files are read as space-separated text without a header row. The
    last column of the matrix file is dropped, and the first column of the
    label file is appended to every row as the class value.

    Args:
        matrix_path: Path of the space-separated event matrix file.
        label_path: Path of the space-separated label file.

    Returns:
        tuple: ``(dataSet_, labels_)``. ``dataSet_`` is a list of rows, each
        row being the kept matrix columns followed by the label, converted
        to native Python scalars. ``labels_`` holds one generated name per
        kept matrix column: ``"E0"``, ``"E1"``, ...
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    matrix_arrs = pd.read_csv(matrix_path, sep=' ', header=None).to_numpy()
    # Drop the last column of the matrix file.
    # NOTE(review): presumably a trailing separator yields an empty column;
    # confirm against the input files.
    event_count_matrix = matrix_arrs[:, :-1]

    label_arrs = pd.read_csv(label_path, sep=' ', header=None).to_numpy()
    labels = label_arrs[:, 0]

    dataSet = np.column_stack((event_count_matrix, labels))

    labels_ = ["E" + str(i) for i in range(event_count_matrix.shape[1])]

    # tolist() builds independent row lists of native Python scalars,
    # replacing the element-by-element copy loops.
    dataSet_ = dataSet.tolist()

    return dataSet_, labels_


def createDataSet4(matrix_path="./MatrixEventAndLable/rawTFVector.txt",
                   label_path="./MatrixEventAndLable/mlabel.txt"):
    """Load the event matrix and labels and split them into train/test parts.

    Both files are read as space-separated text without a header row, and
    the last column of each is dropped. The first two thirds of the rows
    (rounded down) form the training part; the remaining rows form the test
    part. Rows are not shuffled.

    Args:
        matrix_path: Path of the space-separated event matrix file.
        label_path: Path of the space-separated label file.

    Returns:
        tuple: ``(dataSet_, labels_, testdata_, testlabels_)`` as numpy
        arrays. The label arrays stay two-dimensional (one row per sample).
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    matrix_arrs = pd.read_csv(matrix_path, sep=' ', header=None).to_numpy()
    event_count_matrix = matrix_arrs[:, :-1]

    label_arrs = pd.read_csv(label_path, sep=' ', header=None).to_numpy()
    label_arrs = label_arrs[:, :-1]

    # Integer arithmetic gives the same floor as int(n * 2 / 3) without
    # going through a float.
    train_size = event_count_matrix.shape[0] * 2 // 3

    dataSet_ = event_count_matrix[:train_size]
    labels_ = label_arrs[:train_size]
    testdata_ = event_count_matrix[train_size:]
    testlabels_ = label_arrs[train_size:]
    return dataSet_, labels_, testdata_, testlabels_

def precision_recall_f1(tp, fp, fn):
    """Compute precision, recall and F1 from confusion-matrix counts.

    A ratio whose denominator is zero is reported as 0.0 instead of raising
    or producing nan.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        tuple: ``(precision, recall, f1)`` as floats.
    """
    # Convert up front so the divisions are true divisions for any integer
    # type (including numpy integers).
    tp, fp, fn = float(tp), float(fp), float(fn)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1


def main():
    """Train a decision tree on the on-disk data set and print its scores.

    Fits a ``DecisionTreeClassifier`` on the training split returned by
    ``createDataSet4``, writes the fitted tree to ``tree.dot``, predicts the
    test split and prints TP/FP/FN/TN followed by precision, recall and F1.

    Raises:
        ValueError: If the confusion matrix is not 2x2.
    """
    # Earlier experiment with a hand-rolled tree builder, kept for reference:
    # myDat, labels = createDataSet3()
    # myTree = tree.createTree(myDat, labels)

    ## with createDataSet3, the following myTree was obtained
    #myTree = {'E25': {0: 1, 3: {'E19': {0: {'E26': {0: {'E12': {0: {'E17': {0: {'E27': {0: {'E0': {0: 0, 2: 1}}, 3: 1}}, 1: {'E3': {0: 1, 2: 1, 3: 1, 4: 1, 6: 0}}, 2: 1}}, 3: 1}}, 1: {'E3': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 16: 0, 15: 0}}, 2: 1}}, 1: {'E2': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: {'E3': {0: 1, 3: 1, 4: {'E26': {0: 0, 1: 1}}, 5: 1}}, 10: 1, 203: 1}}, 2: 1, 3: 1}}, 4: {'E20': {0: {'E4': {3: 1, 4: 0}}, 1: {'E26': {0: {'E12': {0: 0, 3: 1}}, 1: 1}}, 3: {'E3': {0: 1, 16: 1, 2: 1, 3:1, 4: 1, 5: 1, 6: 1, 17: 1, 25: 0, 18: 1, 15: 0}}, 4: {'E3': {0: {'E2': {0: 1, 1: 1, 203: 0}}, 1: {'E12': {0: 0, 3: 1}}, 2: {'E22': {3: 1, 4: 0}}, 3: {'E2': {1: {'E4': {4: 0, 7: 1}}, 3: 1}}, 4: 1, 5: 1, 6: 1, 41: 0, 15: 1, 16: {'E22': {3: 1, 4: 0}}, 17: {'E22': {3: 1, 4: 0}}, 19: 1}}}}, 5: {'E20': {1: {'E17': {1: {'E26': {0: {'E12': {0: 0, 3: 1}}, 1: 1}}, 2: 1}}, 2: 1, 4: {'E3': {0: {'E22': {3: 1, 4: {'E19': {0: {'E4': {4: 0, 7: 1}}, 1: 1, 2: 1, 3: 1}}}}, 16: 1, 2: {'E22': {3: 1, 4: 0}}, 3: 1, 4: 1, 17: {'E22': {3: 1, 4: 0}}, 23: 0, 18: 0, 15: 0}}, 5: {'E19': {0: 1, 1: 1, 2: {'E26': {0: 0, 1: 1}}}}}}, 6: {'E1': {0: 1, 1: 1, 2: 1, 3: 0}}, 7: 1, 8: 1, 9: {'E26': {0: {'E0': {0: 0, 2: 1}}, 1: 1}}, 10: {'E2': {0: 1, 203: 0}}, 11: {'E15': {2: 1, 3: 1, 4: 0, 5: {'E6': {0: 0, 1: 1}}}}, 12: {'E1': {0: 0, 1: 1}}, 13: 1}}
    #print(myTree)
    #createPlot(myTree)

    traindata, trainlabels, testdata, testlabels = createDataSet4()

    clf = te.DecisionTreeClassifier()
    clf = clf.fit(traindata, trainlabels)

    # install graphviz first (https://graphviz.gitlab.io/_pages/Download/Download_source.html)
    te.export_graphviz(clf, out_file="tree.dot")  # dot -Tpng tree.dot -o tree.png

    prediction = list(clf.predict(testdata))
    assert len(prediction) == len(testlabels)

    cm = confusion_matrix(testlabels, prediction)
    # The four-way unpack below only makes sense for a 2x2 matrix; fail with
    # a clear message otherwise (e.g. a single class in the test split).
    if cm.shape != (2, 2):
        raise ValueError(
            "expected a 2x2 confusion matrix, got shape %s" % (cm.shape,)
        )
    tn, fp, fn, tp = cm.ravel()
    print("TP:%d FP:%d FN:%d TN:%d" %(tp, fp, fn, tn))

    P, R, F1_SCORE = precision_recall_f1(tp, fp, fn)
    print("P:%f R:%f F1_SCORE:%f" %(P,R,F1_SCORE))


if __name__ == "__main__":
    main()



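# (Web-page residue below; not part of the script.)
# Post a comment
# All comments
# No one has commented yet. Want to be the first? Enter your comment in the box above and click Publish.
# Related articles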