# Machine learning -- basic decision tree (ID3) implementation

# _*_ coding:utf-8 _*_
import numpy as np
import math
import operator
import sys
import pickle
def createDataSet():
    """Return a small toy weather dataset and its feature names.

    Feature encoding:
        outlook     -> 0: sunny | 1: overcast | 2: rain
        temperature -> 0: hot   | 1: mild     | 2: cool
        humidity    -> 0: high  | 1: normal
        windy       -> 0: false | 1: true
    The last element of each row is the class label ('Y' / 'N').
    """
    labels = ['outlook', 'temperature', 'humidity', 'windy']
    dataSet = [
        [0, 0, 0, 0, 'N'],
        [0, 0, 0, 1, 'N'],
        [1, 0, 0, 0, 'Y'],
        [2, 1, 0, 0, 'Y'],
        [2, 2, 1, 0, 'Y'],
        [2, 2, 1, 1, 'N'],
        [1, 2, 1, 1, 'Y'],
    ]
    return dataSet, labels

def creatDataSet1():
    """Return the loan-application dataset and its feature names.

    Features (encoded as small ints): age, has-job, owns-house, credit rating.
    The last element of each row is the class label ('yes' / 'no').
    """
    # Feature names (kept in the original language; they are runtime data).
    labels = ['年齡', '有工作', '有自己的房子', '信貸情況']
    dataSet = [
        [0, 0, 0, 0, 'no'],
        [0, 0, 0, 1, 'no'],
        [0, 1, 0, 1, 'yes'],
        [0, 1, 1, 0, 'yes'],
        [0, 0, 0, 0, 'no'],
        [1, 0, 0, 0, 'no'],
        [1, 0, 0, 1, 'no'],
        [1, 1, 1, 1, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [2, 0, 1, 2, 'yes'],
        [2, 0, 1, 1, 'yes'],
        [2, 1, 0, 1, 'yes'],
        [2, 1, 0, 2, 'yes'],
        [2, 0, 0, 0, 'no'],
    ]
    return dataSet, labels



def cal_entropy(dataset):
    """Return the base-2 Shannon entropy of the class labels in `dataset`.

    The class label is taken to be the last element of each row.
    """
    total = len(dataset)
    counts = {}
    for row in dataset:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    # H = -sum(p * log2(p)) over each class's empirical probability p.
    return -sum((c / total) * math.log(c / total, 2) for c in counts.values())

def selectBestFeature(dataset):
    """Select the feature with the highest ID3 information gain.

    Args:
        dataset: list of rows; the last element of each row is the class label.

    Returns:
        The index of the best splitting feature, or -1 if no feature gives a
        strictly positive information gain.
    """
    # Fix: removed leftover debug print() calls and the unused feat_dic dict.
    base_entropy = cal_entropy(dataset)
    best_gain = 0.0
    best_index = -1
    for i in range(len(dataset[0]) - 1):
        values = {row[i] for row in dataset}
        cond_entropy = 0.0
        for value in values:
            subset = splitdataset(dataset, i, value)
            # Weight each branch's entropy by its share of the samples.
            cond_entropy += (len(subset) / len(dataset)) * cal_entropy(subset)
        gain = base_entropy - cond_entropy
        if gain > best_gain:
            best_gain = gain
            best_index = i
    return best_index


def splitdataset(dataset, i, value):
    """Return the rows whose i-th feature equals `value`, with that column removed.

    Rows are copied; the input dataset is never mutated.
    """
    return [row[:i] + row[i + 1:] for row in dataset if row[i] == value]




def createTree(dataset, labels):
    """Recursively build an ID3 decision tree.

    Args:
        dataset: list of rows; the last element of each row is the class label.
        labels: feature names aligned with the dataset's feature columns.

    Returns:
        Either a class label (leaf) or a nested dict of the form
        {feature_name: {feature_value: subtree_or_label, ...}}.

    Fixes vs. original:
        * no longer mutates the caller's `labels` list (the old
          `del labels[best_index]` corrupted sibling branches and the caller);
        * adds a majority-vote base case when features are exhausted or no
          feature yields positive gain (the old code could recurse forever
          on inconsistent data);
        * removes the duplicate splitdataset() call and debug prints.
    """
    samples = [row[-1] for row in dataset]
    # Base case 1: every sample has the same class -> leaf.
    if samples.count(samples[0]) == len(samples):
        return samples[0]
    # Base case 2: only the label column is left -> majority vote.
    if len(dataset[0]) == 1:
        return max(set(samples), key=samples.count)
    best_index = selectBestFeature(dataset)
    if best_index < 0:
        # No feature provides information gain -> majority vote.
        return max(set(samples), key=samples.count)
    best_label = labels[best_index]
    mytree = {best_label: {}}
    # Build the reduced label list on a copy so callers/siblings are untouched.
    sub_labels = labels[:best_index] + labels[best_index + 1:]
    for val in {row[best_index] for row in dataset}:
        subset = splitdataset(dataset, best_index, val)
        mytree[best_label][val] = createTree(subset, sub_labels)
    return mytree

def classify(inputTree, featLabel, testVec):
    """Classify a feature vector with a tree built by createTree().

    Args:
        inputTree: nested dict {feature_name: {feature_value: subtree_or_label}}.
        featLabel: full, ordered list of feature names (positions must match
            the columns of `testVec`).
        testVec: feature values, indexed by position in `featLabel`.

    Returns:
        The predicted class label, or None if the tree has no branch for the
        observed feature value.
    """
    firstStr = next(iter(inputTree))
    # Fix: the original called the dict -- inputTree(firstStr) -- which raises
    # TypeError; subscript it instead.
    secondDict = inputTree[firstStr]
    featIndex = featLabel.index(firstStr)
    classLabel = None  # fix: was unbound when no branch matched testVec
    for key in secondDict:
        if testVec[featIndex] == key:
            subtree = secondDict[key]
            if isinstance(subtree, dict):
                classLabel = classify(subtree, featLabel, testVec)
            else:
                classLabel = subtree
    return classLabel
# def storeTree(input,filename):
#     with open(filename,'wb') as fw:
#         pickle.dump(input,fw)
if __name__ == '__main__':
    dataset, labels = creatDataSet1()
    # createTree consumes the label list it is given (it deletes entries), so
    # train on a copy and keep `labels` intact for classification.
    mytree = createTree(dataset, labels[:])
    # Fix: the test vector must cover all four features, positionally aligned
    # with `labels` (age=0, has-job=1, owns-house=0, credit=0).
    testVec = [0, 1, 0, 0]
    # Fix: classify() takes three arguments; the feature-name list was missing.
    result = classify(mytree, labels, testVec)
    print(mytree)
    print(result)

 
