Random Forests Revisited: A Python Implementation

I found this code on GitHub after a long search; it is very well suited to beginners. My thanks to the original author: I learned a great deal from your code.

from __future__ import division
import pandas as pd
import copy
import random
import math

# If the last remaining attribute still cannot separate the samples completely, the most frequent label is chosen as the final class
def majorClass(classList):
    classDict = {}
    for cls in classList:
        classDict[cls] = classDict.get(cls, 0) + 1
    sortClass = sorted(classDict.items(), key=lambda item: item[1])
    return sortClass[-1][0]
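
# Toy check: majorClass(['A', 'B', 'A']) -> 'A'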

# Compute the Gini impurity
def calcGini(dataSet):
    labelCounts = {}
    # Build a dictionary counting every class that appears
    for dt in dataSet:
        currentLabel = dt[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    Gini = 1
    for key in labelCounts:
        prob = labelCounts[key] / len(dataSet)
        Gini -= prob * prob
    return Gini
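
# For intuition, Gini = 1 - sum(p_k ** 2) over the class proportions p_k.
# Toy check: calcGini([[1.0, 'A'], [2.0, 'A'], [3.0, 'B']])
#            -> 1 - (2/3)**2 - (1/3)**2 ≈ 0.444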

# Split the dataset on a continuous variable
def splitDataSet(dataSet, featIndex, value):
    leftData, rightData = [], []
    for dt in dataSet:
        if dt[featIndex] <= value:
            leftData.append(dt)
        else:
            rightData.append(dt)
    return leftData, rightData
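
# Toy check: splitDataSet([[1, 'A'], [5, 'B']], 0, 3)
#            -> ([[1, 'A']], [[5, 'B']])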

# Choose the best feature and split point for the dataset
def chooseBestFeature(dataSet):
    bestGini = 1
    bestFeatureIndex = -1
    bestSplitValue = None
    # For the i-th feature
    for i in range(len(dataSet[0]) - 1):
        featList = [dt[i] for dt in dataSet]
        # Generate candidate split points (midpoints between adjacent sorted values)
        sortfeatList = sorted(list(set(featList)))
        splitList = []
        for j in range(len(sortfeatList) - 1):
            splitList.append((sortfeatList[j] + sortfeatList[j + 1]) / 2)

        # Evaluate each candidate split point and record the best one
        for splitValue in splitList:
            newGini = 0
            subDataSet0, subDataSet1 = splitDataSet(dataSet, i, splitValue)
            newGini += len(subDataSet0) / len(dataSet) * calcGini(subDataSet0)
            newGini += len(subDataSet1) / len(dataSet) * calcGini(subDataSet1)
            if newGini < bestGini:
                bestGini = newGini
                bestFeatureIndex = i
                bestSplitValue = splitValue
    return bestFeatureIndex, bestSplitValue
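
# Example: feature values [1, 3, 7] produce candidate split points [2.0, 5.0];
# the (feature, split) pair with the lowest weighted Gini is returned.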

# Remove the chosen attribute and generate the new left/right datasets
def splitData(dataSet, featIndex, features, value):
    newFeatures = copy.deepcopy(features)
    newFeatures.remove(features[featIndex])
    leftData, rightData = [], []
    for dt in dataSet:
        temp = []
        temp.extend(dt[:featIndex])
        temp.extend(dt[featIndex + 1:])
        # In experiments, value sometimes turns out to be None.
        # There seem to be two possible causes (likelihood in parentheses):
        # 1. a problem in chooseBestFeature (20%)
        # 2. a problem in your dataset (80%)
        if dt[featIndex] <= value:
            leftData.append(temp)
        else:
            rightData.append(temp)
    return newFeatures, leftData, rightData

# Build the decision tree
def createTree(dataSet, features):
    classList = [dt[-1] for dt in dataSet]
    # All labels identical: every sample falls on one side, return that label
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only one feature left and the samples still aren't separated: return the majority label
    if len(features) == 1:
        return majorClass(classList)
    bestFeatureIndex, bestSplitValue = chooseBestFeature(dataSet)
    bestFeature = features[bestFeatureIndex]
    # Generate the new dataset with bestFeature removed
    newFeatures, leftData, rightData = splitData(dataSet, bestFeatureIndex, features, bestSplitValue)
    # Two subtrees: left holds values <= the best split point, right holds values > it
    myTree = {bestFeature: {'<' + str(bestSplitValue): {}, '>' + str(bestSplitValue): {}}}
    myTree[bestFeature]['<' + str(bestSplitValue)] = createTree(leftData, newFeatures)
    myTree[bestFeature]['>' + str(bestSplitValue)] = createTree(rightData, newFeatures)
    return myTree
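
# The returned tree is a nested dict; a hypothetical two-level example:
#   {0: {'<12.5': 1, '>12.5': {3: {'<19.0': 2, '>19.0': 1}}}}
# (keys 0 and 3 are feature column names, leaves are class labels)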

# Classify a test sample with the generated decision tree
def treeClassify(decisionTree, featureLabel, testDataSet):
    firstFeature = list(decisionTree.keys())[0]
    secondFeatDict = decisionTree[firstFeature]
    # The keys of secondFeatDict are '<'+str(bestSplitValue) and '>'+str(bestSplitValue),
    # so slice from index 1 to strip the leading '<' or '>'
    splitValue = float(list(secondFeatDict.keys())[0][1:])
    featureIndex = featureLabel.index(firstFeature)
    if testDataSet[featureIndex] <= splitValue:
        valueOfFeat = secondFeatDict['<' + str(splitValue)]
    else:
        valueOfFeat = secondFeatDict['>' + str(splitValue)]
    if isinstance(valueOfFeat, dict):
        pred_label = treeClassify(valueOfFeat, featureLabel, testDataSet)
    else:
        pred_label = valueOfFeat
    return pred_label
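
# Usage sketch: treeClassify(tree, labels[:-1], testData) walks the nested
# dict, comparing testData[featureIndex] against each split value until it
# reaches a leaf label (see testWine below).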

# Draw a bootstrap sample: as many rows as the original training set, using int(sqrt(m-1)) randomly chosen feature columns
def baggingDataSet(dataSet):
    n, m = dataSet.shape
    features = random.sample(list(dataSet.columns.values[:-1]), int(math.sqrt(m - 1)))  # list() so random.sample works on Python 3
    features.append(dataSet.columns.values[-1])
    # Don't be puzzled that the last column must be appended here. You might ask:
    # isn't sampling sqrt(m-1) features enough? This function builds a complete
    # new dataset, so the final column (the class label) has to be included.
    rows = [random.randint(0, n-1) for _ in range(n)]
    trainData = dataSet.iloc[rows][features]
    # Select the sampled rows and columns via pandas.DataFrame.iloc
    return trainData.values.tolist(), features
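
# Each call yields one bootstrap sample: n rows drawn with replacement plus
# sqrt(m-1) random feature columns and the label column, e.g.
#   baggingData, baggingLabels = baggingDataSet(df)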

def testWine():
    df = pd.read_csv('wine.txt', header=None)
    labels = df.columns.values.tolist()
    df = df[df[labels[-1]] != 3]
    # Generate multiple decision trees and collect them in a list
    treeCounts = 10
    treeList = []
    for i in range(treeCounts):
        baggingData, bagginglabels = baggingDataSet(df)
        decisionTree = createTree(baggingData, bagginglabels)
        treeList.append(decisionTree)
    print(treeList)
    # Classify the test sample
    labelPred = []
    for tree in treeList:
        testData = [12, 0.92, 2, 19, 86, 2.42, 2.26, 0.3, 1.43, 2.5, 1.38, 3.12, 278]
        label = treeClassify(tree, labels[:-1], testData)
        labelPred.append(label)
    # Vote for the final class (effectively the same as majorClass())
    labelDict = {}
    for label in labelPred:
        labelDict[label] = labelDict.get(label, 0) + 1
    sortClass = sorted(labelDict.items(), key=lambda item: item[1])
    print "The predicted label is: {}".format(sortClass[-1][0])
testWine()

This algorithm resolves the two questions from the previous post on random forests.
As an extension, how would you repeat the experiment ten times?
Note: this is not 10-fold cross-validation. In 10-fold cross-validation the folds are partitioned in advance, so a sample that has already been assigned to the test set will never be assigned to it again. I forgot this at first and assumed that looping ten times equals 10-fold cross-validation; only after finishing did I notice my mistake.
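
For contrast, here is a minimal sketch of genuine 10-fold cross-validation using scikit-learn's KFold (assuming data and labels are NumPy arrays like the ones returned by load_data below):

from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=0)
for train_idx, test_idx in kf.split(data):
    # Every sample lands in the test fold exactly once across the ten folds
    x_train, x_test = data[train_idx], data[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]
    # ...train the forest on (x_train, y_train), score it on (x_test, y_test)...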

Here is a reference example: I combined the code from these two random-forest posts myself. My abilities are limited, so bear with it.

import scipy.io as sio
import numpy as np
import pandas as pd
import copy
import random
import math
from sklearn.model_selection import train_test_split

def load_data(filename):
    mat = sio.loadmat(filename)  # renamed so it does not shadow the function
    PET = mat['PET']
    MRI = mat['MRI']
    #MRI = np.round(MRI, decimals=6)
    GND4 = mat['GND4']
    GND3 = mat['GND3']
    CSF = mat['CSF']
    dataset = np.append(PET,MRI,axis=1)
    dataset = np.append(dataset,CSF,axis = 1)
    dataset = np.append(dataset,GND3,axis = 1)
    df = pd.DataFrame(dataset)
    labels = df.columns.values.tolist()
    df = df[df[labels[-1]] != 2]  # keep only classes 1 and 3
    dataset = df.iloc[:, 0:189]  # 189 feature columns
    label = df.iloc[:, 189]      # column 189 holds the class label
    dataset = np.array(dataset)
    label = np.array(label)
    label = label.tolist()
    labels = []
    for i in label:
        labels.append([i])
    # Convert to an array only after the loop; converting inside the loop
    # would break labels.append on the next iteration
    labels = np.array(labels)
    return dataset,labels

def majorClass(classList):
    classDict = {}
    for cls in classList:
        classDict[cls] = classDict.get(cls, 0) + 1
    sortClass = sorted(classDict.items(), key=lambda item: item[1])
    return sortClass[-1][0]


def calcGini(dataSet):
    labelCounts = {}

    for dt in dataSet:
        currentLabel = dt[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    Gini = 1
    for key in labelCounts:
        prob = labelCounts[key] / len(dataSet)
        Gini -= prob * prob
    return Gini

def splitDataSet(dataSet, featIndex, value):
    leftData, rightData = [], []
    for dt in dataSet:
        if dt[featIndex] <= value:
            leftData.append(dt)
        else:
            rightData.append(dt)
    return leftData, rightData

def chooseBestFeature(dataSet):
    bestGini = 999
    bestFeatureIndex = -1
    bestSplitValue = None
    for i in range(len(dataSet[0]) - 1):
        featList = [dt[i] for dt in dataSet]
        sortfeatList = sorted(list(set(featList)))
        # Candidate split points here are the sorted feature values themselves
        # (the midpoint variant from the first version is dropped)
        for splitValue in sortfeatList:
            newGini = 0
            subDataSet0, subDataSet1 = splitDataSet(dataSet, i, splitValue)
            newGini += len(subDataSet0) / len(dataSet) * calcGini(subDataSet0)
            newGini += len(subDataSet1) / len(dataSet) * calcGini(subDataSet1)
            if newGini < bestGini:
                bestGini = newGini
                bestFeatureIndex = i

                bestSplitValue = splitValue
    return bestFeatureIndex, bestSplitValue

def splitData(dataSet, featIndex, features, value):
    newFeatures = copy.deepcopy(features)
    newFeatures.remove(features[featIndex])
    leftData, rightData = [], []
    for dt in dataSet:
        temp = []
        temp.extend(dt[:featIndex])
        temp.extend(dt[featIndex + 1:])

        # Guard against value being None (see the note in the first version)
        if value is None or dt[featIndex] <= value:
            leftData.append(temp)
        else:
            rightData.append(temp)
    return newFeatures, leftData, rightData

def createTree(dataSet, features):

    classList = [dt[-1] for dt in dataSet]

    if classList.count(classList[0]) == len(classList):
        return classList[0]

    if len(features) == 1:
        return majorClass(classList)
    bestFeatureIndex, bestSplitValue = chooseBestFeature(dataSet)
    bestFeature = features[bestFeatureIndex]

    newFeatures, leftData, rightData = splitData(dataSet, bestFeatureIndex, features, bestSplitValue)

    myTree = {bestFeature: {'<' + str(bestSplitValue): {}, '>' + str(bestSplitValue): {}}}

    myTree[bestFeature]['<' + str(bestSplitValue)] = createTree(leftData, newFeatures)
    myTree[bestFeature]['>' + str(bestSplitValue)] = createTree(rightData, newFeatures)
    return myTree


def treeClassify(decisionTree, featureLabel, testDataSet):
    firstFeature = list(decisionTree.keys())[0]
    secondFeatDict = decisionTree[firstFeature]
    splitValue = float(list(secondFeatDict.keys())[0][1:])
    featureIndex = featureLabel.index(firstFeature)
    if testDataSet[featureIndex] <= splitValue:
        valueOfFeat = secondFeatDict['<' + str(splitValue)]
    else:
        valueOfFeat = secondFeatDict['>' + str(splitValue)]
    if isinstance(valueOfFeat, dict):
        pred_label = treeClassify(valueOfFeat, featureLabel, testDataSet)
    else:
        pred_label = valueOfFeat
    return pred_label


def baggingDataSet(dataSet):
    n, m = dataSet.shape

    features = random.sample(list(dataSet.columns.values[:-1]), int(math.sqrt(m - 1)) + 1)

    features.append(dataSet.columns.values[-1])

    rows = [random.randint(0, n-1) for _ in range(n)]
    trainData = dataSet.iloc[rows][features]
    return trainData.values.tolist(), features






def ad_vs_nc(train_set,test_set,n_trees):
    df = pd.DataFrame(train_set)  # the training dataset
    labels = df.columns.values.tolist()

    df = df[df[labels[-1]] != 2]

    treeCounts = n_trees
    treeList = []
    for i in range(treeCounts):  # grow the forest
        baggingData, bagginglabels = baggingDataSet(df)
        decisionTree = createTree(baggingData, bagginglabels)
        treeList.append(decisionTree)
    print(treeList, 'treelist')
    predictions = []
    for row in test_set:
        labelPred = []
        for tree in treeList:
            testData = row
            label = treeClassify(tree, labels[:-1], testData)
            labelPred.append(label)
        predictions.append(majorClass(labelPred))
    return predictions


def change_y_test(y_test):
    actualList = []
    for i in range(len(y_test)):
        actualList.extend(y_test[i])
    return actualList

def accuracy_cal(actual, predicted):  
    print(actual,'actual')
    print(predicted,'predicted')
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
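
# Toy check: accuracy_cal([1, 3, 1], [1, 1, 1]) -> 66.66...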

def evaluate_algorithm():
    filename = 'f:\\ADNI202.mat'
    data,labels = load_data(filename)

    scores = []
    n_trees = 100
    for i in range(10):
        # sklearn helper that splits data into training and test sets:
        # x_train/y_train are the training features/labels, x_test/y_test the test ones.
        # With a fixed random_state every iteration would produce the identical split,
        # so vary it with i to genuinely repeat the experiment ten times
        x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.1, random_state=i)


        train_set = np.append(x_train, y_train, axis=1)
        train_set = pd.DataFrame(train_set)
        test_set = x_test.tolist()
        actual = change_y_test(y_test)
        predicted = ad_vs_nc(train_set,test_set,n_trees)
        accuracy = accuracy_cal(actual,predicted)
        scores.append(accuracy)

    print("trees {0}".format(n_trees))
    print('scores{0}'.format(scores))
    print('mean accuracy{0}'.format(sum(scores)/float(len(scores))))      

evaluate_algorithm()        

