I searched GitHub for a long time before finding this code, which is very well suited to beginners. My thanks to the original author; I learned a lot from your code.
from __future__ import division, print_function
import pandas as pd
import copy
import random
import math
# when the last attribute still cannot separate the samples completely, the most common label is chosen as the final class
def majorClass(classList):
classDict = {}
for cls in classList:
classDict[cls] = classDict.get(cls, 0) + 1
sortClass = sorted(classDict.items(), key=lambda item: item[1])
return sortClass[-1][0]
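As an aside, the same majority vote can be written with the standard library's collections.Counter; a minimal equivalent sketch (majorClassAlt is a hypothetical name used only for this illustration):

from collections import Counter

def majorClassAlt(classList):
    # most_common(1) returns [(label, count)] for the most frequent label
    return Counter(classList).most_common(1)[0][0]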
# compute the Gini index
def calcGini(dataSet):
labelCounts = {}
    # count the occurrences of every possible class
for dt in dataSet:
currentLabel = dt[-1]
labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
Gini = 1
for key in labelCounts:
prob = labelCounts[key] / len(dataSet)
Gini -= prob * prob
return Gini
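For intuition, the Gini index is 1 - sum(p_k^2) over the class proportions p_k. A quick sanity check on a made-up four-sample dataset (the last element of each row is the label):

# 3 samples of class 'A' and 1 of class 'B':
# Gini = 1 - (3/4)**2 - (1/4)**2 = 0.375
toy = [[1.0, 'A'], [2.0, 'A'], [3.0, 'A'], [4.0, 'B']]
print(calcGini(toy))  # 0.375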
# split the dataset on a continuous feature
def splitDataSet(dataSet, featIndex, value):
leftData, rightData = [], []
for dt in dataSet:
if dt[featIndex] <= value:
leftData.append(dt)
else:
rightData.append(dt)
return leftData, rightData
# choose the best way to split the dataset
def chooseBestFeature(dataSet):
bestGini = 1
bestFeatureIndex = -1
bestSplitValue = None
    # the i-th feature
for i in range(len(dataSet[0]) - 1):
featList = [dt[i] for dt in dataSet]
        # generate the candidate split points (midpoints of adjacent feature values)
sortfeatList = sorted(list(set(featList)))
splitList = []
for j in range(len(sortfeatList) - 1):
splitList.append((sortfeatList[j] + sortfeatList[j + 1]) / 2)
        # evaluate each candidate split point and remember the best one
for splitValue in splitList:
newGini = 0
subDataSet0, subDataSet1 = splitDataSet(dataSet, i, splitValue)
newGini += len(subDataSet0) / len(dataSet) * calcGini(subDataSet0)
newGini += len(subDataSet1) / len(dataSet) * calcGini(subDataSet1)
if newGini < bestGini:
bestGini = newGini
bestFeatureIndex = i
bestSplitValue = splitValue
return bestFeatureIndex, bestSplitValue
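On a toy one-feature dataset the search behaves as expected; the values below are invented for illustration:

# the single feature separates the classes perfectly at the midpoint 2.5
toy = [[1.0, 'A'], [2.0, 'A'], [3.0, 'B'], [4.0, 'B']]
idx, split = chooseBestFeature(toy)
print(idx, split)  # 0 2.5 -- both child nodes are pure, so the weighted Gini is 0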
# drop the i-th attribute and build the new left/right datasets
def splitData(dataSet, featIndex, features, value):
newFeatures = copy.deepcopy(features)
newFeatures.remove(features[featIndex])
leftData, rightData = [], []
for dt in dataSet:
temp = []
temp.extend(dt[:featIndex])
temp.extend(dt[featIndex + 1:])
        # In experiments value sometimes turns out to be None.
        # Two likely causes (rough probability in parentheses):
        # 1. a problem in chooseBestFeature (20%): if no candidate split point
        #    exists, bestSplitValue is returned as None
        # 2. a problem in your dataset (80%)
        if value is None or dt[featIndex] <= value:
leftData.append(temp)
else:
rightData.append(temp)
return newFeatures, leftData, rightData
# build the decision tree
def createTree(dataSet, features):
classList = [dt[-1] for dt in dataSet]
    # all labels are identical, so everything goes to one side
if classList.count(classList[0]) == len(classList):
return classList[0]
    # the last feature still cannot separate all the samples, so take the majority label
if len(features) == 1:
return majorClass(classList)
bestFeatureIndex, bestSplitValue = chooseBestFeature(dataSet)
bestFeature = features[bestFeatureIndex]
    # build a new dataset with the bestFeature column removed
newFeatures, leftData, rightData = splitData(dataSet, bestFeatureIndex, features, bestSplitValue)
    # two subtrees: the left holds values <= the best split point, the right values > it
myTree = {bestFeature: {'<' + str(bestSplitValue): {}, '>' + str(bestSplitValue): {}}}
myTree[bestFeature]['<' + str(bestSplitValue)] = createTree(leftData, newFeatures)
myTree[bestFeature]['>' + str(bestSplitValue)] = createTree(rightData, newFeatures)
return myTree
# classify a test sample with the generated decision tree
def treeClassify(decisionTree, featureLabel, testDataSet):
    firstFeature = list(decisionTree.keys())[0]
secondFeatDict = decisionTree[firstFeature]
    splitValue = float(list(secondFeatDict.keys())[0][1:])
    # the keys are '<' + str(bestSplitValue) and '>' + str(bestSplitValue),
    # so slicing from index 1 strips the leading '<' or '>'
featureIndex = featureLabel.index(firstFeature)
if testDataSet[featureIndex] <= splitValue:
valueOfFeat = secondFeatDict['<' + str(splitValue)]
else:
valueOfFeat = secondFeatDict['>' + str(splitValue)]
if isinstance(valueOfFeat, dict):
pred_label = treeClassify(valueOfFeat, featureLabel, testDataSet)
else:
pred_label = valueOfFeat
return pred_label
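Putting createTree and treeClassify together on a made-up two-feature dataset (the feature names and values are invented for illustration):

toy = [[1.0, 10.0, 'A'], [2.0, 20.0, 'A'], [3.0, 30.0, 'B'], [4.0, 40.0, 'B']]
featureNames = ['x1', 'x2']
tree = createTree(toy, featureNames)
print(tree)  # {'x1': {'<2.5': 'A', '>2.5': 'B'}}
print(treeClassify(tree, featureNames, [1.5, 15.0]))  # 'A'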
# bootstrap sample: as many rows as the original training set, sqrt(m - 1) features
def baggingDataSet(dataSet):
n, m = dataSet.shape
    features = random.sample(list(dataSet.columns.values[:-1]), int(math.sqrt(m - 1)))
features.append(dataSet.columns.values[-1])
    # Don't be puzzled that the last column is appended here. You might think that
    # sampling sqrt(#features) feature columns should be enough, but this function
    # builds a new dataset, so the label column must be included as well.
rows = [random.randint(0, n-1) for _ in range(n)]
trainData = dataSet.iloc[rows][features]
    # pick the sampled rows with DataFrame.iloc, then the sampled columns by label
return trainData.values.tolist(), features
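baggingDataSet expects a pandas DataFrame whose last column is the label; a minimal sketch with random contents, for illustration only:

import numpy as np

demo = pd.DataFrame(np.random.rand(6, 5))  # 6 samples, 4 features + 1 label column
sample, cols = baggingDataSet(demo)
print(len(sample), cols)  # 6 rows; int(sqrt(4)) = 2 feature columns plus the label column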
def testWine():
df = pd.read_csv('wine.txt', header=None)
labels = df.columns.values.tolist()
df = df[df[labels[-1]] != 3]
    # build several decision trees and keep them in a list
treeCounts = 10
treeList = []
for i in range(treeCounts):
baggingData, bagginglabels = baggingDataSet(df)
decisionTree = createTree(baggingData, bagginglabels)
treeList.append(decisionTree)
    print(treeList)
    # classify a test sample
labelPred = []
for tree in treeList:
testData = [12, 0.92, 2, 19, 86, 2.42, 2.26, 0.3, 1.43, 2.5, 1.38, 3.12, 278]
label = treeClassify(tree, labels[:-1], testData)
labelPred.append(label)
    # vote for the final class (essentially majorClass() again)
labelDict = {}
for label in labelPred:
labelDict[label] = labelDict.get(label, 0) + 1
sortClass = sorted(labelDict.items(), key=lambda item: item[1])
print "The predicted label is: {}".format(sortClass[-1][0])
testWine()
This algorithm solves the two problems raised in the previous post on random forests.
How would you extend it to repeat the whole procedure ten times?
Note: that is not 10-fold cross-validation. In 10-fold cross-validation the folds are fixed in advance, so a sample that has already served as test data will never be placed in the test set again. I forgot this at first and assumed that simply looping ten times gave me 10-fold cross-validation; I only noticed the mistake after finishing.
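For contrast, here is a minimal sketch of real 10-fold cross-validation, assuming sklearn's KFold and the data/labels arrays produced by load_data below; the fold assignment is fixed up front, so every sample appears in the test set exactly once:

from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=0)
for train_idx, test_idx in kf.split(data):
    x_train, x_test = data[train_idx], data[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]
    # train the forest on (x_train, y_train) and evaluate on (x_test, y_test)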
For reference, here is an example in which I combined the code from these two posts on random forests myself. My ability is limited, so bear with it.
import scipy.io as sio
import numpy as np
import pandas as pd
import copy
import random
import math
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn versions
def load_data(filename):
    mat_data = sio.loadmat(filename)  # avoid shadowing the function name load_data
    PET = mat_data['PET']
    MRI = mat_data['MRI']
    GND4 = mat_data['GND4']
    GND3 = mat_data['GND3']
    CSF = mat_data['CSF']
dataset = np.append(PET,MRI,axis=1)
dataset = np.append(dataset,CSF,axis = 1)
dataset = np.append(dataset,GND3,axis = 1)
df = pd.DataFrame(dataset)
labels = df.columns.values.tolist()
    df = df[df[labels[-1]] != 2]  # keep only classes 1 and 3
dataset = df.iloc[:,0:189]
label = df.iloc[:,189]
dataset = np.array(dataset)
    # reshape the labels into an (n, 1) column vector so they can be
    # re-attached to the feature matrix later
    labels = np.array(label).reshape(-1, 1)
return dataset,labels
def majorClass(classList):
classDict = {}
for cls in classList:
classDict[cls] = classDict.get(cls, 0) + 1
sortClass = sorted(classDict.items(), key=lambda item: item[1])
return sortClass[-1][0]
def calcGini(dataSet):
labelCounts = {}
for dt in dataSet:
currentLabel = dt[-1]
labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
Gini = 1
for key in labelCounts:
prob = labelCounts[key] / len(dataSet)
Gini -= prob * prob
return Gini
def splitDataSet(dataSet, featIndex, value):
leftData, rightData = [], []
for dt in dataSet:
if dt[featIndex] <= value:
leftData.append(dt)
else:
rightData.append(dt)
return leftData, rightData
def chooseBestFeature(dataSet):
bestGini = 999
bestFeatureIndex = -1
bestSplitValue = None
for i in range(len(dataSet[0]) - 1):
featList = [dt[i] for dt in dataSet]
sortfeatList = sorted(list(set(featList)))
        # candidate split points: this version scans the raw feature values
        # directly instead of the midpoints used in the first version
        for splitValue in sortfeatList:
newGini = 0
subDataSet0, subDataSet1 = splitDataSet(dataSet, i, splitValue)
newGini += len(subDataSet0) / len(dataSet) * calcGini(subDataSet0)
newGini += len(subDataSet1) / len(dataSet) * calcGini(subDataSet1)
if newGini < bestGini:
bestGini = newGini
bestFeatureIndex = i
bestSplitValue = splitValue
return bestFeatureIndex, bestSplitValue
def splitData(dataSet, featIndex, features, value):
newFeatures = copy.deepcopy(features)
newFeatures.remove(features[featIndex])
leftData, rightData = [], []
for dt in dataSet:
temp = []
temp.extend(dt[:featIndex])
temp.extend(dt[featIndex + 1:])
        if value is None or dt[featIndex] <= value:
leftData.append(temp)
else:
rightData.append(temp)
return newFeatures, leftData, rightData
def createTree(dataSet, features):
classList = [dt[-1] for dt in dataSet]
if classList.count(classList[0]) == len(classList):
return classList[0]
if len(features) == 1:
return majorClass(classList)
bestFeatureIndex, bestSplitValue = chooseBestFeature(dataSet)
bestFeature = features[bestFeatureIndex]
newFeatures, leftData, rightData = splitData(dataSet, bestFeatureIndex, features, bestSplitValue)
myTree = {bestFeature: {'<' + str(bestSplitValue): {}, '>' + str(bestSplitValue): {}}}
myTree[bestFeature]['<' + str(bestSplitValue)] = createTree(leftData, newFeatures)
myTree[bestFeature]['>' + str(bestSplitValue)] = createTree(rightData, newFeatures)
return myTree
def treeClassify(decisionTree, featureLabel, testDataSet):
firstFeature = list(decisionTree.keys())[0]
secondFeatDict = decisionTree[firstFeature]
splitValue = float(list(secondFeatDict.keys())[0][1:])
featureIndex = featureLabel.index(firstFeature)
if testDataSet[featureIndex] <= splitValue:
valueOfFeat = secondFeatDict['<' + str(splitValue)]
else:
valueOfFeat = secondFeatDict['>' + str(splitValue)]
if isinstance(valueOfFeat, dict):
pred_label = treeClassify(valueOfFeat, featureLabel, testDataSet)
else:
pred_label = valueOfFeat
return pred_label
def baggingDataSet(dataSet):
n, m = dataSet.shape
features = random.sample(list(dataSet.columns.values[:-1]), int(math.sqrt(m - 1)) + 1)
features.append(dataSet.columns.values[-1])
rows = [random.randint(0, n-1) for _ in range(n)]
trainData = dataSet.iloc[rows][features]
return trainData.values.tolist(), features
def ad_vs_nc(train_set,test_set,n_trees):
    df = pd.DataFrame(train_set)  # the training set as a DataFrame
labels = df.columns.values.tolist()
df = df[df[labels[-1]] != 2]
treeCounts = n_trees
treeList = []
    for i in range(treeCounts):  # grow the forest
baggingData, bagginglabels = baggingDataSet(df)
decisionTree = createTree(baggingData, bagginglabels)
treeList.append(decisionTree)
    print(treeList, 'treeList')
predictions = []
for row in test_set:
labelPred = []
for tree in treeList:
testData = row
label = treeClassify(tree, labels[:-1], testData)
labelPred.append(label)
predictions.append(majorClass(labelPred))
return predictions
def change_y_test(y_test):
actualList = []
for i in range(len(y_test)):
actualList.extend(y_test[i])
return actualList
def accuracy_cal(actual, predicted):
print(actual,'actual')
print(predicted,'predicted')
correct = 0
for i in range(len(actual)):
if actual[i] == predicted[i]:
correct += 1
return correct / float(len(actual)) * 100.0
def evaluate_algorithm():
filename = 'f:\\ADNI202.mat'
data,labels = load_data(filename)
scores = []
n_trees = 100
for i in range(10):
        # use a different random_state each repetition; with a fixed seed all
        # ten splits would be identical
        x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.1, random_state=i)
        # sklearn helper that splits the data into training and test sets:
        # x_train/y_train are the training features and labels,
        # x_test/y_test the test features and labels
        train_set = np.append(x_train, y_train, axis=1)
train_set = pd.DataFrame(train_set)
test_set = x_test.tolist()
actual = change_y_test(y_test)
predicted = ad_vs_nc(train_set,test_set,n_trees)
accuracy = accuracy_cal(actual,predicted)
scores.append(accuracy)
print("trees {0}".format(n_trees))
print('scores{0}'.format(scores))
print('mean accuracy{0}'.format(sum(scores)/float(len(scores))))
evaluate_algorithm()