根據表5.2所示的數據,用平方誤差損失準則生成二叉迴歸樹
二叉迴歸樹的算法爲:
其中公式5.21中的c1,c2分別爲R1和R2上數據的平均值
代碼仿照《機器學習實戰》中關於決策樹的實現(遞歸地建立一棵樹),保存爲cart.py:
#coding:utf-8
import numpy as np
#數據集
def createDataSet():
    """Return the toy training set.

    Returns:
        (samples, labels): samples are the 10 target values from Table 5.2,
        labels are their 1-based positions (used as node keys in the tree).
    """
    samples = [4.5, 4.75, 4.91, 5.34, 5.8, 7.05, 7.9, 8.23, 8.7, 9]
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    return samples, labels
#計算數據集的平方誤差
def calcMSE(dataSet):
    """Total squared error of *dataSet* around its mean.

    Despite the name this is the SUM of squared deviations (not divided
    by n), which is the quantity minimized by the least-squares split
    criterion (formula 5.21).
    """
    center = np.mean(dataSet)
    deviations = np.asarray(dataSet) - center
    return 1.0 * np.sum(deviations * deviations)
#選擇最優的劃分點
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the best split point of *dataSet*.

    Tries every split i that puts dataSet[:i+1] in the left region and
    dataSet[i+1:] in the right region, and picks the i minimizing the
    summed squared error of the two regions (formula 5.21).

    Returns:
        The best split index i (0-based).  For a single-element (or
        empty) dataset there is nothing to split, so 0 is returned.
    """
    nums = len(dataSet) - 1
    if nums == 0:
        return 0
    best = 0
    # BUG FIX: was initialized to the magic constant 100000, which made the
    # function always return 0 for any data whose split error exceeds it.
    bestMES = float('inf')
    for i in range(nums):
        temp = calcMSE(dataSet[:i + 1]) + calcMSE(dataSet[i + 1:])
        # '<=' deliberately kept: on ties the LAST best index wins,
        # preserving the original behavior.
        if temp <= bestMES:
            bestMES = temp
            best = i
    return best
# def getkeyofromvalue(dataSet,value):
# u = -1
# for i in range(len(dataSet)):
# if dataSet[i] == value:
# u = i
# return u
#建樹過程
def createTree(dataSet, datalabel, left, right):
    """Recursively build the binary regression tree over [left, right).

    Each internal node is a dict keyed by the label of the chosen split
    point, with 'left'/'right' subtrees covering the samples strictly
    before / after it.

    Returns:
        A nested dict for an internal node, the bare label for a
        single-sample leaf, or -1 for an empty interval.
    """
    # Leaf: exactly one sample left in the interval.
    if right - left == 1:
        return datalabel[left]
    # Empty interval: the split point itself was consumed by the parent.
    if left >= right:
        return -1
    # chooseBestFeatureToSplit works on the slice, so shift the returned
    # index by `left` to get an index into the full dataset.
    pivot = left + chooseBestFeatureToSplit(dataSet[left:right])
    node_key = datalabel[pivot]
    subtree = {
        node_key: {
            'left': createTree(dataSet, datalabel, left, pivot),
            'right': createTree(dataSet, datalabel, pivot + 1, right),
        }
    }
    return subtree
調用方法:
import cart
mydat,myla = cart.createDataSet()
myt = cart.createTree(mydat,myla,0,len(mydat))
print myt
結果(沒有進行可視化操作):