海林老師《數據挖掘》課程作業系列
要求:自己寫R/Python代碼、函數實現一系列算法
其他參見:
全文邏輯:(讀者可將所有代碼按順序複製到RStudio,全選ctrl+A,運行ctrl+enter,查看結果)
- 分析
- 算法/函數
- 測試數據
- 測試代碼
- 測試結果(截圖)
分析:這個很難!!(難在遞歸生成樹)
#實現 輸入訓練集 輸出ID3方式得到的決策樹(列表)
#輸入:訓練集數據框(要求最後一列爲類別)
#輸出:顯示劃分屬性
#返回:嵌套列表,按劃分屬性的每個可取值各佔一個分支;
######每個分支的[[1]]爲c(屬性名,屬性取值),表示該屬性取此值時走這條分支,
######每個分支的[[2]]爲子樹(結構相同的列表);列表最裏層的[[2]]表示此路徑的最終類別
算法實現(編寫函數):
(1)生成決策樹:
# Build a decision tree with the ID3 algorithm (information gain).
#
# Args:
#   data: data frame of categorical attributes whose LAST column is the
#         class label.
#
# Side effects:
#   Prints the name of every chosen split attribute and every partition
#   produced by a split.
#
# Returns:
#   A nested list with one element per value of the split attribute. Each
#   element is list(c(attribute_name, attribute_value), subtree), where
#   subtree is either a class label (character) or another list of the same
#   shape. If the training data contains a single class, the bare class label
#   is returned. An empty data frame yields NULL.
id3_jcs <- function(data) {
  # Entropy (base 2) of the class column (last column) of `data`.
  info <- function(data) {
    count_all <- nrow(data)
    if (count_all == 0) {
      return(0)
    }
    rate <- as.numeric(table(data[[ncol(data)]])) / count_all
    # Zero-count factor levels would contribute 0 * log(0) = NaN; skip them.
    rate <- rate[rate > 0]
    -sum(rate * log(rate, 2))
  }

  # Weighted entropy of the class column after partitioning `data` by the
  # attribute in column index `col_fen`.
  info_col <- function(data, col_fen) {
    values <- as.character(data[[col_fen]])
    count_all <- nrow(data)
    result <- 0
    for (value in names(table(values))) {
      data_fen <- data[which(values == value), , drop = FALSE]
      result <- result + info(data_fen) * (nrow(data_fen) / count_all)
    }
    result
  }

  # Partition `data` by the values of column index `col_fen`.
  # Returns a list of list(value, sub_frame); the sub-frames no longer
  # contain the split column.
  split_data <- function(data, col_fen) {
    values <- as.character(data[[col_fen]])
    # Tabulate the character values so unused factor levels do not create
    # empty partitions.
    leibie <- names(table(values))
    data_fens <- vector("list", length(leibie))
    for (i in seq_along(leibie)) {
      data_fen <- data[which(values == leibie[i]), , drop = FALSE]
      data_fen <- droplevels(data_fen)
      # drop = FALSE keeps a data frame even when only the class column is
      # left; otherwise the recursion receives a bare vector and fails.
      data_fen <- data_fen[, -col_fen, drop = FALSE]
      print("------------------------------------")
      print(leibie[i])
      print("==========================================")
      print(data_fen)
      data_fens[[i]] <- list(leibie[i], data_fen)
    }
    data_fens
  }

  # Column index of the attribute with the largest information gain.
  # Only attribute columns are candidates; the class column is never chosen.
  # Ties go to the left-most attribute.
  best <- function(data) {
    if (ncol(data) <= 1) {
      return(1)
    }
    info_all <- info(data) # entropy of the parent table
    gains <- numeric(ncol(data) - 1)
    for (i in seq_along(gains)) {
      gains[i] <- info_all - info_col(data, i)
    }
    which.max(gains)
  }

  # Majority vote: most frequent class in a one-column data frame.
  # Ties go to the first class in table() order.
  majorityCnt <- function(data) {
    data_fen <- table(data[[1]])
    names(data_fen)[which.max(as.numeric(data_fen))]
  }

  # Name of the column at index `col_count`.
  col_name <- function(col_count, data) {
    names(data)[col_count]
  }

  # Recursively grow the tree.
  createTree <- function(data) {
    # Stop conditions. The empty check comes first so nothing below indexes
    # into an empty frame.
    if (nrow(data) == 0) {
      return(NULL)
    }
    labels <- data[[ncol(data)]]
    if (length(unique(labels)) == 1) {
      # Pure node. as.character keeps the leaf type consistent with the
      # majority-vote leaves below.
      return(as.character(labels[1]))
    }
    if (ncol(data) == 1) {
      # Attributes exhausted but classes still mixed: majority vote.
      return(majorityCnt(data))
    }

    # Choose the split attribute and report it.
    bestFeature <- best(data)
    feature_name <- col_name(bestFeature, data)
    print(paste0("********我是劃分屬性:", feature_name, "************"))

    # Partition on that attribute and grow a subtree per partition.
    data_fens <- split_data(data, bestFeature)
    data_fen_lists <- vector("list", length(data_fens))
    for (i in seq_along(data_fens)) {
      data_fen_label <- c(feature_name, data_fens[[i]][[1]])
      data_fen_list <- createTree(data_fens[[i]][[2]])
      data_fen_lists[[i]] <- list(data_fen_label, data_fen_list)
    }
    data_fen_lists
  }

  createTree(data)
}
(2)實現分類
# Classify one record with a tree produced by id3_jcs.
#
# Args:
#   tree: nested list of branches, each list(c(attribute_name,
#         attribute_value), subtree), or a bare class label when the tree is
#         a single leaf.
#   test: data frame; only its first row is classified. It must contain a
#         column for every attribute name used along the path followed.
#
# Returns:
#   The class label stored at the leaf reached, or NULL when no branch
#   matches the record (unseen or missing attribute value).
classify <- function(tree, test) {
  xh <- function(treelist) {
    # Anything that is not a list of branches is a leaf (also covers a tree
    # that consists of a single class label).
    if (!is.list(treelist)) {
      return(treelist)
    }
    for (branch in treelist) {
      label <- branch[[1]][1]
      # isTRUE: a missing attribute value (NA) means "no match", not an error.
      if (isTRUE(test[1, label] == branch[[1]][2])) {
        return(xh(branch[[2]]))
      }
    }
    NULL
  }
  xh(tree)
}
數據測試:
測試數據:
書上的數據
訓練數據選前13行
最後一行用於測試
# Textbook data set: 14 records, four categorical attributes plus the class
# label in the last column.
age <- rep(c("youth", "middle_aged", "senior"), times = c(5, 4, 5))
income <- c(
  "high", "high", "medium", "low", "medium",
  "high", "low", "medium", "high",
  "medium", "low", "low", "medium", "medium"
)
student <- c(
  "no", "no", "no", "yes", "yes",
  "no", "yes", "no", "yes",
  "no", "yes", "yes", "yes", "no"
)
credit_rating <- c(
  "fair", "excellent", "fair", "fair", "excellent",
  "fair", "excellent", "excellent", "fair",
  "fair", "fair", "excellent", "fair", "excellent"
)
class <- c(
  "no", "no", "no", "yes", "yes",
  "yes", "yes", "yes", "yes",
  "yes", "yes", "no", "yes", "no"
)
data <- data.frame(
  age, income, student, credit_rating, class,
  stringsAsFactors = FALSE
)

# First 13 rows train the tree; the last row is held out for testing.
train <- data[1:13, ]
test <- data[14:14, ]

# Fit on the training set; the tree is stored in xx.
xx <- id3_jcs(train)

# Classify a single test record.
test
mmm <- classify(xx, test)
mmm

# Classify several records, one row at a time.
test2 <- data[12:14, ]
test2
for (i in seq_len(nrow(test2))) {
  print(classify(xx, test2[i, ]))
}