海林老師《數據挖掘》(韓家煒書)課程作業系列
要求:自己寫R/Python代碼、函數實現一系列算法
其他參見:
全文邏輯:
- 分析
- 算法/函數
- 測試數據
- 測試代碼
- 測試結果(截圖)
分析:
#輸入:訓練集(數據框,要求其最後一列爲類別)和單個待分類的測試樣本(向量,各屬性值的順序與訓練集的屬性列一致)
#輸出:print測試樣本在每個類別下的概率值,即 P(X|Ci)·P(Ci)(未歸一化)
#返回:測試樣本的預測類別(即上述概率值最大的類別)
算法實現(編寫函數):
# Naive Bayes classifier for categorical attributes.
#
# Args:
#   data:    training data frame. The LAST column holds the class label
#            (whatever its name); the preceding columns are the attributes.
#   test:    one sample to classify: a vector (or list) of attribute values
#            in the same order as the attribute columns of `data`. It may be
#            shorter than the number of attribute columns, in which case only
#            the leading attributes are used. NA entries are skipped, i.e.
#            they contribute a factor of 1 to every class.
#   laplace: non-negative pseudo-count added to every attribute-value count
#            when estimating P(x_j | C). The default 0 gives the plain
#            relative-frequency estimate.
#
# Side effects:
#   Prints the class names, then the unnormalised posterior scores
#   P(C) * prod_j P(x_j | C) in the same order.
#
# Returns:
#   The name (character) of the class with the highest score. Ties go to the
#   class that is listed first.
bys <- function(data, test, laplace = 0) {
  stopifnot(is.data.frame(data), ncol(data) >= 1L, nrow(data) > 0L)
  stopifnot(is.numeric(laplace), length(laplace) == 1L, laplace >= 0)
  if (length(test) > ncol(data) - 1L) {
    # Otherwise the label column itself would be used as an attribute.
    stop("`test` has more values than `data` has attribute columns",
         call. = FALSE)
  }

  # The label is always the last column; never rely on it being named "class".
  raw_labels <- data[[ncol(data)]]

  # table() lists the distinct labels in sorted / factor-level order and
  # ignores NA labels. Levels without any training row are dropped because
  # no probability can be estimated for them.
  class_counts <- table(raw_labels)
  classname <- names(class_counts)[as.vector(class_counts) > 0]
  if (length(classname) == 0L) {
    stop("`data` contains no non-missing class labels", call. = FALSE)
  }
  print(classname)

  labels <- as.character(raw_labels)
  n_labelled <- sum(class_counts)

  p_cs <- numeric(length(classname))
  for (i in seq_along(classname)) {
    # Logical mask instead of row indexing, so NA labels never count as hits.
    in_class <- !is.na(labels) & labels == classname[i]
    n_class <- sum(in_class)

    # P(X | C_i) under the conditional-independence assumption.
    likelihood <- 1
    for (j in seq_along(test)) {
      value <- test[[j]]
      if (anyNA(value)) {
        next
      }
      column <- data[[j]]
      n_match <- sum(column[in_class] == value, na.rm = TRUE)
      # Number of distinct observed values of attribute j; it only affects
      # the result when laplace > 0.
      n_levels <- length(unique(column[!is.na(column)]))
      likelihood <- likelihood *
        (n_match + laplace) / (n_class + laplace * n_levels)
    }

    # Prior P(C_i) times likelihood.
    p_cs[i] <- (n_class / n_labelled) * likelihood
  }
  print(p_cs)

  # which.max() returns the first maximum, so ties resolve to the first class.
  classname[which.max(p_cs)]
}
數據測試:
測試數據:
如下
# ---- Training set ----------------------------------------------------------
# 14 labelled samples with four categorical attributes. `class` is placed
# last in the data frame because bys() takes the last column as the label.
age <- c(
  "youth", "youth", "youth", "youth", "youth",
  "middle_aged", "middle_aged", "middle_aged", "middle_aged",
  "senior", "senior", "senior", "senior", "senior"
)
income <- c(
  "high", "high", "medium", "low", "medium",
  "high", "low", "medium", "high",
  "medium", "low", "low", "medium", "medium"
)
student <- c(
  "no", "no", "no", "yes", "yes",
  "no", "yes", "no", "yes",
  "no", "yes", "yes", "yes", "no"
)
credit_rating <- c(
  "fair", "excellent", "fair", "fair", "excellent",
  "fair", "excellent", "excellent", "fair",
  "fair", "fair", "excellent", "fair", "excellent"
)
class <- c(
  "no", "no", "no", "yes", "yes",
  "yes", "yes", "yes", "yes",
  "yes", "yes", "no", "yes", "no"
)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
train <- data.frame(age, income, student, credit_rating, class,
                    stringsAsFactors = FALSE)

# ---- Test sample -----------------------------------------------------------
# A single unlabelled sample (missing values are not considered here); the
# values are given in the same order as the attribute columns of `train`.
test <- c("youth", "medium", "yes", "fair")
xx <- bys(train, test)