海林老师《数据挖掘》(韩家炜书)课程作业系列
要求:自己写R/Python代码、函数实现一系列算法
其他参见:
全文逻辑:
- 分析
- 算法/函数
- 测试数据
- 测试代码
- 测试结果(截图)
分析:
#输入训练集(数据框,要求其最后一列为类别)和测试集(向量)
#输出:print出各类别名称,以及测试样本在每个类下的概率值(未归一化的 P(C)·∏P(x_j|C))
#返回:测试样本的预测类别(即上述概率值最大的类别)
算法实现(编写函数):
# Naive Bayes classifier for categorical attributes.
#
# Args:
#   data: training data frame. The last column holds the class label (its
#         column name does not matter). Column j is compared with test[j], so
#         the attribute columns must come first, in the same order as `test`.
#   test: vector of attribute values for the single sample to classify.
#
# Side effects:
#   Prints the class names, then the unnormalised score
#   P(C) * prod_j P(x_j | C) of each class, in the same order.
#
# Returns:
#   The name of the class with the highest score (the first one on ties), or
#   character(0) when `data` has no labelled rows.
bys <- function(data, test) {
  stopifnot(is.data.frame(data))

  # The label is always taken from the last column, never looked up by name,
  # so the prior and the per-class split agree with each other.
  labels <- data[[ncol(data)]]
  n_total <- nrow(data)

  # Distinct class names, in the order produced by table().
  classname <- names(table(labels))
  print(classname)

  # Score of one class: prior times the product of the conditional
  # frequencies of each test attribute value within that class.
  score_class <- function(cls) {
    # Rows with a missing label belong to no class.
    in_class <- !is.na(labels) & labels == cls
    n_class <- sum(in_class)
    prior <- n_class / n_total

    likelihood <- 1
    for (j in seq_along(test)) {
      value <- test[[j]]
      # A missing test attribute carries no evidence; leave it out of the
      # product instead of letting NA comparisons distort the counts.
      if (is.na(value)) {
        next
      }
      n_match <- sum(data[[j]][in_class] == value, na.rm = TRUE)
      # Plain relative frequency (no smoothing): a value never seen in this
      # class drives the whole score to zero.
      likelihood <- likelihood * (n_match / n_class)
    }
    prior * likelihood
  }

  p_cs <- vapply(classname, score_class, numeric(1), USE.NAMES = FALSE)
  print(p_cs)

  if (length(p_cs) == 0L) {
    return(character(0))
  }
  # which.max() returns the first maximum, i.e. ties go to the earlier class.
  classname[which.max(p_cs)]
}
数据测试:
测试数据:
如下
######################################## Training set
# 14 labelled samples with four categorical attributes; `class` is the label
# and is placed last in the data frame, as bys() expects.
age <- c(
  "youth", "youth", "youth", "youth", "youth",
  "middle_aged", "middle_aged", "middle_aged", "middle_aged",
  "senior", "senior", "senior", "senior", "senior"
)
income <- c(
  "high", "high", "medium", "low", "medium",
  "high", "low", "medium", "high",
  "medium", "low", "low", "medium", "medium"
)
student <- c(
  "no", "no", "no", "yes", "yes",
  "no", "yes", "no", "yes",
  "no", "yes", "yes", "yes", "no"
)
credit_rating <- c(
  "fair", "excellent", "fair", "fair", "excellent",
  "fair", "excellent", "excellent", "fair",
  "fair", "fair", "excellent", "fair", "excellent"
)
class <- c(
  "no", "no", "no", "yes", "yes",
  "yes", "yes", "yes", "yes",
  "yes", "yes", "no", "yes", "no"
)
train <- data.frame(
  age, income, student, credit_rating, class,
  stringsAsFactors = FALSE
)
################################################# Test sample
# Test sample: missing values are not considered here.
# Values are given in the same order as the attribute columns of `train`.
test <- c("youth", "medium", "yes", "fair")
xx <- bys(train, test)