R version: 3.6.1
randomForest package: provides the randomForest() function for building random forests
rfImpute(): imputes missing values in the data
treesize(): reports the number of nodes in each decision tree of the model
importance(): extracts the importance of each variable in the model
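A minimal sketch tying these functions together on the built-in iris data (the object name fit is illustrative, not from the walkthrough below):

library(randomForest)
set.seed(1)
fit <- randomForest(Species ~ ., data = iris, importance = TRUE)
importance(fit)  # importance of each predictor
treesize(fit)    # node count of every tree in the forest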
> setwd('G:\\R語言\\大三下半年\\數據挖掘:R語言實戰\\')
> library('randomForest')
> set.seed(4)
> data(mtcars)
> mtcars.rf=randomForest(mpg~.,data=mtcars,ntree=1000,importance=TRUE)
#Extract the variable importance values from the model
> importance(mtcars.rf)
       %IncMSE IncNodePurity
cyl  16.151445     154.16459
disp 18.833040     255.10218
hp   18.641110     201.42227
drat  6.343488      65.96680
wt   19.987072     247.29443
qsec  4.656151      30.95240
vs    5.627916      27.14099
am    4.064642      15.18171
gear  5.825897      20.12545
carb  9.383633      31.03605
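For a regression forest, %IncMSE is the permutation-based increase in mean squared error and IncNodePurity the total decrease in node impurity from splits on that variable. Either measure can be requested alone through the type argument; a sketch:

importance(mtcars.rf, type = 1)  # permutation importance (%IncMSE) only
importance(mtcars.rf, type = 2)  # node-impurity importance only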
> #MDSplot() plots the proximity-based multidimensional scaling coordinates
> set.seed(1)
> data(iris)
> iris.rf=randomForest(Species~.,iris,proximity=T)
> MDSplot(iris.rf,iris$Species,palette=rep(1,3),pch=as.numeric(iris$Species))
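MDSplot() works from the proximity matrix computed during fitting (hence proximity=T above). Roughly the same picture can be reproduced by hand with cmdscale(); a sketch:

prox <- iris.rf$proximity            # pairwise case proximities from the forest
coords <- cmdscale(1 - prox, k = 2)  # classical MDS on 1 - proximity
plot(coords, col = as.numeric(iris$Species), pch = as.numeric(iris$Species))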
#rfImpute() can impute the missing values.
> data("iris")
> iris.na=iris
> iris.na[75,2]=NA;iris.na[125,3]=NA;
> set.seed(111)
> iris.imputed=rfImpute(Species~.,data=iris.na)
ntree      OOB      1      2      3
  300:   4.67%  0.00%  6.00%  8.00%
ntree      OOB      1      2      3
  300:   4.67%  0.00%  6.00%  8.00%
ntree      OOB      1      2      3
  300:   4.00%  0.00%  6.00%  6.00%
ntree      OOB      1      2      3
  300:   4.67%  0.00%  6.00%  8.00%
ntree      OOB      1      2      3
  300:   4.67%  0.00%  6.00%  8.00%
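Five OOB summaries appear because rfImpute() alternates imputation and refitting iter = 5 times by default, growing ntree = 300 trees each round. Both can be changed; a sketch (iris.imputed2 is an illustrative name):

iris.imputed2 <- rfImpute(Species ~ ., data = iris.na, iter = 10, ntree = 500)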
#The imputed values turn out to be very close to the actual values
> list("real"=iris[c(75,125),1:4],"have-NA"=iris.na[c(75,125),1:4],
+ "disposed"=round(iris.imputed[c(75,125),2:5],1))
$real
    Sepal.Length Sepal.Width Petal.Length Petal.Width
75           6.4         2.9          4.3         1.3
125          6.7         3.3          5.7         2.1

$`have-NA`
    Sepal.Length Sepal.Width Petal.Length Petal.Width
75           6.4          NA          4.3         1.3
125          6.7         3.3           NA         2.1

$disposed
    Sepal.Length Sepal.Width Petal.Length Petal.Width
75           6.4         2.8          4.3         1.3
125          6.7         3.3          5.6         2.1
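The package also provides na.roughfix(), which simply fills numeric NAs with column medians; it is cruder but far cheaper than rfImpute(). A sketch:

iris.rough <- na.roughfix(iris.na)  # median (numeric) / mode (factor) imputation
iris.rough[c(75, 125), ]            # inspect the two filled rows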
#treesize() reports the number of nodes in each decision tree of the forest
> iris.rf<-randomForest(Species~.,iris)
> hist(treesize(iris.rf))
#Visual analysis
> data(airquality)
> set.seed(131)
> ozone.rf=randomForest(Ozone~.,data=airquality,mtry=3,importance=T,
+ na.action=na.omit)
> plot(ozone.rf) #the model error barely changes after about 210 trees
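For a regression forest, the OOB error after each additional tree is stored in the mse component, so the impression from the plot can be checked numerically; a sketch:

which.min(ozone.rf$mse)  # number of trees giving the lowest OOB MSE
min(ozone.rf$mse)        # the error at that point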
############## Case study #################
> wine=read.csv("G:\\R語言\\大三下半年\\數據挖掘:R語言實戰\\數據挖掘:R語言實戰(案例數據集)\\11 隨機森林\\winequality-white.csv",header=T,sep = ";",na.strings="null")
> summary(wine)
 fixed.acidity    volatile.acidity  citric.acid      residual.sugar
 Min.   : 3.800   Min.   :0.0800    Min.   :0.0000   Min.   : 0.600
 1st Qu.: 6.300   1st Qu.:0.2100    1st Qu.:0.2700   1st Qu.: 1.700
 Median : 6.800   Median :0.2600    Median :0.3200   Median : 5.200
 Mean   : 6.855   Mean   :0.2782    Mean   :0.3342   Mean   : 6.391
 3rd Qu.: 7.300   3rd Qu.:0.3200    3rd Qu.:0.3900   3rd Qu.: 9.900
 Max.   :14.200   Max.   :1.1000    Max.   :1.6600   Max.   :65.800
   chlorides       free.sulfur.dioxide total.sulfur.dioxide
 Min.   :0.00900   Min.   :  2.00      Min.   :  9.0
 1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0
 Median :0.04300   Median : 34.00      Median :134.0
 Mean   :0.04577   Mean   : 35.31      Mean   :138.4
 3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0
 Max.   :0.34600   Max.   :289.00      Max.   :440.0
    density             pH           sulphates         alcohol
 Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00
 1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50
 Median :0.9937   Median :3.180   Median :0.4700   Median :10.40
 Mean   :0.9940   Mean   :3.188   Mean   :0.4898   Mean   :10.51
 3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40
 Max.   :1.0390   Max.   :3.820   Max.   :1.0800   Max.   :14.20
    quality
 Min.   :3.000
 1st Qu.:5.000
 Median :6.000
 Mean   :5.878
 3rd Qu.:6.000
 Max.   :9.000
#Set up an intermediate variable to hold the recoded values temporarily
> cha=0
> for(i in 1:4898)#recode the quality score of every sample
+ {
+ if(wine[i,12]>6)cha[i]="good"
+ else if(wine[i,12]>5)cha[i]="mid"
+ else cha[i]="bad"
+ }
> wine[,12]=factor(cha)#convert the character vector to a factor and assign it back to wine
> summary(wine$quality)
 bad good  mid
1640 1060 2198
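As an aside, the loop above can be replaced by a single vectorized call to cut(), applied to the original numeric scores; a sketch with equivalent breakpoints:

# (-Inf,5] -> bad, (5,6] -> mid, (6,Inf) -> good
wine$quality <- cut(wine$quality, breaks = c(-Inf, 5, 6, Inf),
                    labels = c("bad", "mid", "good"))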
#Using the first interface (formula format)
> set.seed(71)#initialize the random-number generator
> samp=sample(1:4898,3000)
> set.seed(111)
> wine.rf=randomForest(quality~.,data=wine,importance=TRUE,proximity=TRUE,ntree=500,subset=samp)#build a random forest model with 500 decision trees
#Using the second interface (separate x and y)
> x=subset(wine,select=-quality)#predictors: every column except quality
> y=wine$quality#response variable: quality
> set.seed(71)
> samp=sample(1:4898,3000)
> xr=x[samp,];yr=y[samp]
> set.seed(111)
> wine.rf=randomForest(xr,yr,importance=TRUE,proximity=TRUE,ntree=500)
#Print the model
#Type of random forest: classification, i.e. this is a classification model
#Number of trees: the forest contains 500 decision trees
#No. of variables tried at each split: 3 variables are considered at each node split
#OOB estimate of error rate: the model's out-of-bag error is 30.57%
> print(wine.rf)
Call:
randomForest(x = xr, y = yr, ntree = 500, importance = TRUE, proximity = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3
OOB estimate of error rate: 30.57%
Confusion matrix:
     bad good mid class.error
bad  697   21 283   0.3036963
good  12  392 238   0.3894081
mid  227  136 994   0.2675018
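Because only the 3000 sampled rows were used for training, the remaining rows give a quick hold-out check; a sketch, assuming samp, x and y from above are still in scope:

pred <- predict(wine.rf, x[-samp, ])  # classify the held-out samples
table(pred, y[-samp])                 # hold-out confusion matrix
mean(pred == y[-samp])                # hold-out accuracy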
#Extract the variable-importance table from the model
> importance(wine.rf)
                          bad     good      mid MeanDecreaseAccuracy
fixed.acidity        38.39220 38.04701 28.23129             54.35687
volatile.acidity     60.62095 57.14363 47.73287             81.72092
citric.acid          33.95011 37.38719 27.88769             46.44210
residual.sugar       31.46575 36.43637 35.79771             55.19329
chlorides            44.52846 46.47096 27.53577             56.75493
free.sulfur.dioxide  45.25569 41.41413 34.22959             64.04598
total.sulfur.dioxide 34.41994 41.26107 27.18189             50.61239
density              31.79573 42.52763 32.02053             53.63531
pH                   33.29907 44.35856 26.93746             51.25421
sulphates            31.49062 36.61428 28.59253             50.10378
alcohol              66.08129 66.88405 32.11592             82.88410
                     MeanDecreaseGini
fixed.acidity                138.8276
volatile.acidity             195.5095
citric.acid                  149.4469
residual.sugar               163.2376
chlorides                    169.0678
free.sulfur.dioxide          178.9195
total.sulfur.dioxide         170.5083
density                      204.5551
pH                           158.8621
sulphates                    145.9429
alcohol                      240.0514
> varImpPlot(wine.rf, main = "variable importance")
#Tune the model
> mtry_error=0
> for(i in 1:(ncol(wine)-1))
+ {
+ set.seed(100)
+ newModel=randomForest(quality~.,data=wine,mtry=i,importance=TRUE,ntree=1000)
+ mtry_error[i]=mean(newModel$err.rate)
+ }
> mtry_error
[1] 0.2641172 0.2644469 0.2655955 0.2648887 0.2635614 0.2658425 0.2640577
[8] 0.2657842 0.2668288 0.2663740 0.2700658
#From the results above, the error rate is smallest when mtry = 5
> plot(mtry_error,xlab = "mtry",ylab = "error",type="l")
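The best mtry can also be read off programmatically instead of from the plot; given the mtry_error values above, this should return 5:

which.min(mtry_error)  # index of the smallest error, i.e. the best mtry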
> set.seed(222)
> Model=randomForest(quality~.,data=wine,mtry=5,importance=TRUE,ntree=1000)
> plot(Model)#plot model error against the number of decision trees
#From the plot above, 400 trees with mtry = 5 is sufficient
> set.seed(222)
> Model=randomForest(quality~.,data=wine,mtry=5,importance=TRUE,ntree=400)
> print(Model)
Call:
randomForest(formula = quality ~ ., data = wine, mtry = 5, importance = TRUE, ntree = 400)
Type of random forest: classification
Number of trees: 400
No. of variables tried at each split: 5
OOB estimate of error rate: 25.36%
Confusion matrix:
      bad good  mid class.error
bad  1227   16  397   0.2518293
good   25  703  332   0.3367925
mid   294  178 1726   0.2147407
> hist(treesize(Model))#histogram of the node counts of the trees in the forest
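Finally, the tuned model can score new wines. A sketch using the first row of wine as a stand-in for unseen data (newwine is an illustrative name; newdata must contain the same predictor columns):

newwine <- wine[1, -12]                           # pretend this is an unseen sample
predict(Model, newdata = newwine)                 # predicted quality class
predict(Model, newdata = newwine, type = "prob")  # per-class vote fractions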