注:本文不涉及決策樹理論部分,若有興趣請移步☞https://blog.csdn.net/weixin_43462348/article/details/101975391☜
步驟概覽:
R實現
0 加載所需包
library(rpart)
library(rattle) # 畫圖工具
library(RColorBrewer) # 調色板
1 導入數據+瞭解數據
# -- Load data and first inspection --------------------------------
# NOTE(review): setwd() in scripts is discouraged; prefer relative
# paths so the script is portable.
setwd("E:/r") # data file lives in this directory
data<- read.csv("taitanic.csv",stringsAsFactors=F,na.strings="")
View(data) # eyeball the table; confirm the import parsed correctly
head(data) # first few rows
str(data) # structure: column types and sample values
library(Hmisc)
describe(data)
'Name'變量有891個不同取值,考慮無實際用途;'Ticket'票號沒有實際意義;Age和Cabin存在缺失值,其中Cabin缺失值超過50%,考慮刪除;Embarked僅兩個缺失值,考慮直接刪除所在行。
2 數據預處理
①刪除上述無用變量
library(dplyr)
# Drop variables with no predictive value: passenger name, ticket
# number, and Cabin (more than 50% missing).
keep_cols <- setdiff(names(data), c("Name", "Ticket", "Cabin"))
data <- data[, keep_cols]
②缺失值處理
library(VIM)
aggr(data, plot=T) # visualize the per-variable missingness pattern
# Mean-impute Age (the only remaining column with many NAs)
data[is.na(data$Age), "Age"] <- mean(data$Age, na.rm = TRUE)
# Drop the few remaining incomplete rows (two missing Embarked values).
# Fix: the original ran complete.cases() twice back to back; one pass
# already removes every incomplete row.
data <- data[complete.cases(data), ]
處理後數據查看:
③數據轉換
# Cast the categorical columns to factors so rpart treats them as
# class labels / nominal predictors rather than numbers or strings.
factor_cols <- c("Survived", "Pclass", "Sex", "Embarked")
data[factor_cols] <- lapply(data[factor_cols], as.factor)
str(data)
3 數據分割
library(caret)
set.seed(2019)
# Stratified 70/30 split on the outcome variable
train_idx <- createDataPartition(data$Survived, p = 0.7, list = FALSE)
train <- data[train_idx, ]
test <- data[-train_idx, ]
4 建模
set.seed(2020)
# Baseline CART classification tree with Gini splitting
model1 <- rpart(Survived ~ ., train, method = "class",
                parms = list(split = "gini"))
model1
fancyRpartPlot(model1)
# Class predictions on the hold-out set
p1 <- predict(model1, test, type = "class")
p1
# Accuracy = trace of the confusion matrix / total observations
conf_mat <- as.matrix(table(p1, test$Survived))
acc <- sum(diag(conf_mat)) / sum(conf_mat)
acc # 0.8195489
5 調參
# Fit a CART tree with the given control parameters and return the
# hold-out accuracy.
#
# Args:
#   min_bucket: minimum observations per terminal node (minbucket).
#   max_depth:  maximum tree depth (maxdepth).
#   com:        complexity parameter (cp).
#   train_data: training set; defaults to the global `train` so all
#               existing 3-argument calls keep working.
#   test_data:  evaluation set; defaults to the global `test`.
# Returns: proportion of test rows classified correctly.
accuracy <- function(min_bucket, max_depth, com,
                     train_data = train, test_data = test) {
  model <- rpart(Survived ~ ., data = train_data, method = "class",
                 control = rpart.control(minbucket = min_bucket,
                                         maxdepth = max_depth,
                                         cp = com))
  pred <- predict(model, test_data, type = "class")
  conf_mat <- as.matrix(table(pred, test_data$Survived))
  # Last expression is the return value (no explicit return() needed)
  sum(diag(conf_mat)) / sum(conf_mat)
}
# Hyper-parameter search over maxdepth x minbucket x random cp values.
# Fixes vs. the original:
#  * cp candidates are drawn ONCE under a seed — the original called
#    runif() inside the loop with no seed, so results were not
#    reproducible and each (i, j) pair saw different cp values;
#  * strict improvement (>) instead of >=, and no print('NULL') spam
#    on every non-improving iteration;
#  * uses its own variable names instead of clobbering the earlier
#    `acc` accuracy result;
#  * the best parameter set is kept in `best_params`.
set.seed(2021)
cp_grid <- round(runif(50, 0, 0.1), 4)
accu <- 0
best_params <- c(minbucket = NA, maxdepth = NA, cp = NA)
for (depth in 1:10) {
  for (bucket in seq(1, 50, 5)) {
    for (cp in cp_grid) {
      current <- accuracy(bucket, depth, cp)
      if (current > accu) {
        accu <- current
        best_params <- c(minbucket = bucket, maxdepth = depth, cp = cp)
        print(paste("maxdepth:", depth, "minbucket:", bucket, "cp:", cp))
        print(accu)
      }
    }
  }
}
best_params
accu
accuracy(6, 10, 0.0036) # 0.84586 with the reported best parameters
最終得到最佳accuracy的參數:maxdepth=10,minbucket=6,cp=0.0036,accuracy值達到0.84586。
注:本文調參代碼爲本人原創,缺點在於for循環嵌套時間複雜度過大,如果有BUG或其他方法的小夥伴歡迎私信一起討論!
python實現
0 導入所需庫
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
1 導入數據並查看
# Load the Titanic data set (file name kept exactly as provided)
data = pd.read_csv("taitanic.csv")
data.info()  # column dtypes and non-null counts
data.head()  # first-rows sanity check
2 數據預處理
# -- Preprocessing -------------------------------------------------
# (1) Remove features with no predictive value
data.drop(["Name", "Cabin", "Ticket"], inplace=True, axis=1)
data.head()
# (2) Missing values: mean-impute Age, then drop the rare rows that
#     still contain NAs (missing Embarked)
mean_age = data["Age"].mean()
data["Age"] = data["Age"].fillna(mean_age)
data = data.dropna()
data.info()
# (3) Encode the categorical text columns as small integers
data["Embarked"].unique().tolist()
data["Embarked"] = data["Embarked"].map({'S': 0, 'C': 1, 'Q': 2})
data["Sex"].unique().tolist()
data["Sex"] = data["Sex"].map({'female': 0, 'male': 1})
3 數據分割
# Split predictors / target, then a 70/30 train-test split.
# Fix: random_state added so the split (and every downstream score)
# is reproducible — the original omitted it, which the author notes
# made results vary between runs.
y = data["Survived"]
x = data.drop(["Survived"], inplace=False, axis=1)
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=17
)
# Reset row indices so each partition is indexed 0..n-1
for part in [xtrain, xtest, ytrain, ytest]:
    part.index = range(part.shape[0])
x.head()
4 模型建立
# Baseline decision tree: fit on the training split, score hold-out
clf = DecisionTreeClassifier(random_state=17)
clf = clf.fit(xtrain, ytrain)
holdout = clf.score(xtest, ytest)
holdout
# 10-fold cross-validated accuracy on the full data set
score = cross_val_score(clf, x, y, cv=10).mean()
score  # 0.7425051072522983
5 調參
# Grid search over tree hyper-parameters with 10-fold CV.
# Fixes vs. the original:
#  * the parameter dict was named `Parameters` but GridSearchCV was
#    called with undefined lowercase `parameters` -> NameError;
#  * the freshly-created estimator (`Clf`) was never passed in; the
#    earlier `clf` was used instead. Here one name is used throughout.
parameters = {
    "criterion": ('gini', 'entropy'),
    "splitter": ('best', 'random'),
    "max_depth": [*range(1, 10)],
    "min_samples_leaf": [*range(1, 50, 5)],
    "min_impurity_decrease": [*np.linspace(0, 0.5, 50)],
}
clf = DecisionTreeClassifier(random_state=17)
GS = GridSearchCV(clf, parameters, cv=10)
GS = GS.fit(xtrain, ytrain)
GS.best_params_
GS.best_score_  # 0.8183279742765274
(由於沒有全部設置隨機種子,每次結果有所不同,昨天跑到了0.83但是費時太多)綜上所述,R調參費時較少且最優accuracy達到0.84。
注:粗略計算了下,python中調參用時20mins(包含交叉驗證),R中2mins(無交叉驗證)。
PS.若有問題歡迎討論指正!