关注微信公众号:小程在线
关注CSDN博客:程志伟的博客
详细内容为 《R语言游戏数据分析与挖掘》第五章学习笔记之数据清洗
5.3 数据转换
5.4 哑变量处理
# 5.3.1 Creating derived variables
> # Load the raw player table. na.strings expects a character vector, so the
> # missing-value marker is spelled "NA" instead of the logical constant NA;
> # blank fields in numeric columns are read as NA as well.
> rawdata <- read.csv("数据转换数据.csv", na.strings = "NA")
> # Show the first six rows
> head(rawdata)
playerid registration firstpaydate days lifetime
1 1001984428 20160408 NA 4 101
2 1002360742 20160407 20160407 12 16
3 1003943907 20160423 NA 1 1
4 100500571 20160406 20160407 10 101
5 1005541598 20160414 NA 1 1
6 1007334849 20160426 NA 2 2
# Convert the registration date (stored as a yyyymmdd number) to Date.
# Parsing the digits directly with "%Y%m%d" replaces the substr()/paste()
# round trip, which also built the string "NA/NA/NA" for missing values.
> rawdata$registration <- as.Date(as.character(rawdata$registration),
+                                 format = "%Y%m%d")
# Convert the first-payment date the same way; missing values stay NA.
> rawdata$firstpaydate <- as.Date(as.character(rawdata$firstpaydate),
+                                 format = "%Y%m%d")
# Show the first six rows
> head(rawdata)
playerid registration firstpaydate days lifetime
1 1001984428 2016-04-08 <NA> 4 101
2 1002360742 2016-04-07 2016-04-07 12 16
3 1003943907 2016-04-23 <NA> 1 1
4 100500571 2016-04-06 2016-04-07 10 101
5 1005541598 2016-04-14 <NA> 1 1
6 1007334849 2016-04-26 <NA> 2 2
# ispay flag: 1 for players with a recorded first payment, 0 otherwise
> rawdata$ispay <- as.numeric(!is.na(rawdata$firstpaydate))
# isnewpay flag: 1 when the first payment fell on the registration day, else 0.
# A missing date makes the comparison NA; `%in% TRUE` maps that case to 0.
> rawdata$isnewpay <- as.numeric(
+   (rawdata$registration == rawdata$firstpaydate) %in% TRUE)
# Show the first six rows
> head(rawdata)
playerid registration firstpaydate days lifetime ispay isnewpay
1 1001984428 2016-04-08 <NA> 4 101 0 0
2 1002360742 2016-04-07 2016-04-07 12 16 1 1
3 1003943907 2016-04-23 <NA> 1 1 0 0
4 100500571 2016-04-06 2016-04-07 10 101 1 0
5 1005541598 2016-04-14 <NA> 1 1 0 0
6 1007334849 2016-04-26 <NA> 2 2 0 0
# 5.3.2 Data binning
> # Bin both columns with cut(). Intervals are right-closed (right = TRUE), so
> # each break value belongs to the bin that ends at it, and a value of 0 falls
> # outside the first bin and becomes NA.
> # Bin days (number of active days)
> rawdata[["days_interval"]] <- cut(
+   rawdata[["days"]],
+   breaks = c(0, 30, 60, 90, Inf),
+   labels = c('一个月内', '31~60天', '61~90天', '三个月以上'),
+   right = TRUE)
> # Bin lifetime (life cycle)
> # NOTE(review): the (7, 21] bin is labelled '小于两周' (under two weeks) although
> # 21 days is three weeks -- confirm whether the break or the label is intended.
> rawdata[["lifetime_interval"]] <- cut(
+   rawdata[["lifetime"]],
+   breaks = c(0, 7, 21, 30, 90, Inf),
+   labels = c('小于一周', '小于两周', '小于一个月',
+              '小于三个月', '三个月以上'),
+   right = TRUE)
# Show the first six rows
> head(rawdata)
playerid registration firstpaydate days lifetime ispay isnewpay
1 1001984428 2016-04-08 <NA> 4 101 0 0
2 1002360742 2016-04-07 2016-04-07 12 16 1 1
3 1003943907 2016-04-23 <NA> 1 1 0 0
4 100500571 2016-04-06 2016-04-07 10 101 1 0
5 1005541598 2016-04-14 <NA> 1 1 0 0
6 1007334849 2016-04-26 <NA> 2 2 0 0
days_interval lifetime_interval
1 一个月内 三个月以上
2 一个月内 小于两周
3 一个月内 小于一周
4 一个月内 三个月以上
5 一个月内 小于一周
6 一个月内 小于一周
# 5.3.3 Data standardization
# preProcess() comes from caret, so the package has to be attached before use
> library(caret)
# Default preProcess() centers and scales, (x - mean) / sd, matching scale()
> standard <- preProcess(iris)
> head(predict(standard, iris))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 -0.8976739 1.01560199 -1.335752 -1.311052 setosa
2 -1.1392005 -0.13153881 -1.335752 -1.311052 setosa
3 -1.3807271 0.32731751 -1.392399 -1.311052 setosa
4 -1.5014904 0.09788935 -1.279104 -1.311052 setosa
5 -1.0184372 1.24503015 -1.335752 -1.311052 setosa
6 -0.5353840 1.93331463 -1.165809 -1.048667 setosa
> # Cross-check with base scale() on the four numeric columns
> head(scale(iris[1:4]))
Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,] -0.8976739 1.01560199 -1.335752 -1.311052
[2,] -1.1392005 -0.13153881 -1.335752 -1.311052
[3,] -1.3807271 0.32731751 -1.392399 -1.311052
[4,] -1.5014904 0.09788935 -1.279104 -1.311052
[5,] -1.0184372 1.24503015 -1.335752 -1.311052
[6,] -0.5353840 1.93331463 -1.165809 -1.048667
# Min-max scaling, (x - min(x)) / (max(x) - min(x)), via method = 'range'
> standard <- preProcess(x = iris, method = 'range')
> head(predict(object = standard, newdata = iris))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 0.22222222 0.6250000 0.06779661 0.04166667 setosa
2 0.16666667 0.4166667 0.06779661 0.04166667 setosa
3 0.11111111 0.5000000 0.05084746 0.04166667 setosa
4 0.08333333 0.4583333 0.08474576 0.04166667 setosa
5 0.19444444 0.6666667 0.06779661 0.04166667 setosa
6 0.30555556 0.7916667 0.11864407 0.12500000 setosa
> # Hand-written min-max scaling: shift by the minimum, divide by the span
> fun <- function(x) {
+   lo <- min(x)
+   span <- max(x) - lo
+   (x - lo) / span
+ }
> # Apply it to each numeric column of iris and show the first six rows
> head(sapply(iris[, 1:4], FUN = fun))
Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,] 0.22222222 0.6250000 0.06779661 0.04166667
[2,] 0.16666667 0.4166667 0.06779661 0.04166667
[3,] 0.11111111 0.5000000 0.05084746 0.04166667
[4,] 0.08333333 0.4583333 0.08474576 0.04166667
[5,] 0.19444444 0.6666667 0.06779661 0.04166667
[6,] 0.30555556 0.7916667 0.11864407 0.12500000
# 5.4 Dummy-variable encoding
> # Build the customers data set. stringsAsFactors = TRUE keeps gender and mood
> # as factors on R >= 4.0, where data.frame() no longer converts strings to
> # factors by default.
> customers <- data.frame(id = c(10, 20, 30, 40, 50),
+                         gender = c("male", "female", "female", "male", "female"),
+                         mood = c("happy", "sad", "happy", "sad", "happy"),
+                         outcome = c(1, 1, 0, 0, 0),
+                         stringsAsFactors = TRUE)
> customers
id gender mood outcome
1 10 male happy 1
2 20 female sad 1
3 30 female happy 0
4 40 male sad 0
5 50 female happy 0
# Dummy-encode the factor columns by hand
> # Start a new data frame, customers.new, from the id and outcome columns
> customers.new <- customers[c('id', 'outcome')]
# One 0/1 indicator column per gender level
> customers.new$gender.male <- as.numeric(customers$gender == 'male')
> customers.new$gender.female <- as.numeric(customers$gender == 'female')
# One 0/1 indicator column per mood level
> customers.new$mood.happy <- as.numeric(customers$mood == 'happy')
> customers.new$mood.sad <- as.numeric(customers$mood == 'sad')
> customers.new
id outcome gender.male gender.female mood.happy mood.sad
1 10 1 1 0 1 0
2 20 1 0 1 0 1
3 30 0 0 1 1 0
4 40 0 1 0 0 1
5 50 0 0 1 1 0
# Attach the caret package
> library(caret)
> # Inspect the structure of customers
> str(customers)
'data.frame': 5 obs. of 4 variables:
$ id : num 10 20 30 40 50
$ gender : Factor w/ 2 levels "female","male": 2 1 1 2 1
$ mood : Factor w/ 2 levels "happy","sad": 1 2 1 2 1
$ outcome: num 1 1 0 0 0
# Build a dummy-variable encoder covering every column of customers
> dmy <- dummyVars(formula = ~ ., data = customers)
# Apply the encoder to the same data and convert the result to a data.frame
> trsf <- data.frame(predict(object = dmy, newdata = customers))
# Inspect the encoded data
> trsf
id gender.female gender.male mood.happy mood.sad outcome
1 10 0 1 1 0 1
2 20 1 0 0 1 1
3 30 1 0 1 0 0
4 40 0 1 0 1 0
5 50 1 0 1 0 0
# Turn outcome into a factor so that it is dummy-encoded as well
> customers[["outcome"]] <- factor(customers[["outcome"]])
# Rebuild the encoder on the updated customers data
> dmy <- dummyVars(formula = ~ ., data = customers)
# Apply the encoder to the same data and convert the result to a data.frame
> trsf <- data.frame(predict(object = dmy, newdata = customers))
# Inspect the encoded data
> trsf
id gender.female gender.male mood.happy mood.sad outcome.0 outcome.1
1 10 0 1 1 0 0 1
2 20 1 0 0 1 0 1
3 30 1 0 1 0 1 0
4 40 0 1 0 1 1 0
5 50 1 0 1 0 1 0
# Encode only the gender column
> dmy.gender <- dummyVars(formula = ~ gender, data = customers)
> trsf.gender <- data.frame(predict(object = dmy.gender, newdata = customers))
> trsf.gender
gender.female gender.male
1 0 1
2 1 0
3 1 0
4 0 1
5 1 0
# Rebuild customers, then encode with levelsOnly and fullRank both set to TRUE.
# stringsAsFactors = TRUE keeps gender and mood as factors on R >= 4.0, where
# data.frame() no longer converts strings to factors by default.
> customers <- data.frame(id = c(10, 20, 30, 40, 50),
+                         gender = c("male", "female", "female", "male", "female"),
+                         mood = c("happy", "sad", "happy", "sad", "happy"),
+                         outcome = c(1, 1, 0, 0, 0),
+                         stringsAsFactors = TRUE)
> dmy <- dummyVars(~ ., data = customers, levelsOnly = TRUE, fullRank = TRUE)
> trsf <- data.frame(predict(dmy, newdata = customers))
# The printed result has one column per factor, named by the level alone
> trsf
id male sad outcome
1 10 1 0 1
2 20 0 1 1
3 30 0 0 0
4 40 1 1 0
5 50 0 0 0