R筆記(簡單數據處理)

R語言筆記

#設定R軟件當前工作目錄
setwd("E:/R work")
#顯示R軟件當前工作目錄
getwd()
#R語言數據預處理常用包安裝
#plyr,reshape2,lubridate, stringr
install.packages(c("plyr","reshape2","lubridate", "stringr","foreign"))
library(MASS)
library(foreign)
library(stringr)
library(plyr)
library(reshape2)
library(ggplot2)
#####1.R語言數據讀取#####
#R包自帶數據
data(diamonds)
diamonds
#查看前六行數據
head(diamonds)
#查看後六行數據
tail(diamonds)

#R添加包,可以從一些開放源直接下載金融數據,包括雅虎財經、谷歌財經、等
install.packages("quantmod")
library(quantmod) #加載包

#從雅虎財經下載蘋果股票交易數據(從2015年1月1日至今)
getSymbols("AAPL",from="2015-01-01")  
#查看數組維數及元素個數
dim(AAPL) 
head(AAPL) 
tail(AAPL)  
#作圖,K線圖
chartSeries(AAPL,theme=chartTheme('black'))   

#從oanda獲取外匯數據
install.packages("jsonlite")
library(jsonlite)
getFX("USD/CNY",from="2017-05-01")  
head(USDCNY)
tail(USDCNY)
chartSeries(USDCNY,theme = chartTheme('black'))

#read.table函數讀取本地/網絡數據(read.table, read.csv, read.csv2, read.delim, read.delim2, read.fwf)
help("read.table")
##read.table函數

getwd()
#原始數據有列名,第一列爲記錄序號,可以省略參數header(但此時應當爲TRUE)
rt = read.table("houses.data");rt  
rt1 = read.table("houses.data",header = TRUE);rt1 

#原始數據有列名,無記錄序號列,不可以省略參數header
rt2 = read.table("houses2.data",header = TRUE);rt2  
rt2 = read.table("houses2.data");rt2  # 省略參數header(此時爲FALSE),變量名會被認爲是一行數據

#原始數據無列名,無記錄序號列,可以省略參數header(此時爲FALSE)
rt3 = read.table("houses3.data");rt3  
rt3 = read.table("houses3.data",
                 col.names = c("Price","Floor","Area","Rooms","Age","Cent.heat"));rt3 

#read.csv函數
dat = read.csv('PM.csv')  #編碼錯誤,讀入亂碼,行數也會錯亂
dat1 = read.csv('PM.csv',fileEncoding = "utf-8")  #指定正確編碼

#以下操作不讀取表頭,並重新制定列名
colname=c('id','city','index','y','x')
dat2 = read.csv('PM.csv',header=FALSE,col.name=colname,fileEncoding = "utf-8")

#當數據量較大時,全部將數據讀取會比較耗時,這裏可以通過nrows設定
dat3 = read.csv('PM.csv',fileEncoding = "utf-8",nrows=-1)  #nrows默認爲-1
dat4 = read.csv('PM.csv',fileEncoding = "utf-8",nrows=5)  #nrows設置爲5

#因子轉換
dat5 = read.csv('PM.csv',stringsAsFactors=FALSE,fileEncoding = "utf-8") #讀取爲string格式
str(dat5)
dat6 = read.csv('PM.csv',fileEncoding = "utf-8") #讀取爲factor格式
str(dat6)

#文件編碼
dat7 = read.csv('PM.csv',fileEncoding = "utf-8") #默認編碼不是utf-8,需要設置
dat8 = read.csv('PM-gbk.csv') #這裏默認編碼是gbk,不需要設置

#最後一行沒有回車符會有警告“最後一行不完整”
x=read.table("data1.txt",sep=",");x
person=read.csv("data1.txt", header=FALSE,col.names=c("age","height"))
person
##scan函數讀取結構化數據
#15名學生的體重
w = scan("weight.data");w  #默認讀爲數值向量
w = scan("weight.data",what = 0);w  
w = scan("weight.data",what = c(""));w  #讀爲字符型向量
w = scan("weight.data",what = list(""));w  #讀爲list

#例100名學生的身高和體重被存在文件h_w.data中,其中1,3,5,7,9列爲身高,2,4,6,8,10列爲體重,
#試用scan函數讀入,並轉化爲數據框
dat = scan("h_w.data",what = list(height=0,weight=0))
df = as.data.frame(dat)

#scan函數讀入屏幕數據
names = scan(what = "")
zhangsan lisi wangwu maliu
names

##其他格式數據讀入
install.packages("foreign")
library(foreign)
#讀取SPSS文件,不加參數to.data.frame = T返回list
educ = read.spss("educ_scores.sav",to.data.frame = T)
educ = read.xport("educ_scores.xpt")  #讀取SAS文件
educ = read.S("educ_scores")  #讀取SPLUS文件
educ = read.dta("educ_scores.dta")  #讀取stata文件

#讀取excel表格數據
educ = read.delim("EDUC_SCORES.txt")  #轉化爲txt文件
educ = read.csv("educ_scores.csv")    #轉化爲csv文件
#利用xlsx包中的函數讀取
install.packages("xlsx")
library(xlsx)

#解決無法載入‘rJava’問題方法
install.packages("rJava")
Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre1.8.0_77')  #自己的JAVA64路徑
library(rJava)
library(xlsx)
#這裏默認header=T,sheetIndex = 1表示讀取第一個工作簿的數據,或通過指定工作簿名稱來讀取
educ = read.xlsx("educ_scores.xls",sheetIndex = 1)  
educ = read.xlsx("educ_scores.xls",sheetName = "educ_scores")  

##文本數據讀取
news = readLines('news.txt',encoding = "UTF-8")
news = readLines('news.txt',n=2,encoding = "UTF-8");news

#scan函數讀取爲列表
line = scan('news.txt',what=list(''),encoding = "UTF-8")
line = scan('news.txt',what=list(''),n=1,encoding = "UTF-8");line
#scan函數讀取爲向量
line = scan('news.txt',what=c(''),encoding = "UTF-8")
line = scan('news.txt',what=c(''),n=1,encoding = "UTF-8");line

##結構化數據寫入
write.table(educ,file = "educ_w.txt",append = T)
write.csv(educ,file = "educ_w.csv")

##文本數據寫入
writeLines(line,"news_w.txt")

sink("news_w1.txt")
cat(line)
sink()

y=read.table("http://www.jaredlander.com/data/Tomato%20First.csv",header=TRUE,sep=",")
#使用head(),str(),summary()函數來查看數據集
head(y)
str(y)
summary(y)

getwd()
#查看數據
data = read.table("salary.txt",header = T);data
mode(data)
class(data)
names(data)
colnames(data)
dim(data)

#####2.數據管理與變換######
##數據合併
a=c("Hongkong",1910,75.0,41.8)

data = read.table('salary.txt', header = T,stringsAsFactors = F)
data1=rbind(data,a)
data1[14:16,]

weight=c(150,135,210,140)  #數值型向量
height=c(65,61,70,65)
gender=c("F","F","M","F")  #字符型向量
stu=data.frame(weight,height,gender)
row.names(stu)=c("Alice","Bob","Cal","David")
stu[,"weight"]
stu["Cal",]  #獲取行
stu[1:2,1:2]
stu$weight  # ”$”用於取列
stu[["weight"]]  #雙括號+名稱
stu[[1]]  #雙括號+下標,用於數據框和列表數據的獲取
stu[,1]
#列名一致
index=list("City"=data$City,"Index"=1:15)  #建立另一個數據集index
index
data.index=merge(data,index,by="City")
data.index

#列名不一致
index1=list("City1"=data$City,"Index"=1:15)
index1
data.index1=merge(data,index1,by.x = "City",by.y = "City1")
data.index1
index2 = 1:15
data.index2=cbind(data,index2)


##選取數據子集
data[data$Salary>65,]
data[c(2,4),]
#選取價格指數等於65.6的行,注意要用雙等號==
data[data$Price==65.6,]

##數據排序
order.salary=order(data$Salary)  #返回的是該變量從小到大(默認)排序後的索引
order.salary
data[order.salary,]

sort.list(data$Salary)  #sort.list與order的作用一致
data[sort.list(data$Salary,decreasing = T),]

## 讀取數據
# 在當前目錄下搜索匹配文件名中有“Loan”的貸款申請成功數據
setwd("G:\\數據預處理")
thefilesL = dir(pattern = "^Loan");thefilesL
# 讀入各數據並將其放在同一個列表(list),若不指定參數stringsAsFactors = F,字符型的元數據將自動轉化爲因子型
# lapply函數對列表進行統一操作(R語言基礎PPT54)
# 第一行是描述性數據,需要跳過
LoanList0 = lapply(thefilesL, read.csv, stringsAsFactors = F, skip = 1)
# 按行合併不同的csv文件的貸款申請數據
Loan = do.call(rbind, LoanList0)
# 同樣讀入、合併Reject
thefilesR = dir(pattern = "^Reject")
RejectList0 = lapply(thefilesR, read.csv, stringsAsFactors = F, skip = 1)
Reject = do.call(rbind, RejectList0)
str(Loan)

library(dplyr)
Loan.df = tbl_df(Loan)
Loan.df
dim(Loan.df)
colnames(Loan.df)
## (1)添加新變量列
# 添加一列名爲dti的新變量,它是將變量列Debt.To.Income.Ratio去百分號得到的
#這裏sub函數用來將“%”替換爲“”
Reject.temp = mutate(Reject, dti = as.numeric(sub("%", "", Debt.To.Income.Ratio)))
#等同於下面的操作
Reject.temp1 = Reject
Reject.temp1$dti = as.numeric(sub("%", "", Reject.temp1$Debt.To.Income.Ratio))
## (2)選擇變量列
Reject.s = select(Reject.temp, Amount.Requested, dti, Risk_Score:State)
Reject.s1 = select(Reject.s, -Zip.Code, -Debt.To.Income.Ratio)
## (3)選擇滿足條件的觀測行
MA_Reject = filter(Reject.s1, Risk_Score>500&State == "MA")
## (4)排序
arrange(Reject.s1, State, Risk_Score, dti, Amount.Requested)
## (5)數據分組彙總
summarise(group_by(Loan,grade),                       #使用分類變量grade分組
          ave.amnt = mean(funded_amnt, na.rm = T),    #計算均值
          sd = sd(funded_amnt, na.rm = T),            #計算標準差
          n = sum(!is.na(funded_amnt)),               #計算各組樣本量(不計缺失值)
          se = sd/sqrt(n),                            #計算均值標準誤
          conf_upper = ave.amnt + qt(0.975, n-1)*se,  #計算置信上下限(t分佈)
          conf_lower = ave.amnt - qt(0.975, n-1)*se)
## 數據變換
# (1)最大值-最小值規範化
library(caret)
# 將Loan數據中的loan_amnt轉化到[0,1]
help(preProcess)
#先指定處理方法
trans = preProcess(select(Loan, loan_amnt), method = c("range"))
trans
#然後使用predict函數完成處理
transformed = predict(trans, select(Loan,loan_amnt))
head(transformed)
range(transformed)

# (2)標準化
trans = preProcess(select(Loan, loan_amnt), method = c("center","scale"))
trans
transformed = predict(trans, select(Loan,loan_amnt))
head(transformed)
mean(transformed[[1]]);var(transformed[[1]])
# (3)十進制正規化
max(abs(Loan$loan_amnt))
# (4)Box-Cox變換
library(e1071)
#計算偏度,發現是右偏
skewness(Loan$annual_inc,na.rm = T)
#選擇Loan數據集中的數值型變量
Loan.num = select(Loan, loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,dti,total_pymnt)
# 對每列數值型變量都計算其偏度係數
apply(Loan.num,2,skewness,na.rm = T)
# 爲了直方圖顯示效果,剔除年收入超過40萬美元的客戶
Loan.anin = Loan$annual_inc[-which(Loan$annual_inc>400000)]
library(caret)
# 使用樣本數據估計λ,估計值爲-0.1,但修正後的λ估計值爲0
BoxCoxTrans(Loan$annual_inc,na.rm = T)
par(mfrow=c(1,2))
hist(Loan.anin,xlab="natural units of annual_inc", main="Histogram: Original Data")
# 估計的λ爲0,使用log變換
hist(log(Loan$annual_inc),
     xlab = "log units of annual_inc",
     main = "Histogram: after log transformation"
     )


#####3.從原始數據到技術正確的數據#####
##一個小案例(deltons)
#step(1): Reading data
txt=readLines("data2.txt")  #readLines: when the rows in a data files are not uniformly formatted
txt
#step(2):Selecting lines containing data
I=grepl("^%",txt)
I
dat=txt[!I]
dat
#step(3):Split lines into separate fields
help(strsplit)
(fieldList=strsplit(dat,split=","))
#step(4):Standardize rows
#先定義一個對列表中單個元素處理的
assignFields=function(x)  #函數聲明
{
  out=character(3)
  #匹配list中的字符作爲輸出的第一列
  i=grepl("[[:alpha:]]",x)
  #print(i)
  out[1]=x[i]
  #將list中小於1890的作爲出生年份
  i=which(as.numeric(x)<1890)
  #print(i)
  out[2]=ifelse(length(i)>0,x[i],NA)  #若長度不大於0,則賦值爲NA
  #將list中大於1890的作爲死亡年份
  i=which(as.numeric(x)>1890)
  #print(i)
  out[3]=ifelse(length(i)>0,x[i],NA)  #若長度不大於0,則賦值爲NA
  return(out)
}
#演示
out=character(3)
out[1]
i=grepl("[[:alpha:]]",fieldList[[1]]);i
out[1] = fieldList[[1]][i];out

i=which(as.numeric(fieldList[[1]])<1890);i
out[2]=ifelse(length(i)>0,fieldList[[1]][i],NA);out

#lapply函數用來處理列表的每一個元素
standardFields=lapply(fieldList,assignFields) #apply a function over a list
standardFields
#step(5): transform a list to data.frame(將list轉化爲data.frame)
M=matrix(unlist(standardFields),nrow=length(standardFields),byrow=TRUE)  #copy into a matrix which is then coerced into a data.frame
#unlist() produce a vector which contains all the atomic components which occur in x
colnames(M)=c("name","birth","death")
M
deltons=as.data.frame(M,stringsAsFactors=FALSE)  #stringsAsFactors=FALSE 防止R把第一列默認成因子模式factor
deltons
#step(6):Normalize and coerce to correct types(強制轉換類型)
str(deltons)
deltons$birth=as.numeric(deltons$birth)
deltons$death=as.numeric(deltons$death)
deltons
str(deltons)

##分類變量處理
#分類型變量在R中存儲爲factor格式
#(1)改變因子水平排序
f=factor(c("small","large","large","small","medium")); f
levels(f) #默認是字母表順序
#手動輸入改變
f1=factor(f,levels=c("small","medium","large")); f1
#rev函數逆轉原來的排序
f2=factor(f1,levels=rev(levels(f1))); f2 
#relevel函數決定因子水平從哪一個開始
f3 = relevel(f2,ref="small"); f3

##根據數值型變量改變因子水平排序,函數:reorder 
iss=InsectSprays  #R包數據:昆蟲噴霧劑
iss
#未重新排序前畫箱線圖,按照默認順序排序
iss$spray
boxplot(count~spray,data=iss)   #箱線圖
#重新排序後箱線圖按照count的均值從小到大排序
iss$spray=reorder(iss$spray,iss$count,FUN=mean)
iss$spray
boxplot(count~spray,data=iss)   #箱線圖
relevel(iss$spray,ref="D")

#(2)因子水平重編碼
#Example: we read in a vector where 1 stands for male, 2 stands for female and 0 stands for unknown
gender=c(2,1,1,2,0,1,1)
gender=factor(gender,level=c(1,2),label=c("male","female"))
gender

library(ggplot2)
(pg=PlantGrowth)  #ggPlot2數據
pg$group  #原來的分類有3類
pg$treatment[pg$group=="ctrl"]="no"
pg$treatment[pg$group=="trt1"]="yes"
pg$treatment[pg$group=="trt2"]="yes"
pg
str(pg)
pg$treatment=factor(pg$treatment)
str(pg)

##字符處理
#(1).string normalization: transform a varity strings to a set of standard strings 
#We expect it to be more easily processed later

library(stringr)
str_trim(" Hello world ")  #忽略前後空格
str_trim(" Hello world ",side="left")  #忽略左邊空格
str_trim("Hello world ",side="right")  #忽略右邊空格
str_pad(112,width=10,side="left",pad=0)  #把字符串填充爲指定的長度
toupper("Hello world") #小寫字母轉化爲大寫字母(to-upper)
tolower("Hello world") #大寫字母轉化爲小寫字母(to-lower)
#(2)模糊匹配
#模式匹配
gender=c("M","male","Female","fem.");gender
#grepl返回邏輯值,grep返回匹配到的位置索引
grepl("m",gender)    #大小寫敏感,返回邏輯值
grep("m",gender)   #大小寫敏感,返回數值索引
grepl("m",gender,ignore.case=TRUE) #參數ignore.case=TRUE,忽略大小寫
grepl("m",tolower(gender)) 
#匹配以m或M開頭的字符串
grepl("^m",gender,ignore.case=TRUE)

#查看“abc“變爲”bac”需要的步數(不能換位,只能替換)
adist("abc","bac")  
codes=c("male","female")
disMatrix=adist(gender,codes)
disMatrix
colnames(disMatrix)=codes  #for readability
rownames(disMatrix)=gender
disMatrix
i=apply(disMatrix,1,which.min);i  #按行輸出變換結果
data.frame(rawtext=gender,coded.gender=codes[i]) 
#stringdist()在計算字符串距離時比adist()更加方便,它允許字符的替換
install.packages("stringdist")
library(stringdist)
stringdist("abc","bac")
#amath() return an index to the closest match(codes) within a maximum distance
i=amatch(gender,codes,maxDist=4);i
data.frame(rawtext=gender,code=codes[i])

##日期轉化
(current_time=Sys.time())
class(current_time)
as.numeric(current_time)
date1=as.Date(current_time)
date1
as.numeric(date1)
end_time=Sys.time()
end_time-current_time #Running time of some program
install.packages("lubridate")
library(lubridate)  
#contain functions facilitating conversion of text to POSIXct date
dates=c("15/02/2013","15022013","01-07-2011","It happened on 15 02 13")
dmy(dates) #dmy轉換爲標準格式

##分組操作
#(1)apply(),lapply(),sapply(),mapply()
(ma=matrix(1:100,nrow=20))
#按行求和,等同於rowSums()
apply(ma,1,sum)
#按列求和,等同於colSums()
apply(ma,2,sum)
#添加缺失值的情況
ma[2,3]=NA
apply(ma,1,sum)
apply(ma,2,sum)
apply(ma,1,sum,na.rm=TRUE)
apply(ma,2,sum,na.rm=TRUE)

Thelist=list(A=matrix(1:9,nrow=3),B=1:5,C=matrix(1:4,nrow=2),D=c(2));Thelist
lapply(Thelist,sum)
sapply(Thelist,sum)
help(apply)

#(2)aggregate()
library(ggplot2)
data(diamonds)
diamonds
head(diamonds)
aggregate(price~cut,diamonds,mean)
aggregate(price~cut+color,diamonds,mean)
aggregate((price+carat)~cut+color,diamonds,mean)

#(3)plyr Package
library(plyr)
xx <- array(1:24, c(3, 4, 2));xx
class(xx)

#matrix
a=matrix(1:21,nrow=3,ncol=7);a
aaply(.data=a,.margins=1,.fun=mean)  #計算矩陣a各行均值
aaply(a,1,mean)  #計算矩陣a各行均值
aaply(a,2,mean)  #計算矩陣a各列均值

#data.frame
names=c("John","Mary","Alice","Peter","Roger","Phyillis") 
age=c(13,15,14,13,14,13) 
sex=c("Male","Female","Female","Male","Male","Female") 
data=data.frame(names,age,sex);data

aver=function(data)c(average.age=mean(data$age))
dlply(data,"sex",aver)  #返回列表
ddply(data,"sex",aver)  #返回數據框
daply(data,"sex",aver)  #返回向量

##baseball簡單案例
#Case study:  data(baseball)
#baseball數據集包括了15年及以上美國所有職業選手的擊球記錄
data(baseball)
head(baseball)
baseball[baseball$id=="yosted01",]  #輸出id爲“yosted01”的信息
#新增變量: OBP(On-Base Percentage,上壘率)
#OBP=(h+bb+hbp)/(ab+bb+hbp+sf)
baseball$sf[baseball$year<1954]  #查看year<1954的sf值
baseball$sf[baseball$year<1954]=0  #將year<1954的sf值賦值爲0
baseball$hbp[is.na(baseball$hbp)]=0  #set missing values to 0
#檢查是否存在缺失值
any(is.na(baseball$sf))
any(is.na(baseball$hbp))
#每年、每位選手的OBP值
#with()函數用來做批處理
baseball$OBP=with(baseball,(h+bb+hbp)/(ab+bb+hbp+sf))
tail(baseball)
#計算選手職業生涯中的OBP值
#OBP=sum(h+bb+hbp)/sum(ab+bb+hbp+sf)
obp=function(data) c(OBP=with(data,sum(h+bb+hbp)/sum(ab+bb+hbp+sf)))
obp(baseball[baseball$id=="aaronha01",])
careerOBP=ddply(baseball,"id",obp)
head(careerOBP)
arrange(careerOBP,OBP)  #排序

##整齊數據
#(1)列標題是值而不是變量名
#pew數據是教徒的收入數據,分隔符是"\t"

pew = read.delim(file = "pew.txt",header = TRUE,stringsAsFactors = FALSE,check.names = F)
pew
library(reshape2)
pew_tidy = melt(data = pew,id.vars = "religion",variable.name="income",value.name="frequency")
head(pew_tidy)

#(2)多個變量存儲在一列
tb = read.csv(file = "tb.csv",header = TRUE, stringsAsFactors = FALSE)
head(tb)
names(tb)
tb$new_sp = NULL  #clean up column names
names(tb)
names(tb) = gsub("new_sp_", "", names(tb))

# na.rm = TRUE移除缺失值
tb_tidy = melt(data = tb,id = c("iso2", "year"),variable.name = "gender_age",
               value.name = "cases",na.rm = TRUE)
#gender_age這一列包含兩個變量:性別和年齡段
head(tb_tidy)
# na.rm = TRUE可以保證按變量排序不受影響
tidy = arrange(tb_tidy, iso2, gender_age, year)
head(tidy)
library(stringr)
#str_sub()用來從一個特徵向量提取子字符串(stringr)包
#str_sub(string=,start=,end=)
str_sub(tidy$gender_age, 1, 1) 
str_sub(tidy$gender_age, 2)
ageraw=str_sub(tidy$gender_age, 2)
agemap= c("04" = "0-4", "514" = "5-14",
          "014" = "0-14", "1524" = "15-24", "2534" = "25-34",
          "3544" = "35-44", "4554" = "45-54", "5564" = "55-64",
          "65"= "65+", "u" = NA)
#revalue()函數作用:對於一個因子型或者字符型變量,給定一個映射關係,用新值替換指定值
age=revalue(ageraw,agemap)

tidy$sex = str_sub(tidy$gender_age, 1, 1)
tidy$age = factor(age)
tidy = tidy[c("iso2", "year", "sex", "age", "cases")]
head(tidy)

#(3)行、列中均存在變量
#weather是天氣氣溫的數據
weather = read.delim(file = "weather.txt",stringsAsFactors = FALSE)
head(weather)
raw1=melt(weather,id.vars=c("id","year","month","element"),
          na.rm = TRUE, variable.name="day",value.name = "temperature")
head(raw1)
#str_replace()函數將變量“day”中的“d”用“”代替,即去掉
raw1$day = as.integer(str_replace(raw1$day, "d", ""))
#tolower()函數將變量“element”中的值轉化爲小寫
raw1$element = tolower(raw1$element)
names(raw1)
#交換兩變量的順序
raw1 = raw1[c("id", "year", "month", "day","element", "temperature")]
raw1 = arrange(raw1, year, month, day, element)
head(raw1)
dcast(raw1,id+year+month+day~element,value.var="temperature")

#####4.修改數據#####

data = read.table("salary.txt",header = T);data

mode(data)
names(data)
dim(data)
data$Price
attach(data)
Price
Salary
mean(Salary)  #求均值
length(Salary)  #數據長度(個數)
cumsum(Salary)  #累積工資
detach(data)
Salary

#修改數據標籤
names(data)=c("CITY","WORK","PRICE","SALARY")
names(data)
#行列刪除
data2=data[-1,-3]
data2
#判斷缺失數據
attach(data)
is.na(SALARY)
#將data文件中工資指數大於65的值替換爲缺失值
data$SALARY = replace(SALARY,SALARY>65,NA)
is.na(SALARY)
#查看缺失值數量
sum(is.na(SALARY))
#complete.cases()函數
complete.cases(data$SALARY)  #數據是否非缺失
sum(!complete.cases(data$SALARY))
#判斷缺失模式
data$PRICE = replace(PRICE,PRICE>80,NA)
install.packages("mice")
library(mice)
md.pattern(data)

install.packages("VIM")
library(VIM)
aggr(data)


##(1)行刪除法
data("airquality")
head(airquality)
tail(airquality)
sum(any(is.na(airquality)))
airquality[complete.cases(airquality),]
##(2)成對刪除法
apply(airquality,2,mean,na.rm=TRUE)  #均值
cor(airquality,use="pair")  #相關係數矩陣
##(3)用統計量來填補缺失值
mean6 = apply(airquality,2,mean,na.rm = TRUE);mean6
#TRUE/FALSE"+1"是爲了使得值爲TRUE的變爲2,值爲FALSE的變爲1,觀察是否插補標識
airquality$col = c("Mean_imputation","notNA")[complete.cases(airquality[,1:2])+1]
#使用均值插補兩個變量
airquality[is.na(airquality$Ozone),"Ozone"] = mean6["Ozone"]
airquality[is.na(airquality$Solar.R),"Solar.R"] = mean6["Solar.R"]
#檢查插補後是否有缺失值
any(is.na(airquality))
#繪製插補後的Ozone直方圖
library(ggplot2)
ggplot(airquality,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(airquality,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)
#插補後的標準誤
sd(airquality$Ozone)
#插補後Solar.R和Ozone的相關係數
cor(airquality$Ozone,airquality$Solar.R)
#重新加載airquality
data("airquality")
#插補前Ozone的標準誤
sd(airquality$Ozone,na.rm = TRUE)
#插補前Solar.R和Ozone的相關係數
cor(airquality$Ozone,airquality$Solar.R,use = "complete.obs")

##(4)迴歸插補
library(mice)
data("airquality")
airquality$col = c("regression_imputation","notNA")[as.vector(!is.na(airquality["Ozone"]))+1]
fit = lm(Ozone~Solar.R,data = airquality)
#篩選Ozone缺失的行號
a = which(!complete.cases(airquality$Ozone))  
#插補
airquality$Ozone[a] = as.vector(predict(fit,newdata = airquality[a,]))
ggplot(airquality,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(airquality,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)
#插補後的標準誤
sd(airquality$Ozone,na.rm=TRUE)
#插補後Solar.R和Ozone的相關係數
cor(airquality$Ozone,airquality$Solar.R,use = "complete.obs")

##(5)隨機迴歸插補
library(mice)
data("airquality")
imp = mice(airquality[,1:2],method = "norm.nob",m=1,maxit = 1,seed = 11)
air = complete(imp)
air$col = c("norm.nob_imputation","notNA")[complete.cases(airquality[,1:2])+1]
ggplot(air,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(air,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)

##(6)多重插補
library(mice)
data("airquality")
imp = mice(airquality,seed = 1,print = FALSE)
#使用with()函數依次對每個完整數據集做迴歸
fit = with(imp,lm(Ozone~Wind+Temp+Solar.R))
pooled = pool(fit)
round(summary(pooled),3)[,c(1:3,5)]
#使用原數據集做迴歸
fit.r = lm(Ozone~Wind+Temp+Solar.R,data=airquality)
round(coef(summary(fit.r)),3)

#觀察實際插補值
imp$imp
#顯示實際插補值的得變量Ozone的值,5列表示5個值
imp$imp$Ozone
#complete()函數可以觀察m個插補數據集中的任何一個
air = complete(imp,action = 1)
air$col = c("multiple_imputation","notNA")[complete.cases(airquality[,1:2])+1]
ggplot(air,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(air,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)

##(7)K近鄰法
install.packages("DMwR")
library(DMwR)
data("airquality")
air = knnImputation(airquality,k=10)
air$col = c("knn_imputation","notNA")[complete.cases(airquality[,1:2])+1]
ggplot(air,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(air,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)


#####5.異常點的檢測#####
##(1)單變量
set.seed(0402)
x = rnorm(100) #生成100個標準正態分佈的隨機數
boxplot.stats(x)$out #檢測出來的異常點
boxplot(x) #繪製箱線圖

##(2)兩變量
set.seed(3148)
x = rnorm(100)
set.seed(3147)
y = rnorm(100)
df = data.frame(x,y)
attach(df)
#分別找出兩變量異常點的索引
(a = which(x %in% boxplot.stats(x)$out))
(b = which(y %in% boxplot.stats(y)$out))
detach(df)
#交集
(outlier.list1 = intersect(a, b))
plot(df)
points(df[outlier.list1,], col="red", pch="+", cex=2.5)
#並集
(outlier.list2 = union(a, b))
plot(df)
points(df[outlier.list2,], col="blue", pch="+", cex=2.5)

##(3)3個及以上變量
##局部離羣點因子(LOF)
library(DMwR)
iris2 = iris[,1:4] #刪除列變量Species,它是一個分類型變量
outlier.scores = lofactor(iris2, k=5) #選擇k=5作爲近鄰標準,用於計算LOF
dec_out = outlier.scores[order(outlier.scores,decreasing = T)];dec_out
#按LOF降序排列,將前5個點作爲離羣點
outliers = order(outlier.scores,decreasing = T)[1:5] 
#輸出異常點編號
print(outliers)

n = nrow(iris2)
labels = 1:n
labels[-outliers] = "."
#結合前兩個主成份的雙標圖呈現異常值
#prcomp()執行了一個主成分分析,並且biplot()使用前兩個主成分畫出了這些數據
biplot(prcomp(iris2), cex=.6, xlabs = labels)

#使用pairsPlot顯示異常值
pch = rep(".", n)
pch[outliers] = "+"
col = rep("black", n)
col[outliers] = "red"
pairs(iris2,col=col,pch=pch)

##K-means算法檢測離羣點
iris2 = iris[,1:4] #刪除列變量Species,它是一個分類型變量
kmeans.result = kmeans(iris2, centers = 3)
#聚類中心
kmeans.result$centers
#類別標籤
kmeans.result$cluster
#分配每行數據的聚類中心
centers = kmeans.result$centers[kmeans.result$cluster,]
centers
#計算各點與聚類中心的距離
distances = sqrt(rowSums((iris2-centers)^2))
#按聚類降序排列,將前5個點作爲離羣點
outliers = order(distances,decreasing = T)[1:5] 
#輸出異常點編號
print(outliers)
#以花萼長寬爲座標畫出聚類情況
plot(iris2[,c("Sepal.Length","Sepal.Width")], pch="o",col=kmeans.result$cluster,cex=0.3)
#標記聚類中心
points(kmeans.result$centers[,c("Sepal.Length","Sepal.Width")], pch=8,col=1:3,cex=1.5)
#標記離羣點
points(iris2[outliers,c("Sepal.Length","Sepal.Width")], pch="+",col=4,cex=1.5)

#####6.變量選擇#####
#####過濾法#####
## 低方差變量處理
library(caret)
library(AppliedPredictiveModeling)
data(segmentationOriginal) #加載原始的細胞分割數據集
segData = subset(segmentationOriginal, Case == "Train") #提取其中標識爲“Train”的訓練樣本
dim(segData) #訓練樣本有1009個觀測,119個特徵
#刪除不需要的三列特徵:細胞標識ID(Cell)、是否正確分割(Class)和細胞用於測試集還是訓練集(Case)
segData = segData[,-(1:3)] 
#去除對本例無用的二元定性變量,它們的變量名都包含“status”
statusColNum = grep("Status", names(segData))
#刪掉定性變量列,得到本例用的數據
segData = segData[,-statusColNum]
#返回該數據中低方差變量所在的列數
nearZeroVar(segData)

## 刪除強相關變量
correlations = cor(segData)
dim(correlations)
correlations[1:4,1:4]  #查看前四個變量間的相關性
library(corrplot)
# 可視化展示相關係數矩陣,展示圖已根據變量聚類後的結果對變量進行重排
corrplot(correlations, order = "hclust")
# 根據以上算法篩選出相關性最強的變量
highCorr = findCorrelation(correlations, cutoff = 0.75)
length(highCorr)  # 篩選出的變量個數是32個
highCorr
# 去除強相關變量
filteredSegData = segData[,-highCorr]

## 用變量聚類的方法過濾變量
library(Hmisc)
v = varclus(as.matrix(segData))
print(round(v$sim, 2))  # 顯示變量的相關係數矩陣
plot(v)  # 顯示層次樹結構,可以看到很多變量之間有很強的相關性
#將變量聚成30個大類,而後在每個類中挑選一個變量
nvars = 30  
# 標記每類的類別編號(1-30)
tree = cutree(v$hclust,nvars)
# 統計每類的數量
tab = table(tree)
# 先建立長度爲30的全0向量,後面用來填充每類中的一個變量
predictors.select = rep(0,30)
for (i in 1:nvars)
{
  # 若某類中只有一個變量,則選擇該變量
  if (sum(tree == i) == 1)
    predictors.select[i] = names(tree[tree == i])
  # 若某類變量不止一個,隨機取一個變量
  else
    predictors.select[i] = names(sample(tree[tree == i], 1))
}
predictors.select  # 顯示隨機選擇的30個變量

#####變量重要性排序#####
#####(1)輸入變量和輸出變量都是數值型變量#####
library(AppliedPredictiveModeling)
data(solubility)
## 單變量與因變量的pearson相關係數
cor(solTrainXtrans$NumCarbon, solTrainY)

## 所有數值型變量與因變量的pearson相關係數
# 變量名中包含“FP”的變量是分類變量,將匹配出來並排除掉剩餘的就是數值型變量
fpCols = grepl("FP", names(solTrainXtrans))
numericPreds = names(solTrainXtrans)[!fpCols]   #所有的數值型自變量
# 利用apply函數計算所有數值型變量與因變量solTrainY的pearson相關係數
corrValues = apply(solTrainXtrans[, numericPreds],
                   MARGIN = 2, #1表示按行計算,2表示按列計算
                   FUN = function(x, y) cor(x, y),
                   y = solTrainY)
head(corrValues)  #查看前六個
## 所有數值型變量與因變量的spearman相關係數
corrValues1 = apply(solTrainXtrans[, numericPreds],
                    MARGIN = 2,
                    FUN = function(x, y) cor(x, y,method = "spearman"),
                    y = solTrainY)
head(corrValues1) #查看前六個

## 局部加權迴歸LOESS的僞R2
smoother = loess(solTrainY ~ solTrainXtrans$NumCarbon)
smoother
#lattice包中的xyplot做LOESS圖
library(lattice)
xyplot(solTrainY ~ solTrainXtrans$NumCarbon,
       type = c("p", "smooth"),
       xlab = "# Carbons",
       ylab = "Solubility")
#caret包中的filterVarImp
install.packages("caret")
library(caret)
loessResults = filterVarImp(x = solTrainXtrans[, numericPreds],
                            y = solTrainY,
                            nonpara = TRUE)
head(loessResults)
# 按照變量重要性排序,越重要序號越大
aaa = cbind(loessResults,rank(loessResults$Overall))

## 最大信息係數MIC
install.packages("minerva")
library(minerva)
micValues = mine(solTrainXtrans[, numericPreds], solTrainY)
# 計算出若干統計量,其中包括MIC
names(micValues)
head(micValues$MIC)
bbb = cbind(micValues$MIC, rank(micValues$MIC))
cbind(aaa,bbb)

#####(2)輸入變量是分類變量輸出變量是數值型變量#####
# 查看數據集分類變量的類別數
get_levels = function(x)
{
  out = levels(factor(x))
  out
}
FP_levels = apply(solTrainXtrans[, fpCols],
                  MARGIN = 2,
                  FUN = get_levels)
FP_levels = as.data.frame(t(FP_levels))
#按照FP044分兩類,檢驗因變量均值是否相同
t.test(solTrainY ~ solTrainXtrans$FP044)
levels(factor(solTrainXtrans$FP002))
#分別按照FPxxx分兩類,檢驗因變量均值是否相同,並輸出t值和p值
getTstats = function(x, y)
{
  tTest = t.test(y~x)
  out = c(tStat = tTest$statistic, p = tTest$p.value)
  out
}
tVals = apply(solTrainXtrans[, fpCols],
              MARGIN = 2,
              FUN = getTstats,
              y = solTrainY)
## 轉置以方便查看
tVals1 = as.data.frame(t(tVals))

head(tVals1)
# 篩選不能拒絕原假設的分類變量
uselessFP = tVals1[tVals1$p>0.05,]
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章