R語言筆記
#設定R軟件當前工作目錄
setwd("E:/R work")
#顯示R軟件當前工作目錄
getwd()
#R語言數據預處理常用包安裝
#plyr,reshape2,lubridate, stringr
install.packages(c("plyr","reshape2","lubridate", "stringr","foreign"))
library(MASS)
library(foreign)
library(stringr)
library(plyr)
library(reshape2)
library(ggplot2)
#####1.R語言數據讀取#####
#R包自帶數據
data(diamonds)
diamonds
#查看前六行數據
head(diamonds)
#查看後六行數據
tail(diamonds)
#R添加包,可以從一些開放源直接下載金融數據,包括雅虎財經、谷歌財經、等
install.packages("quantmod")
library(quantmod) #加載包
#從雅虎財經下載蘋果股票交易數據(從2015年1月1日至今)
getSymbols("AAPL",from="2015-01-01")
#查看數組維數及元素個數
dim(AAPL)
head(AAPL)
tail(AAPL)
#作圖,K線圖
chartSeries(AAPL,theme=chartTheme('black'))
#從oanda獲取外匯數據
install.packages("jsonlite")
library(jsonlite)
getFX("USD/CNY",from="2017-05-01")
head(USDCNY)
tail(USDCNY)
chartSeries(USDCNY,theme = chartTheme('black'))
#read.table函數讀取本地/網絡數據(read.table, read.csv, read.csv2, read.delim, read.delim2, read.fwf)
help("read.table")
##read.table函數
getwd()
#原始數據有列名,第一列爲記錄序號,可以省略參數header(但此時應當爲TRUE)
rt = read.table("houses.data");rt
rt1 = read.table("houses.data",header = TRUE);rt1
#原始數據有列名,無記錄序號列,不可以省略參數header
rt2 = read.table("houses2.data",header = TRUE);rt2
rt2 = read.table("houses2.data");rt2 # 省略參數header(此時爲FALSE),變量名會被認爲是一行數據
#原始數據無列名,無記錄序號列,可以省略參數header(此時爲FALSE)
rt3 = read.table("houses3.data");rt3
rt3 = read.table("houses3.data",
col.names = c("Price","Floor","Area","Rooms","Age","Cent.heat"));rt3
#read.csv函數
dat = read.csv('PM.csv') #編碼錯誤,讀入亂碼,行數也會錯亂
dat1 = read.csv('PM.csv',fileEncoding = "utf-8") #指定正確編碼
#以下操作不讀取表頭,並重新制定列名
colname=c('id','city','index','y','x')
dat2 = read.csv('PM.csv',header=FALSE,col.name=colname,fileEncoding = "utf-8")
#當數據量較大時,全部將數據讀取會比較耗時,這裏可以通過nrows設定
dat3 = read.csv('PM.csv',fileEncoding = "utf-8",nrows=-1) #nrows默認爲-1
dat4 = read.csv('PM.csv',fileEncoding = "utf-8",nrows=5) #nrows設置爲5
#因子轉換
dat5 = read.csv('PM.csv',stringsAsFactors=FALSE,fileEncoding = "utf-8") #讀取爲string格式
str(dat5)
dat6 = read.csv('PM.csv',fileEncoding = "utf-8") #讀取爲factor格式
str(dat6)
#文件編碼
dat7 = read.csv('PM.csv',fileEncoding = "utf-8") #默認編碼不是utf-8,需要設置
dat8 = read.csv('PM-gbk.csv') #這裏默認編碼是gbk,不需要設置
#最後一行沒有回車符會有警告“最後一行不完整”
x=read.table("data1.txt",sep=",");x
person=read.csv("data1.txt", header=FALSE,col.names=c("age","height"))
person
##scan函數讀取結構化數據
#15名學生的體重
w = scan("weight.data");w #默認讀爲數值向量
w = scan("weight.data",what = 0);w
w = scan("weight.data",what = c(""));w #讀爲字符型向量
w = scan("weight.data",what = list(""));w #讀爲list
#例100名學生的身高和體重被存在文件h_w.data中,其中1,3,5,7,9列爲身高,2,4,6,8,10列爲體重,
#試用scan函數讀入,並轉化爲數據框
dat = scan("h_w.data",what = list(height=0,weight=0))
df = as.data.frame(dat)
#scan函數讀入屏幕數據
names = scan(what = "")
zhangsan lisi wangwu maliu
names
##其他格式數據讀入
install.packages("foreign")
library(foreign)
#讀取SPSS文件,不加參數to.data.frame = T返回list
educ = read.spss("educ_scores.sav",to.data.frame = T)
educ = read.xport("educ_scores.xpt") #讀取SAS文件
educ = read.S("educ_scores") #讀取SPLUS文件
educ = read.dta("educ_scores.dta") #讀取stata文件
#讀取excel表格數據
educ = read.delim("EDUC_SCORES.txt") #轉化爲txt文件
educ = read.csv("educ_scores.csv") #轉化爲csv文件
#利用xlsx包中的函數讀取
install.packages("xlsx")
library(xlsx)
#解決無法載入‘rJava’問題方法
install.packages("rJava")
Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre1.8.0_77') #自己的JAVA64路徑
library(rJava)
library(xlsx)
#這裏默認header=T,sheetIndex = 1表示讀取第一個工作簿的數據,或通過指定工作簿名稱來讀取
educ = read.xlsx("educ_scores.xls",sheetIndex = 1)
educ = read.xlsx("educ_scores.xls",sheetName = "educ_scores")
##文本數據讀取
news = readLines('news.txt',encoding = "UTF-8")
news = readLines('news.txt',n=2,encoding = "UTF-8");news
#scan函數讀取爲列表
line = scan('news.txt',what=list(''),encoding = "UTF-8")
line = scan('news.txt',what=list(''),n=1,encoding = "UTF-8");line
#scan函數讀取爲向量
line = scan('news.txt',what=c(''),encoding = "UTF-8")
line = scan('news.txt',what=c(''),n=1,encoding = "UTF-8");line
##結構化數據寫入
write.table(educ,file = "educ_w.txt",append = T)
write.csv(educ,file = "educ_w.csv")
##文本數據寫入
writeLines(line,"news_w.txt")
sink("news_w1.txt")
cat(line)
sink()
y=read.table("http://www.jaredlander.com/data/Tomato%20First.csv",header=TRUE,sep=",")
#使用head(),str(),summary()函數來查看數據集
head(y)
str(y)
summary(y)
getwd()
#查看數據
data = read.table("salary.txt",header = T);data
mode(data)
class(data)
names(data)
colnames(data)
dim(data)
#####2.數據管理與變換######
##數據合併
a=c("Hongkong",1910,75.0,41.8)
data = read.table('salary.txt', header = T,stringsAsFactors = F)
data1=rbind(data,a)
data1[14:16,]
weight=c(150,135,210,140) #數值型向量
height=c(65,61,70,65)
gender=c("F","F","M","F") #字符型向量
stu=data.frame(weight,height,gender)
row.names(stu)=c("Alice","Bob","Cal","David")
stu[,"weight"]
stu["Cal",] #獲取行
stu[1:2,1:2]
stu$weight # ”$”用於取列
stu[["weight"]] #雙括號+名稱
stu[[1]] #雙括號+下標,用於數據框和列表數據的獲取
stu[,1]
#列名一致
index=list("City"=data$City,"Index"=1:15) #建立另一個數據集index
index
data.index=merge(data,index,by="City")
data.index
#列名不一致
index1=list("City1"=data$City,"Index"=1:15)
index1
data.index1=merge(data,index1,by.x = "City",by.y = "City1")
data.index1
index2 = 1:15
data.index2=cbind(data,index2)
##選取數據子集
data[data$Salary>65,]
data[c(2,4),]
#選取價格指數等於65.6的行,注意要用雙等號==
data[data$Price==65.6,]
##數據排序
order.salary=order(data$Salary) #返回的是該變量從小到大(默認)排序後的索引
order.salary
data[order.salary,]
sort.list(data$Salary) #sort.list與order的作用一致
data[sort.list(data$Salary,decreasing = T),]
## 讀取數據
# 在當前目錄下搜索匹配文件名中有“Loan”的貸款申請成功數據
setwd("G:\\數據預處理")
thefilesL = dir(pattern = "^Loan");thefilesL
# 讀入各數據並將其放在同一個列表(list),若不指定參數stringsAsFactors = F,字符型的元數據將自動轉化爲因子型
# lapply函數對列表進行統一操作(R語言基礎PPT54)
# 第一行是描述性數據,需要跳過
LoanList0 = lapply(thefilesL, read.csv, stringsAsFactors = F, skip = 1)
# 按行合併不同的csv文件的貸款申請數據
Loan = do.call(rbind, LoanList0)
# 同樣讀入、合併Reject
thefilesR = dir(pattern = "^Reject")
RejectList0 = lapply(thefilesR, read.csv, stringsAsFactors = F, skip = 1)
Reject = do.call(rbind, RejectList0)
str(Loan)
library(dplyr)
Loan.df = tbl_df(Loan)
Loan.df
dim(Loan.df)
colnames(Loan.df)
## (1)添加新變量列
# 添加一列名爲dti的新變量,它是將變量列Debt.To.Income.Ratio去百分號得到的
#這裏sub函數用來將“%”替換爲“”
Reject.temp = mutate(Reject, dti = as.numeric(sub("%", "", Debt.To.Income.Ratio)))
#等同於下面的操作
Reject.temp1 = Reject
Reject.temp1$dti = as.numeric(sub("%", "", Reject.temp1$Debt.To.Income.Ratio))
## (2)選擇變量列
Reject.s = select(Reject.temp, Amount.Requested, dti, Risk_Score:State)
Reject.s1 = select(Reject.s, -Zip.Code, -Debt.To.Income.Ratio)
## (3)選擇滿足條件的觀測行
MA_Reject = filter(Reject.s1, Risk_Score>500&State == "MA")
## (4)排序
arrange(Reject.s1, State, Risk_Score, dti, Amount.Requested)
## (5)數據分組彙總
summarise(group_by(Loan,grade), #使用分類變量grade分組
ave.amnt = mean(funded_amnt, na.rm = T), #計算均值
sd = sd(funded_amnt, na.rm = T), #計算標準差
n = sum(!is.na(funded_amnt)), #計算各組樣本量(不計缺失值)
se = sd/sqrt(n), #計算均值標準誤
conf_upper = ave.amnt + qt(0.975, n-1)*se, #計算置信上下限(t分佈)
conf_lower = ave.amnt - qt(0.975, n-1)*se)
## 數據變換
# (1)最大值-最小值規範化
library(caret)
# 將Loan數據中的loan_amnt轉化到[0,1]
help(preProcess)
#先指定處理方法
trans = preProcess(select(Loan, loan_amnt), method = c("range"))
trans
#然後使用predict函數完成處理
transformed = predict(trans, select(Loan,loan_amnt))
head(transformed)
range(transformed)
# (2)標準化
trans = preProcess(select(Loan, loan_amnt), method = c("center","scale"))
trans
transformed = predict(trans, select(Loan,loan_amnt))
head(transformed)
mean(transformed[[1]]);var(transformed[[1]])
# (3)十進制正規化
max(abs(Loan$loan_amnt))
# (4)Box-Cox變換
library(e1071)
#計算偏度,發現是右偏
skewness(Loan$annual_inc,na.rm = T)
#選擇Loan數據集中的數值型變量
Loan.num = select(Loan, loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,dti,total_pymnt)
# 對每列數值型變量都計算其偏度係數
apply(Loan.num,2,skewness,na.rm = T)
# 爲了直方圖顯示效果,剔除年收入超過40萬美元的客戶
Loan.anin = Loan$annual_inc[-which(Loan$annual_inc>400000)]
library(caret)
# 使用樣本數據估計λ,估計值爲-0.1,但修正後的λ估計值爲0
BoxCoxTrans(Loan$annual_inc,na.rm = T)
par(mfrow=c(1,2))
hist(Loan.anin,xlab="natural units of annual_inc", main="Histogram: Original Data")
# 估計的λ爲0,使用log變換
hist(log(Loan$annual_inc),
xlab = "log units of annual_inc",
main = "Histogram: after log transformation"
)
#####3.從原始數據到技術正確的數據#####
##一個小案例(deltons)
#step(1): Reading data
txt=readLines("data2.txt") #readLines: when the rows in a data files are not uniformly formatted
txt
#step(2):Selecting lines containing data
I=grepl("^%",txt)
I
dat=txt[!I]
dat
#step(3):Split lines into separate fields
help(strsplit)
(fieldList=strsplit(dat,split=","))
#step(4):Standardize rows
#先定義一個對列表中單個元素處理的
assignFields=function(x) #函數聲明
{
out=character(3)
#匹配list中的字符作爲輸出的第一列
i=grepl("[[:alpha:]]",x)
#print(i)
out[1]=x[i]
#將list中小於1890的作爲出生年份
i=which(as.numeric(x)<1890)
#print(i)
out[2]=ifelse(length(i)>0,x[i],NA) #若長度不大於0,則賦值爲NA
#將list中大於1890的作爲死亡年份
i=which(as.numeric(x)>1890)
#print(i)
out[3]=ifelse(length(i)>0,x[i],NA) #若長度不大於0,則賦值爲NA
return(out)
}
#演示
out=character(3)
out[1]
i=grepl("[[:alpha:]]",fieldList[[1]]);i
out[1] = fieldList[[1]][i];out
i=which(as.numeric(fieldList[[1]])<1890);i
out[2]=ifelse(length(i)>0,fieldList[[1]][i],NA);out
#lapply函數用來處理列表的每一個元素
standardFields=lapply(fieldList,assignFields) #apply a function over a list
standardFields
#step(5): transform a list to data.frame(將list轉化爲data.frame)
M=matrix(unlist(standardFields),nrow=length(standardFields),byrow=TRUE) #copy into a matrix which is then coerced into a data.frame
#unlist() produce a vector which contains all the atomic components which occur in x
colnames(M)=c("name","birth","death")
M
deltons=as.data.frame(M,stringsAsFactors=FALSE) #stringsAsFactors=FALSE 防止R把第一列默認成因子模式factor
deltons
#step(6):Normalize and coerce to correct types(強制轉換類型)
str(deltons)
deltons$birth=as.numeric(deltons$birth)
deltons$death=as.numeric(deltons$death)
deltons
str(deltons)
##分類變量處理
#分類型變量在R中存儲爲factor格式
#(1)改變因子水平排序
f=factor(c("small","large","large","small","medium")); f
levels(f) #默認是字母表順序
#手動輸入改變
f1=factor(f,levels=c("small","medium","large")); f1
#rev函數逆轉原來的排序
f2=factor(f1,levels=rev(levels(f1))); f2
#relevel函數決定因子水平從哪一個開始
f3 = relevel(f2,ref="small"); f3
##根據數值型變量改變因子水平排序,函數:reorder
iss=InsectSprays #R包數據:昆蟲噴霧劑
iss
#未重新排序前畫箱線圖,按照默認順序排序
iss$spray
boxplot(count~spray,data=iss) #箱線圖
#重新排序後箱線圖按照count的均值從小到大排序
iss$spray=reorder(iss$spray,iss$count,FUN=mean)
iss$spray
boxplot(count~spray,data=iss) #箱線圖
relevel(iss$spray,ref="D")
#(2)因子水平重編碼
#Example: we read in a vector where 1 stands for male, 2 stands for female and 0 stands for unknown
gender=c(2,1,1,2,0,1,1)
gender=factor(gender,level=c(1,2),label=c("male","female"))
gender
library(ggplot2)
(pg=PlantGrowth) #ggPlot2數據
pg$group #原來的分類有3類
pg$treatment[pg$group=="ctrl"]="no"
pg$treatment[pg$group=="trt1"]="yes"
pg$treatment[pg$group=="trt2"]="yes"
pg
str(pg)
pg$treatment=factor(pg$treatment)
str(pg)
##字符處理
#(1).string normalization: transform a varity strings to a set of standard strings
#We expect it to be more easily processed later
library(stringr)
str_trim(" Hello world ") #忽略前後空格
str_trim(" Hello world ",side="left") #忽略左邊空格
str_trim("Hello world ",side="right") #忽略右邊空格
str_pad(112,width=10,side="left",pad=0) #把字符串填充爲指定的長度
toupper("Hello world") #小寫字母轉化爲大寫字母(to-upper)
tolower("Hello world") #大寫字母轉化爲小寫字母(to-lower)
#(2)模糊匹配
#模式匹配
gender=c("M","male","Female","fem.");gender
#grepl返回邏輯值,grep返回匹配到的位置索引
grepl("m",gender) #大小寫敏感,返回邏輯值
grep("m",gender) #大小寫敏感,返回數值索引
grepl("m",gender,ignore.case=TRUE) #參數ignore.case=TRUE,忽略大小寫
grepl("m",tolower(gender))
#匹配以m或M開頭的字符串
grepl("^m",gender,ignore.case=TRUE)
#查看“abc“變爲”bac”需要的步數(不能換位,只能替換)
adist("abc","bac")
codes=c("male","female")
disMatrix=adist(gender,codes)
disMatrix
colnames(disMatrix)=codes #for readability
rownames(disMatrix)=gender
disMatrix
i=apply(disMatrix,1,which.min);i #按行輸出變換結果
data.frame(rawtext=gender,coded.gender=codes[i])
#stringdist()在計算字符串距離時比adist()更加方便,它允許字符的替換
install.packages("stringdist")
library(stringdist)
stringdist("abc","bac")
#amath() return an index to the closest match(codes) within a maximum distance
i=amatch(gender,codes,maxDist=4);i
data.frame(rawtext=gender,code=codes[i])
##日期轉化
(current_time=Sys.time())
class(current_time)
as.numeric(current_time)
date1=as.Date(current_time)
date1
as.numeric(date1)
end_time=Sys.time()
end_time-current_time #Running time of some program
install.packages("lubridate")
library(lubridate)
#contain functions facilitating conversion of text to POSIXct date
dates=c("15/02/2013","15022013","01-07-2011","It happened on 15 02 13")
dmy(dates) #dmy轉換爲標準格式
##分組操作
#(1)apply(),lapply(),sapply(),mapply()
(ma=matrix(1:100,nrow=20))
#按行求和,等同於rowSums()
apply(ma,1,sum)
#按列求和,等同於colSums()
apply(ma,2,sum)
#添加缺失值的情況
ma[2,3]=NA
apply(ma,1,sum)
apply(ma,2,sum)
apply(ma,1,sum,na.rm=TRUE)
apply(ma,2,sum,na.rm=TRUE)
Thelist=list(A=matrix(1:9,nrow=3),B=1:5,C=matrix(1:4,nrow=2),D=c(2));Thelist
lapply(Thelist,sum)
sapply(Thelist,sum)
help(apply)
#(2)aggregate()
library(ggplot2)
data(diamonds)
diamonds
head(diamonds)
aggregate(price~cut,diamonds,mean)
aggregate(price~cut+color,diamonds,mean)
aggregate((price+carat)~cut+color,diamonds,mean)
#(3)plyr Package
library(plyr)
xx <- array(1:24, c(3, 4, 2));xx
class(xx)
#matrix
a=matrix(1:21,nrow=3,ncol=7);a
aaply(.data=a,.margins=1,.fun=mean) #計算矩陣a各行均值
aaply(a,1,mean) #計算矩陣a各行均值
aaply(a,2,mean) #計算矩陣a各列均值
#data.frame
names=c("John","Mary","Alice","Peter","Roger","Phyillis")
age=c(13,15,14,13,14,13)
sex=c("Male","Female","Female","Male","Male","Female")
data=data.frame(names,age,sex);data
aver=function(data)c(average.age=mean(data$age))
dlply(data,"sex",aver) #返回列表
ddply(data,"sex",aver) #返回數據框
daply(data,"sex",aver) #返回向量
##baseball簡單案例
#Case study: data(baseball)
#baseball數據集包括了15年及以上美國所有職業選手的擊球記錄
data(baseball)
head(baseball)
baseball[baseball$id=="yosted01",] #輸出id爲“yosted01”的信息
#新增變量: OBP(On-Base Percentage,上壘率)
#OBP=(h+bb+hbp)/(ab+bb+hbp+sf)
baseball$sf[baseball$year<1954] #查看year<1954的sf值
baseball$sf[baseball$year<1954]=0 #將year<1954的sf值賦值爲0
baseball$hbp[is.na(baseball$hbp)]=0 #set missing values to 0
#檢查是否存在缺失值
any(is.na(baseball$sf))
any(is.na(baseball$hbp))
#每年、每位選手的OBP值
#with()函數用來做批處理
baseball$OBP=with(baseball,(h+bb+hbp)/(ab+bb+hbp+sf))
tail(baseball)
#計算選手職業生涯中的OBP值
#OBP=sum(h+bb+hbp)/sum(ab+bb+hbp+sf)
obp=function(data) c(OBP=with(data,sum(h+bb+hbp)/sum(ab+bb+hbp+sf)))
obp(baseball[baseball$id=="aaronha01",])
careerOBP=ddply(baseball,"id",obp)
head(careerOBP)
arrange(careerOBP,OBP) #排序
##整齊數據
#(1)列標題是值而不是變量名
#pew數據是教徒的收入數據,分隔符是"\t"
pew = read.delim(file = "pew.txt",header = TRUE,stringsAsFactors = FALSE,check.names = F)
pew
library(reshape2)
pew_tidy = melt(data = pew,id.vars = "religion",variable.name="income",value.name="frequency")
head(pew_tidy)
#(2)多個變量存儲在一列
tb = read.csv(file = "tb.csv",header = TRUE, stringsAsFactors = FALSE)
head(tb)
names(tb)
tb$new_sp = NULL #clean up column names
names(tb)
names(tb) = gsub("new_sp_", "", names(tb))
# na.rm = TRUE移除缺失值
tb_tidy = melt(data = tb,id = c("iso2", "year"),variable.name = "gender_age",
value.name = "cases",na.rm = TRUE)
#gender_age這一列包含兩個變量:性別和年齡段
head(tb_tidy)
# na.rm = TRUE可以保證按變量排序不受影響
tidy = arrange(tb_tidy, iso2, gender_age, year)
head(tidy)
library(stringr)
#str_sub()用來從一個特徵向量提取子字符串(stringr)包
#str_sub(string=,start=,end=)
str_sub(tidy$gender_age, 1, 1)
str_sub(tidy$gender_age, 2)
ageraw=str_sub(tidy$gender_age, 2)
agemap= c("04" = "0-4", "514" = "5-14",
"014" = "0-14", "1524" = "15-24", "2534" = "25-34",
"3544" = "35-44", "4554" = "45-54", "5564" = "55-64",
"65"= "65+", "u" = NA)
#revalue()函數作用:對於一個因子型或者字符型變量,給定一個映射關係,用新值替換指定值
age=revalue(ageraw,agemap)
tidy$sex = str_sub(tidy$gender_age, 1, 1)
tidy$age = factor(age)
tidy = tidy[c("iso2", "year", "sex", "age", "cases")]
head(tidy)
#(3)行、列中均存在變量
#weather是天氣氣溫的數據
weather = read.delim(file = "weather.txt",stringsAsFactors = FALSE)
head(weather)
raw1=melt(weather,id.vars=c("id","year","month","element"),
na.rm = TRUE, variable.name="day",value.name = "temperature")
head(raw1)
#str_replace()函數將變量“day”中的“d”用“”代替,即去掉
raw1$day = as.integer(str_replace(raw1$day, "d", ""))
#tolower()函數將變量“element”中的值轉化爲小寫
raw1$element = tolower(raw1$element)
names(raw1)
#交換兩變量的順序
raw1 = raw1[c("id", "year", "month", "day","element", "temperature")]
raw1 = arrange(raw1, year, month, day, element)
head(raw1)
dcast(raw1,id+year+month+day~element,value.var="temperature")
#####4.修改數據#####
data = read.table("salary.txt",header = T);data
mode(data)
names(data)
dim(data)
data$Price
attach(data)
Price
Salary
mean(Salary) #求均值
length(Salary) #數據長度(個數)
cumsum(Salary) #累積工資
detach(data)
Salary
#修改數據標籤
names(data)=c("CITY","WORK","PRICE","SALARY")
names(data)
#行列刪除
data2=data[-1,-3]
data2
#判斷缺失數據
attach(data)
is.na(SALARY)
#將data文件中工資指數大於65的值替換爲缺失值
data$SALARY = replace(SALARY,SALARY>65,NA)
is.na(SALARY)
#查看缺失值數量
sum(is.na(SALARY))
#complete.cases()函數
complete.cases(data$SALARY) #數據是否非缺失
sum(!complete.cases(data$SALARY))
#判斷缺失模式
data$PRICE = replace(PRICE,PRICE>80,NA)
install.packages("mice")
library(mice)
md.pattern(data)
install.packages("VIM")
library(VIM)
aggr(data)
##(1)行刪除法
data("airquality")
head(airquality)
tail(airquality)
sum(any(is.na(airquality)))
airquality[complete.cases(airquality),]
##(2)成對刪除法
apply(airquality,2,mean,na.rm=TRUE) #均值
cor(airquality,use="pair") #相關係數矩陣
##(3)用統計量來填補缺失值
mean6 = apply(airquality,2,mean,na.rm = TRUE);mean6
#TRUE/FALSE"+1"是爲了使得值爲TRUE的變爲2,值爲FALSE的變爲1,觀察是否插補標識
airquality$col = c("Mean_imputation","notNA")[complete.cases(airquality[,1:2])+1]
#使用均值插補兩個變量
airquality[is.na(airquality$Ozone),"Ozone"] = mean6["Ozone"]
airquality[is.na(airquality$Solar.R),"Solar.R"] = mean6["Solar.R"]
#檢查插補後是否有缺失值
any(is.na(airquality))
#繪製插補後的Ozone直方圖
library(ggplot2)
ggplot(airquality,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(airquality,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)
#插補後的標準誤
sd(airquality$Ozone)
#插補後Solar.R和Ozone的相關係數
cor(airquality$Ozone,airquality$Solar.R)
#重新加載airquality
data("airquality")
#插補前Ozone的標準誤
sd(airquality$Ozone,na.rm = TRUE)
#插補前Solar.R和Ozone的相關係數
cor(airquality$Ozone,airquality$Solar.R,use = "complete.obs")
##(4)迴歸插補
library(mice)
data("airquality")
airquality$col = c("regression_imputation","notNA")[as.vector(!is.na(airquality["Ozone"]))+1]
fit = lm(Ozone~Solar.R,data = airquality)
#篩選Ozone缺失的行號
a = which(!complete.cases(airquality$Ozone))
#插補
airquality$Ozone[a] = as.vector(predict(fit,newdata = airquality[a,]))
ggplot(airquality,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(airquality,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)
#插補後的標準誤
sd(airquality$Ozone,na.rm=TRUE)
#插補後Solar.R和Ozone的相關係數
cor(airquality$Ozone,airquality$Solar.R,use = "complete.obs")
##(5)隨機迴歸插補
library(mice)
data("airquality")
imp = mice(airquality[,1:2],method = "norm.nob",m=1,maxit = 1,seed = 11)
air = complete(imp)
air$col = c("norm.nob_imputation","notNA")[complete.cases(airquality[,1:2])+1]
ggplot(air,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(air,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)
##(6)多重插補
library(mice)
data("airquality")
imp = mice(airquality,seed = 1,print = FALSE)
#使用with()函數依次對每個完整數據集做迴歸
fit = with(imp,lm(Ozone~Wind+Temp+Solar.R))
pooled = pool(fit)
round(summary(pooled),3)[,c(1:3,5)]
#使用原數據集做迴歸
fit.r = lm(Ozone~Wind+Temp+Solar.R,data=airquality)
round(coef(summary(fit.r)),3)
#觀察實際插補值
imp$imp
#顯示實際插補值的得變量Ozone的值,5列表示5個值
imp$imp$Ozone
#complete()函數可以觀察m個插補數據集中的任何一個
air = complete(imp,action = 1)
air$col = c("multiple_imputation","notNA")[complete.cases(airquality[,1:2])+1]
ggplot(air,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(air,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)
##(7)K近鄰法
install.packages("DMwR")
library(DMwR)
data("airquality")
air = knnImputation(airquality,k=10)
air$col = c("knn_imputation","notNA")[complete.cases(airquality[,1:2])+1]
ggplot(air,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#繪製插補後的Solar.R和Ozone的散點圖
ggplot(air,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)
#####5.異常點的檢測#####
##(1)單變量
set.seed(0402)
x = rnorm(100) #生成100個標準正態分佈的隨機數
boxplot.stats(x)$out #檢測出來的異常點
boxplot(x) #繪製箱線圖
##(2)兩變量
set.seed(3148)
x = rnorm(100)
set.seed(3147)
y = rnorm(100)
df = data.frame(x,y)
attach(df)
#分別找出兩變量異常點的索引
(a = which(x %in% boxplot.stats(x)$out))
(b = which(y %in% boxplot.stats(y)$out))
detach(df)
#交集
(outlier.list1 = intersect(a, b))
plot(df)
points(df[outlier.list1,], col="red", pch="+", cex=2.5)
#並集
(outlier.list2 = union(a, b))
plot(df)
points(df[outlier.list2,], col="blue", pch="+", cex=2.5)
##(3)3個及以上變量
##局部離羣點因子(LOF)
library(DMwR)
iris2 = iris[,1:4] #刪除列變量Species,它是一個分類型變量
outlier.scores = lofactor(iris2, k=5) #選擇k=5作爲近鄰標準,用於計算LOF
dec_out = outlier.scores[order(outlier.scores,decreasing = T)];dec_out
#按LOF降序排列,將前5個點作爲離羣點
outliers = order(outlier.scores,decreasing = T)[1:5]
#輸出異常點編號
print(outliers)
n = nrow(iris2)
labels = 1:n
labels[-outliers] = "."
#結合前兩個主成份的雙標圖呈現異常值
#prcomp()執行了一個主成分分析,並且biplot()使用前兩個主成分畫出了這些數據
biplot(prcomp(iris2), cex=.6, xlabs = labels)
#使用pairsPlot顯示異常值
pch = rep(".", n)
pch[outliers] = "+"
col = rep("black", n)
col[outliers] = "red"
pairs(iris2,col=col,pch=pch)
##K-means算法檢測離羣點
iris2 = iris[,1:4] #刪除列變量Species,它是一個分類型變量
kmeans.result = kmeans(iris2, centers = 3)
#聚類中心
kmeans.result$centers
#類別標籤
kmeans.result$cluster
#分配每行數據的聚類中心
centers = kmeans.result$centers[kmeans.result$cluster,]
centers
#計算各點與聚類中心的距離
distances = sqrt(rowSums((iris2-centers)^2))
#按聚類降序排列,將前5個點作爲離羣點
outliers = order(distances,decreasing = T)[1:5]
#輸出異常點編號
print(outliers)
#以花萼長寬爲座標畫出聚類情況
plot(iris2[,c("Sepal.Length","Sepal.Width")], pch="o",col=kmeans.result$cluster,cex=0.3)
#標記聚類中心
points(kmeans.result$centers[,c("Sepal.Length","Sepal.Width")], pch=8,col=1:3,cex=1.5)
#標記離羣點
points(iris2[outliers,c("Sepal.Length","Sepal.Width")], pch="+",col=4,cex=1.5)
#####6.變量選擇#####
#####過濾法#####
## 低方差變量處理
library(caret)
library(AppliedPredictiveModeling)
data(segmentationOriginal) #加載原始的細胞分割數據集
segData = subset(segmentationOriginal, Case == "Train") #提取其中標識爲“Train”的訓練樣本
dim(segData) #訓練樣本有1009個觀測,119個特徵
#刪除不需要的三列特徵:細胞標識ID(Cell)、是否正確分割(Class)和細胞用於測試集還是訓練集(Case)
segData = segData[,-(1:3)]
#去除對本例無用的二元定性變量,它們的變量名都包含“status”
statusColNum = grep("Status", names(segData))
#刪掉定性變量列,得到本例用的數據
segData = segData[,-statusColNum]
#返回該數據中低方差變量所在的列數
nearZeroVar(segData)
## 刪除強相關變量
correlations = cor(segData)
dim(correlations)
correlations[1:4,1:4] #查看前四個變量間的相關性
library(corrplot)
# 可視化展示相關係數矩陣,展示圖已根據變量聚類後的結果對變量進行重排
corrplot(correlations, order = "hclust")
# 根據以上算法篩選出相關性最強的變量
highCorr = findCorrelation(correlations, cutoff = 0.75)
length(highCorr) # 篩選出的變量個數是32個
highCorr
# 去除強相關變量
filteredSegData = segData[,-highCorr]
## 用變量聚類的方法過濾變量
library(Hmisc)
v = varclus(as.matrix(segData))
print(round(v$sim, 2)) # 顯示變量的相關係數矩陣
plot(v) # 顯示層次樹結構,可以看到很多變量之間有很強的相關性
#將變量聚成30個大類,而後在每個類中挑選一個變量
nvars = 30
# 標記每類的類別編號(1-30)
tree = cutree(v$hclust,nvars)
# 統計每類的數量
tab = table(tree)
# 先建立長度爲30的全0向量,後面用來填充每類中的一個變量
predictors.select = rep(0,30)
for (i in 1:nvars)
{
# 若某類中只有一個變量,則選擇該變量
if (sum(tree == i) == 1)
predictors.select[i] = names(tree[tree == i])
# 若某類變量不止一個,隨機取一個變量
else
predictors.select[i] = names(sample(tree[tree == i], 1))
}
predictors.select # 顯示隨機選擇的30個變量
#####變量重要性排序#####
#####(1)輸入變量和輸出變量都是數值型變量#####
library(AppliedPredictiveModeling)
data(solubility)
## 單變量與因變量的pearson相關係數
cor(solTrainXtrans$NumCarbon, solTrainY)
## 所有數值型變量與因變量的pearson相關係數
# 變量名中包含“FP”的變量是分類變量,將匹配出來並排除掉剩餘的就是數值型變量
fpCols = grepl("FP", names(solTrainXtrans))
numericPreds = names(solTrainXtrans)[!fpCols] #所有的數值型自變量
# 利用apply函數計算所有數值型變量與因變量solTrainY的pearson相關係數
corrValues = apply(solTrainXtrans[, numericPreds],
MARGIN = 2, #1表示按行計算,2表示按列計算
FUN = function(x, y) cor(x, y),
y = solTrainY)
head(corrValues) #查看前六個
## 所有數值型變量與因變量的spearman相關係數
corrValues1 = apply(solTrainXtrans[, numericPreds],
MARGIN = 2,
FUN = function(x, y) cor(x, y,method = "spearman"),
y = solTrainY)
head(corrValues1) #查看前六個
## 局部加權迴歸LOESS的僞R2
smoother = loess(solTrainY ~ solTrainXtrans$NumCarbon)
smoother
#lattice包中的xyplot做LOESS圖
library(lattice)
xyplot(solTrainY ~ solTrainXtrans$NumCarbon,
type = c("p", "smooth"),
xlab = "# Carbons",
ylab = "Solubility")
#caret包中的filterVarImp
install.packages("caret")
library(caret)
loessResults = filterVarImp(x = solTrainXtrans[, numericPreds],
y = solTrainY,
nonpara = TRUE)
head(loessResults)
# 按照變量重要性排序,越重要序號越大
aaa = cbind(loessResults,rank(loessResults$Overall))
## 最大信息係數MIC
install.packages("minerva")
library(minerva)
micValues = mine(solTrainXtrans[, numericPreds], solTrainY)
# 計算出若干統計量,其中包括MIC
names(micValues)
head(micValues$MIC)
bbb = cbind(micValues$MIC, rank(micValues$MIC))
cbind(aaa,bbb)
#####(2)輸入變量是分類變量輸出變量是數值型變量#####
# 查看數據集分類變量的類別數
get_levels = function(x)
{
out = levels(factor(x))
out
}
FP_levels = apply(solTrainXtrans[, fpCols],
MARGIN = 2,
FUN = get_levels)
FP_levels = as.data.frame(t(FP_levels))
#按照FP044分兩類,檢驗因變量均值是否相同
t.test(solTrainY ~ solTrainXtrans$FP044)
levels(factor(solTrainXtrans$FP002))
#分別按照FPxxx分兩類,檢驗因變量均值是否相同,並輸出t值和p值
getTstats = function(x, y)
{
tTest = t.test(y~x)
out = c(tStat = tTest$statistic, p = tTest$p.value)
out
}
tVals = apply(solTrainXtrans[, fpCols],
MARGIN = 2,
FUN = getTstats,
y = solTrainY)
## 轉置以方便查看
tVals1 = as.data.frame(t(tVals))
head(tVals1)
# 篩選不能拒絕原假設的分類變量
uselessFP = tVals1[tVals1$p>0.05,]