配對樣本檢驗及繪圖

1. 下載GEO數據

# =======================================================
# Set the working directory and load the packages
# =======================================================

# Install the Bioconductor packages first if necessary:
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("limma")

# NOTE(review): machine-specific path. Every file read or written below is
# resolved relative to it, so adjust it before running on another machine.
setwd("D:\\SCIwork\\F41\\train")

# The former `rm(list = ls())` between the library() calls was removed: it
# wiped the caller's workspace, and every object used below is created by
# this script before it is read.
library(GEOquery)
library(dplyr)
library(tidyr)
library(Biobase)
library(limma)


# =======================================================
# Download the GEO series and its platform annotation
# =======================================================

# Sets the VROOM_CONNECTION_SIZE environment variable to 1310720.
# NOTE(review): presumably needed so the long lines of the series matrix
# can be parsed -- confirm against the GEOquery/vroom versions in use.
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 10)
gsename <- "GSE70768"

# Download the series (the *_series_matrix.txt.gz file) and the platform
# design information (a SOFT file) into the current directory.
gse <- getGEO(gsename, destdir = ".")
gpl <- getGEO("GPL10558", destdir = ".")

# Re-read both objects from the local files; these are the objects the rest
# of the script works with.
gse <- getGEO(filename = paste0(gsename, "_series_matrix.txt.gz"))
gpl <- getGEO(filename = "GPL10558.soft")

# Keep only the probe ID -> gene symbol mapping from the platform table.
gpl <- Table(gpl)
colnames(gpl)
gpl <- gpl %>%
  dplyr::select(ID, Symbol)
write.csv(gpl, "GPL.csv", row.names = FALSE)

# Mapping between the row names (probe IDs) of gse and gene names.
genename <- read.csv("GPL.csv")

2. 註釋及得到基因表達矩陣


# Build the expression matrix. Its row names are probe IDs and have to be
# converted to gene symbols.
exprSet <- as.data.frame(exprs(gse))
exprSet$ID <- rownames(exprSet)

# Attach the gene symbol to every probe, then drop the probe ID.
express <- merge(x = genename, y = exprSet, by = "ID")
express$ID <- NULL
express[1:5, 1:5]

# Discard probes that have no gene symbol. This replaces the former
# positional `exprSet[-1, ]`, which assumed the blank-symbol group always
# sorts first and silently removed a real gene whenever it did not.
symbol <- as.character(express$Symbol)
has_symbol <- !is.na(symbol) & nzchar(trimws(symbol))
express <- express[has_symbol, , drop = FALSE]

# Replace missing expression values with 0. Only the sample columns are
# touched, so the Symbol column can no longer be turned into a bogus "0"
# gene.
sample_cols <- setdiff(names(express), "Symbol")
express[sample_cols] <- lapply(express[sample_cols], function(values) {
  values[is.na(values)] <- 0
  values
})

# Collapse probes that map to the same gene by taking the per-sample median.
exprSet <- aggregate(x = express[sample_cols],
                     by = list(ID = express$Symbol),
                     FUN = median)
exprSet <- as.data.frame(exprSet)
exprSet[1:5, 1:5]
write.csv(exprSet, file="exprSet.csv")

##########################################################################################
## Extract the expression values of the gene of interest
###########################################################################################

colnames(exprSet)
exprSet[1:5, 1:5]

# NOTE(review): a second `exprSet <- exprSet[-1, ]` used to sit here. The
# first row had already been removed right after the aggregation, so the
# repeat discarded one more gene; it has been dropped. Confirm no second
# placeholder row exists in the data.
gene1 <- c("ERBB2")
exprSet1 <- exprSet[exprSet$ID %in% gene1, ]

# Turn the gene row into a per-sample table: one row per sample, one column
# per gene, plus the sample accession taken from the row names.
rownames(exprSet1) <- exprSet1$ID
exprSet1$ID <- NULL
exprSet1 <- as.data.frame(t(exprSet1))
exprSet1$sample <- rownames(exprSet1)
> head(exprSet1)
              ERBB2     sample
GSM1817707 6.374360 GSM1817707
GSM1817708 6.434586 GSM1817708
GSM1817709 6.304334 GSM1817709
GSM1817710 6.286188 GSM1817710
GSM1817711 6.554135 GSM1817711
GSM1817712 6.451002 GSM1817712

3. 提取配對樣本數據


# Sample annotation of the series.
pd <- pData(gse)

dt <- subset(pd, select = c("geo_accession", "characteristics_ch1", "title"))

# The identifier used to pair samples is the last nine characters of the
# sample title (e.g. "TB08.0341"). Vectorised, so it also works on a table
# with zero rows, unlike the former `for (i in 1:dim(dt)[1])` loop.
title_chr <- as.character(dt$title)
title_len <- nchar(title_chr)
dt$num <- substr(title_chr, start = title_len - 8, stop = title_len)

# NOTE(review): drops the first 13 samples by position -- presumably
# samples that are not part of the paired cohort; confirm against the
# series annotation.
dt <- dt[-(1:13), ]

# Keep only identifiers that occur exactly twice.
df <- as.data.frame(table(dt$num))
df <- subset(df, df$Freq == 2)
dt <- dt[dt$num %in% df$Var1, ]

dt$title <- NULL
names(dt)[names(dt) == "geo_accession"] <- "sample"
names(dt)[names(dt) == "characteristics_ch1"] <- "group"

table(dt$group)

# Every value other than 'sample type: Tumour' is labelled "Normal".
dt$group <- ifelse(dt$group == 'sample type: Tumour', 'Tumor', 'Normal')
table(dt$group)

# Add the expression values by sample accession.
dt <- merge(dt, exprSet1, by = "sample")
> head(dt)
      sample group       num    ERBB2
1 GSM1817723 Tumor TB08.0341 6.572179
2 GSM1817724 Tumor TB08.0489 6.194203
3 GSM1817729 Tumor TB08.0598 6.188180
4 GSM1817730 Tumor TB08.0618 6.300513
5 GSM1817731 Tumor TB08.0667 6.279266
6 GSM1817737 Tumor TB08.0872 6.435920

5. 計算配對樣本t檢驗及Wilcoxon符號秩檢驗的P值

library(PairedData)

# Paired tests compare element i of one vector with element i of the other,
# so both groups must be in the same patient order. `dt` is ordered by
# sample accession after the merge, which does not guarantee that; sort by
# the pairing identifier `num` and verify the alignment before testing.
dt_sorted <- dt[order(dt$num), ]
normal_rows <- dt_sorted[dt_sorted$group == "Normal", ]
tumor_rows <- dt_sorted[dt_sorted$group == "Tumor", ]
stopifnot(
  nrow(normal_rows) == nrow(tumor_rows),
  anyDuplicated(normal_rows$num) == 0,
  identical(as.character(normal_rows$num), as.character(tumor_rows$num))
)

dt_N <- normal_rows$ERBB2
dt_T <- tumor_rows$ERBB2

pd <- paired(dt_N, dt_T)

# Per-patient difference (Tumor - Normal), computed on the aligned vectors.
d <- dt_T - dt_N
# Shapiro-Wilk test: are the paired differences normally distributed?
# NOTE(review): the original author noted p-value = 0.11 here; that figure
# was obtained before the pairs were explicitly aligned and may differ now.
shapiro.test(d)

# Paired-sample t test
res <- t.test(dt_N, dt_T, paired = TRUE)
# Show the result
res

# Paired-sample Wilcoxon test
res <- wilcox.test(dt_N, dt_T, paired = TRUE)
res

6. 繪圖



# Group means, printed for reference next to the plot.
mean(dt_N)
# median(dt_N)
mean(dt_T)
# median(dt_T)

library(dplyr)
library(ggplot2)
library(ggpubr)
theme_set(theme_pubclean())

# Box plot per group with the paired samples joined by a dotted line.
# Colour is mapped once at the plot level instead of being repeated in
# several layers.
plot <- ggplot(data = dt, aes(x = group, y = ERBB2, colour = group)) +
  # NOTE(review): fatten = NULL appears intended to hide the median bar so
  # that the mean bar below is the only centre marker -- confirm with the
  # ggplot2 version in use.
  geom_boxplot(fatten = NULL) +
  scale_color_manual(values = c("#137F5F", "#ED553B")) +
  # Draw the group mean as a zero-height error bar. after_stat(y) replaces
  # the deprecated `..y..` notation.
  stat_summary(fun = mean, geom = "errorbar",
               aes(ymax = after_stat(y), ymin = after_stat(y)),
               width = 0.75, size = 1, linetype = "solid") +
  geom_point(size = 1, alpha = 0.5) +
  # One line per pairing identifier, connecting its two samples.
  geom_line(aes(group = num), colour = "gray50", linetype = "11") +
  theme_classic()

print(plot)

# Save the same figure as a 4 x 4 inch PDF.
pdf(file = "pair.pdf", height = 4, width = 4)
print(plot)
dev.off()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章