文章目錄
分析流程概述
參考文獻:hppRNA—a Snakemake-based handy parameter-free pipeline for RNA-Seq analysis of numerous samples
六種主要流程示意圖:
下載測試數據
wget ftp://ftp.ccb.jhu.edu/pub/RNAseq_protocol/chrX_data.tar.gz
如果文章中的數據不是fastq格式,而是給GSExxxx,這是需要下載SRA數據
需要安裝sratoolkit,SRR_Acc_List.txt文件儲存SRR號碼
cat SRR_Acc_List.txt | while read id; do (prefetch ${id} &);done
# 批量轉換sra到fq格式
ls /public/project/RNA/airway/sra/* | while read id; do ( nohup fastq-dump --gzip --split-3 -O ./ ${id} & ); done
下載的數據:
[sunchengquan 15:45:09 /data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples]
$ ll
總用量 1.8G
-rwxr-xr-x 1 sunchengquan sunchengquan 88M 1月 15 2016 ERR188044_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 88M 1月 15 2016 ERR188044_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 84M 1月 15 2016 ERR188104_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 85M 1月 15 2016 ERR188104_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 108M 1月 15 2016 ERR188234_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 109M 1月 15 2016 ERR188234_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 59M 1月 15 2016 ERR188245_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 60M 1月 15 2016 ERR188245_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 65M 1月 15 2016 ERR188257_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 66M 1月 15 2016 ERR188257_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 39M 1月 15 2016 ERR188273_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 39M 1月 15 2016 ERR188273_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 88M 1月 15 2016 ERR188337_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 88M 1月 15 2016 ERR188337_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 63M 1月 15 2016 ERR188383_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 63M 1月 15 2016 ERR188383_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 86M 1月 15 2016 ERR188401_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 87M 1月 15 2016 ERR188401_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 56M 1月 15 2016 ERR188428_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 56M 1月 15 2016 ERR188428_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 70M 1月 15 2016 ERR188454_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 70M 1月 15 2016 ERR188454_chrX_2.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 75M 1月 15 2016 ERR204916_chrX_1.fastq.gz
-rwxr-xr-x 1 sunchengquan sunchengquan 75M 1月 15 2016 ERR204916_chrX_2.fastq.gz
數據質量控制
reads質量評估軟件:fastqc生成質控報告,multiqc將各個樣本的質控報告整合爲一個。
reads質量控制軟件:prinseq,cutadapt,trimmomatic,trim_galore
#!/usr/bin/env bash
set -e
settings(){
samples=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples
if test -w $samples;then
mkdir -p {$samples/qc,$samples/cleandata/qc}
else
echo "沒有寫入權限"
fi
}
thread(){
tmp_fifofile="/tmp/$$.fifo" #腳本運行的當前進程ID號作爲文件名
mkfifo "$tmp_fifofile"
exec 6<>"$tmp_fifofile" #將fd6指向fifo類型
rm $tmp_fifofile
thread_num=$1 # 此處定義線程數
for((i=0;i<$thread_num;i++));do
echo
done >&6 # 事實上就是在fd6中放置了$thread個回車符
$2 6 $3
exec 6>&- # 關閉df6
}
qc(){
source activate RNA
printf "[%s %s %s %s %s %s]::數據質量評估\n" $(echo `date`)
start=$(date +%s.%N)
list=$(find $2 -name *q\.gz)
file_num=`ls -l $2/qc|wc -l`
if [ $file_num -lt 2 ];then
for i in $list;do
read -u$1
{
name=`awk -v each=$i 'BEGIN{split(each,arr,"/");l=length(arr);print arr[l]}' `
fastqc $i -o $2/qc &>> $2/qc/qc.log
printf "[%s %s %s %s %s %s]::%s質量評估完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
multiqc -d $2/qc -o $2/qc
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::數據質量評估耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
trim_qc(){
printf "[%s %s %s %s %s %s]::數據質量控制\n" $(echo `date`)
source activate RNA
start=$(date +%s.%N)
dir=$samples/cleandata
find $samples -name *1?f*q?gz|sort >$dir/1
find $samples -name *2?f*q?gz|sort >$dir/2
paste -d ":" $dir/1 $dir/2 > $dir/config && rm $dir/1 $dir/2
file_num=`ls -l $dir|wc -l`
if [ $file_num -lt 3 ];then
for id in `cat $dir/config`;do
read -u$1
fq1=$(echo $id|cut -d":" -f1)
fq2=$(echo $id |cut -d":" -f2)
base_name=$(basename $fq1)
name=`awk -v each=$base_name 'BEGIN{split(each,arr,"_");print arr[1]}' `
{
trim_galore -q 25 --phred33 --length 25 --stringency 3 --paired -o $dir $fq1 $fq2 &> $dir/trim.log
printf "[%s %s %s %s %s %s]::%s質量控制完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::數據質量控制耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
settings
thread 6 qc $samples
thread 3 trim_qc
thread 6 qc $samples/cleandata
Tophat –> Cufflink –> Cuffdiff
流程圖:
手動安裝相關軟件
我們已經使用bioconda安裝相關的軟件,現在手動安裝一下,本流程所需要的軟件
下載並安裝比對軟件bowtie2
cd ~/local/app
curl -OL http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.4/bowtie2-2.2.4-linux-x86_64.zip
unzip bowtie2-2.2.4-linux-x86_64.zip
把比對軟件以及相關程序鏈接到bin文件夾
ln -s ~/local/app/bowtie2-2.2.4/bowtie2 ~/bin/
ln -s ~/local/app/bowtie2-2.2.4/bowtie2-align* ~/bin/
ln -s ~/local/app/bowtie2-2.2.4/bowtie2-build ~/bin/
安裝tophat2
cd ~/local/app/
curl -OL http://ccb.jhu.edu/software/tophat/downloads/tophat-2.1.1.Linux_x86_64.tar.gz
tar zxvf tophat-2.0.13.Linux_x86_64.tar.gz
cd ~/bin/
vi tophat
#!/usr/bin/env bash
python2 ~/local/app/tophat-2.1.1.Linux_x86_64/tophat $@
chmod 755 tophat
保存退出
#註釋文件
cd /data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genes
curl -L ftp://ftp.ensembl.org/pub/release-77/gtf/homo_sapiens/Homo_sapiens.GRCh38.77.gtf.gz > hg38.gtf.gz
gunzip *.gz
cat hg38.gtf | awk ' $1 =="X" { print $0 }' > chr_X.gtf
安裝cufflinks
cd ~/local/app
curl -OL http://cole-trapnell-lab.github.io/cufflinks/assets/downloads/cufflinks-2.1.1.Linux_x86_64.tar.gz
tar zxvf cufflinks-2.1.1.Linux_x86_64.tar.gz
ln -fs ~/local/app/cufflinks-2.1.1.Linux_x86_64/cufflinks ~/bin
ln -fs ~/local/app/cufflinks-2.1.1.Linux_x86_64/cuffdiff ~/bin
ln -fs ~/local/app/cufflinks-2.1.1.Linux_x86_64/gtf_to_sam ~/bin
ln -fs ~/local/app/cufflinks-2.1.1.Linux_x86_64/cuffcompare ~/bin
cd ~/bin
vi cuffmerge
#!/usr/bin/env bash
python2 ~/local/app/cufflinks-2.1.1.Linux_x86_64/cuffmerge $@
chmod 755 cuffmerge
流程代碼
#!/usr/bin/env bash
set -ue
settings(){
samples=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples
index=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/index
output=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/tophat+cuff
if test -w $(dirname $output) && test -w $(dirname index);then
mkdir -p {$index/bowtie,$output/1_tophat,$output/2_cufflinks,$output/3_cuffdiff}
fi
cuffdiff=$output/3_cuffdiff
indexes=$index/bowtie/chrX
genome=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/chrX.fa
gene=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genes/chrX.gtf
}
thread(){
tmp_fifofile="/tmp/$$.fifo"
mkfifo "$tmp_fifofile"
exec 6<>"$tmp_fifofile"
rm $tmp_fifofile
thread_num=$1
for((i=0;i<$thread_num;i++));do
echo
done >&6
$2 6
exec 6>&-
}
index(){
printf "[%s %s %s %s %s %s]::建立索引bowtie2-build\n" $(echo `date`)
start=$(date +%s.%N)
file_num=`ls -l $index/bowtie|wc -l`
source activate RNA
base_name=$(basename $genome)
name=`awk -v each=$base_name 'BEGIN{split(each,arr,".");print arr[1]}' `
if [ $file_num -lt 2 ];then
bowtie2-build -f $genome $index/bowtie/$name &> $index/bowtie/index.log
ln -s $genome $index/bowtie/$basename
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::建立索引耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
mapping(){
printf "[%s %s %s %s %s %s]::與參考基因組比對tophat\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/1_tophat
find $samples/cleandata -name *1?f*q.gz|sort > $dir/1
find $samples/cleandata -name *2?f*q.gz|sort > $dir/2
paste -d ":" $dir/1 $dir/2 > $dir/config && rm $dir/1 $dir/2
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $dir/config);do
fq1=$(echo $id|cut -d":" -f1)
fq2=$(echo $id |cut -d":" -f2)
name=`awk -v each=$(basename $fq1) 'BEGIN{split(each,arr,"_");print arr[1]}' `
read -u$1
{
tophat -p 8 -G $gene -o $dir/$name $indexes $fq1 $fq2 &>> $dir/mapping.log
printf "[%s %s %s %s %s %s]::%s比對完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::比對耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
assemble(){
printf "[%s %s %s %s %s %s]::轉錄本組裝和定量cufflinks\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_cufflinks
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $output/1_tophat/config);do
fq1=$(echo $id|cut -d":" -f1)
name=`awk -v each=$(basename $fq1) 'BEGIN{split(each,arr,"_");print arr[1]}' `
read -u$1
{
cufflinks -p 8 -g $gene -o $dir/$name $output/1_tophat/$name/accepted_hits.bam &> $dir/$name.log
printf "[%s %s %s %s %s %s]::%s轉錄本組裝完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::轉錄本組裝耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
merge(){
printf "[%s %s %s %s %s %s]::轉錄本合併cuffmerge\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_cufflinks
find $dir -name *transcripts?gtf|sort > $dir/assemblies.txt
source activate RNA
if [ ! -d $dir/merged_asm ];then
cuffmerge -p 8 -o $dir/merged_asm -g $gene -s $genome $dir/assemblies.txt &> $dir/cuffmerge.log
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::轉錄本合併耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
diff(){
printf "[%s %s %s %s %s %s]::差異分析cuffdiff\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/3_cuffdiff
S1=$output/1_tophat/ERR188245/accepted_hits.bam;S2=$output/1_tophat/ERR188428/accepted_hits.bam;S3=$output/1_tophat/ERR188337/accepted_hits.bam
S4=$output/1_tophat/ERR204916/accepted_hits.bam;S5=$output/1_tophat/ERR188234/accepted_hits.bam;S6=$output/1_tophat/ERR188273/accepted_hits.bam
S7=$output/1_tophat/ERR188401/accepted_hits.bam;S8=$output/1_tophat/ERR188257/accepted_hits.bam;S9=$output/1_tophat/ERR188383/accepted_hits.bam
S10=$output/1_tophat/ERR188454/accepted_hits.bam;S11=$output/1_tophat/ERR188104/accepted_hits.bam;S12=$output/1_tophat/ERR188044/accepted_hits.bam
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
cuffdiff -p 8 -b $genome -o $dir -L Female,Male -u $output/2_cufflinks/merged_asm/merged.gtf $S1,$S2,$S3,$S4,$S5,$S6 $S7,$S8,$S9,$S10,$S11,$S12 &> $dir/cuffdiff.log
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::差異分析cuffdiff耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
expression_matrix(){
dir=$output/3_cuffdiff
expr=$dir/gene_exp.diff
##篩選出下調的基因(log2_fold_change < -2 & pvalue < 0.001)
awk '{if(($10<-2)&&($11<0.001))print $3"\t"$8"\t"$9"\t"$10}' $dir/gene_exp.diff | grep -v 'inf' > $dir/down.txt
## 篩選出上調的基因(log2_fold_change > 2 & pvalue < 0.001
awk '{if(($10>2)&&($11<0.001))print $3"\t"$8"\t"$9"\t"$10}' $dir/gene_exp.diff | grep -v 'inf' > $dir/up.txt
}
settings
index
thread 4 mapping
thread 4 assemble
merge
diff
HISAT2 ->StringTie -> Ballgown
參考文獻:Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown
流程示意圖
流程代碼
#!/usr/bin/env bash
set -ue
settings(){
samples=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples
index=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/index
output=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/hisat+stringtie
if test -w $(dirname $output) && test -w $(dirname index);then
mkdir -p {$index/hisat2,$output/1_hisat,$output/2_stringtie,$output/3_ballgown,$output/4_DE}
fi
hisat=$output/1_hisat
stringtie=$output/2_stringtie
ballgown=$output/3_ballgown
genome=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/chrX.fa
gene=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genes/chrX.gtf
}
thread(){
tmp_fifofile="/tmp/$$.fifo"
mkfifo "$tmp_fifofile"
exec 6<>"$tmp_fifofile"
rm $tmp_fifofile
thread_num=$1
for((i=0;i<$thread_num;i++));do
echo
done >&6
$2 6
exec 6>&-
}
index(){
printf "[%s %s %s %s %s %s]::建立索引hisat2-build\n" $(echo `date`)
start=$(date +%s.%N)
file_num=`ls -l $index/hisat2|wc -l`
source activate RNA
base_name=$(basename $genome)
name=`awk -v each=$base_name 'BEGIN{split(each,arr,".");print arr[1]}' `
if [ $file_num -lt 2 ];then
hisat2_extract_exons.py $gene >$index/hisat2/$name'.exon'
hisat2_extract_splice_sites.py $gene >$index/hisat2/$name'.ss'
hisat2-build --ss $index/hisat2/$name'.ss' --exon $index/hisat2/$name'.exon' $genome $index/hisat2/$name 1>$index/hisat2/index.log 2>&1
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::建立索引耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
mapping(){
printf "[%s %s %s %s %s %s]::與參考基因組比對hisat2\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/1_hisat
find $samples/cleandata -name *1?f*q.gz|sort > $dir/1
find $samples/cleandata -name *2?f*q.gz|sort > $dir/2
paste -d ":" $dir/1 $dir/2 > $dir/config && rm $dir/1 $dir/2
file_num=`ls -l $dir|wc -l`
index_prefix=`awk -v each=$(basename $genome) 'BEGIN{split(each,arr,".");print arr[1]}' `
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $dir/config);do
fq1=$(echo $id|cut -d":" -f1)
fq2=$(echo $id |cut -d":" -f2)
name=`awk -v each=$(basename $fq1) 'BEGIN{split(each,arr,"_");print arr[1]}' `
read -u$1
{
hisat2 -p 8 --dta -x $index/hisat2/${index_prefix} -1 $fq1 -2 $fq2 -S $dir/${name}.sam &> $dir/${name}.hisat.log
samtools sort -@ 8 -o $dir/${name}.bam $dir/${name}.sam &> /dev/null && rm $dir/${name}.sam
printf "[%s %s %s %s %s %s]::%s比對完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::比對耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
assemble(){
printf "[%s %s %s %s %s %s]::轉錄本組裝stringtie\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_stringtie
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
for id in $output/1_hisat/*.bam;do
base_name=$(basename $id)
i=${base_name%.bam*}
read -u$1
{
stringtie -p 8 -G $gene -o $dir/${i}.gtf -l $i $output/1_hisat/${i}.bam &>> $dir/assemble.log
printf "[%s %s %s %s %s %s]::%s轉錄本組裝完成\n" $(echo `date`) $i
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::轉錄本組裝耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
merge(){
printf "[%s %s %s %s %s %s]::轉錄本合併stringtie --merge\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_stringtie
find $dir -name *?gtf|grep -v '.*merge.gtf'|sort > $dir/mergelist.txt
source activate RNA
if [ ! -f $dir/stringtie_merge.gtf ];then
stringtie --merge -p 8 -G $gene -o $dir/stringtie_merge.gtf $dir/mergelist.txt
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::轉錄本合併耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
count(){
printf "[%s %s %s %s %s %s]::轉錄本(基因)的定量:stringtie\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/3_ballgown
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $output/2_stringtie/mergelist.txt);do
base_name=$(basename $id)
name=${base_name%.gtf}
read -u$1
{
stringtie -B -p 8 -G $output/2_stringtie/stringtie_merge.gtf -o $dir/$name/$base_name $output/1_hisat/${name}.bam &>> $dir/count.log
printf "[%s %s %s %s %s %s]::%s轉錄本定量完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::轉錄本合併耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
settings
index
thread 4 mapping
thread 6 assemble
merge
thread 6 count
差異表達分析ballgown
設置工作目錄
就設置爲當前目錄下,新建一個DE目錄,根據需求調整
!pwd
!mkdir DE
/root/python_code_scq/python_bio_file
mkdir: 無法創建目錄"DE": 文件已存在
setwd("/root/python_code_scq/python_bio_file/DE")
suppressMessages(library(ballgown))
suppressMessages(library(genefilter))
suppressMessages(library(dplyr))
讀取樣本的表型數據
pheno_data=read.csv("/root/python_code_scq/python_bio_file/geuvadis_phenodata.csv")
pheno_data
ids | sex | population |
---|---|---|
ERR188044 | male | YRI |
ERR188104 | male | YRI |
ERR188234 | female | YRI |
ERR188245 | female | GBR |
ERR188257 | male | GBR |
ERR188273 | female | YRI |
ERR188337 | female | GBR |
ERR188383 | male | GBR |
ERR188401 | male | GBR |
ERR188428 | female | GBR |
ERR188454 | male | YRI |
ERR204916 | female | YRI |
讀取定量分析的表達數據
bg_chrX = ballgown(dataDir = "../3_ballgown",samplePattern = "ERR", pData = pheno_data )
Sat Dec 29 10:56:05 2018
Sat Dec 29 10:56:05 2018: Reading linking tables
Sat Dec 29 10:56:06 2018: Reading intron data files
Sat Dec 29 10:56:07 2018: Merging intron data
Sat Dec 29 10:56:07 2018: Reading exon data files
Sat Dec 29 10:56:09 2018: Merging exon data
Sat Dec 29 10:56:10 2018: Reading transcript data files
Sat Dec 29 10:56:10 2018: Merging transcript data
Wrapping up the results
Sat Dec 29 10:56:10 2018
bg_chrX
ballgown instance with 3354 transcripts and 12 samples
過濾掉表達量低的基因
bg_chrX_filt=subset(bg_chrX,"rowVars(texpr(bg_chrX))>1",genomesubset=TRUE)
對轉錄本進行差異分析
results_transcripts=stattest(bg_chrX_filt,feature="transcript",covariate="sex",adjustvars=c("population"),getFC=TRUE,meas="FPKM")
對基因進行差異分析
results_genes=stattest(bg_chrX_filt,feature="gene",covariate="sex",adjustvars=c("population"),getFC=TRUE,meas="FPKM")
給轉錄本添加基因名和基因ID
results_transcripts=data.frame(geneNames=ballgown::geneNames(bg_chrX_filt),geneIDs=geneIDs(bg_chrX_filt),results_transcripts)
根據p-value值對分析結果進行排序(從小到大)
results_transcripts=arrange(results_transcripts,pval)
results_genes=arrange(results_genes,pval)
write.csv(results_transcripts, "chrX_transcript_results.csv",row.names=FALSE)
write.csv(results_genes, "chrX_gene_results.csv",row.names=FALSE)
提取顯著差異表達的基因和轉錄本(q-value<0.05)
results_transcripts_0.05=subset(results_transcripts,results_transcripts$qval<0.05)
results_genes_0.05=subset(results_genes,results_genes$qval<0.05)
write.csv(results_transcripts_0.05,file="chrX_transcript_0.05.csv",row.names=FALSE)
write.csv(results_genes_0.05,file="chrX_genes_0.05.csv",row.names=FALSE)
數據可視化之顏色設定
tropical= c('darkorange', 'dodgerblue','hotpink', 'limegreen', 'yellow')
palette(tropical)
#當然rainbow()函數也可以完成這個任務
#palette(rainbow(5))
以FPKM爲參考值作圖,以性別作爲區分條件
options(repr.plot.width = 9, repr.plot.height = 6)
fpkm = texpr(bg_chrX,meas="FPKM")
#方便作圖將其log轉換,+1是爲了避免出現log2(0)的情況
fpkm = log2(fpkm+1)
# tiff(filename ="FPKM.tiff", compression = "lzw")
boxplot(fpkm,col=as.numeric(pheno_data$sex),las=2,ylab='log2(FPKM+1)')
# dev.off()
suppressMessages(library(reshape))
suppressMessages(library(ggplot2))
fpkm_L <- melt(fpkm)
tail(fpkm_L)
X1 | X2 | value | |
---|---|---|---|
40243 | 3349 | FPKM.ERR204916 | 0.00000000 |
40244 | 3350 | FPKM.ERR204916 | 3.18196839 |
40245 | 3351 | FPKM.ERR204916 | 0.04649665 |
40246 | 3352 | FPKM.ERR204916 | 0.00000000 |
40247 | 3353 | FPKM.ERR204916 | 0.00000000 |
40248 | 3354 | FPKM.ERR204916 | 0.00000000 |
colnames(fpkm_L)=c('n','sample','value')
head(fpkm_L)
n | sample | value |
---|---|---|
1 | FPKM.ERR188044 | 2.218154 |
2 | FPKM.ERR188044 | 0.000000 |
3 | FPKM.ERR188044 | 5.333466 |
4 | FPKM.ERR188044 | 0.000000 |
5 | FPKM.ERR188044 | 0.000000 |
6 | FPKM.ERR188044 | 8.949343 |
group_list=as.character(pheno_data$sex)
group_list
- 'male'
- 'male'
- 'female'
- 'female'
- 'male'
- 'female'
- 'female'
- 'male'
- 'male'
- 'female'
- 'male'
- 'female'
fpkm_L$group=rep(group_list,each=nrow(fpkm))
tail(fpkm_L)
n | sample | value | group | |
---|---|---|---|---|
40243 | 3349 | FPKM.ERR204916 | 0.00000000 | female |
40244 | 3350 | FPKM.ERR204916 | 3.18196839 | female |
40245 | 3351 | FPKM.ERR204916 | 0.04649665 | female |
40246 | 3352 | FPKM.ERR204916 | 0.00000000 | female |
40247 | 3353 | FPKM.ERR204916 | 0.00000000 | female |
40248 | 3354 | FPKM.ERR204916 | 0.00000000 | female |
options(repr.plot.width = 9, repr.plot.height = 5)
p=ggplot(fpkm_L,aes(x=sample,y=value,fill=group))+
geom_boxplot() +
theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1,size = 6))
p
#?graphics::boxplot
Subread -> featureCounts -> DESeq2
流程代碼
#!/usr/bin/env bash
set -ue
settings(){
samples=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/samples
index=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/index
output=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/subread+featurecounts
if test -w $(dirname $output) && test -w $(dirname index);then
mkdir -p {$index/subread,$output/1_subjunc,$output/2_featurecounts}
else
echo "沒有寫入權限"
fi
genome=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genome/chrX.fa
gene=/data/Data_base/test_tmp/RNA_seq_practice/chrX_data/genes/chr_X.gtf
}
thread(){
tmp_fifofile="/tmp/$$.fifo"
mkfifo "$tmp_fifofile"
exec 6<>"$tmp_fifofile"
rm $tmp_fifofile
thread_num=$1
for((i=0;i<$thread_num;i++));do
echo
done >&6
$2 6
exec 6>&-
}
index(){
printf "[%s %s %s %s %s %s]::建立索引subread-buildindex\n" $(echo `date`)
start=$(date +%s.%N)
file_num=`ls -l $index/subread|wc -l`
source activate RNA
base_name=$(basename $genome)
name=`awk -v each=$base_name 'BEGIN{split(each,arr,".");print arr[1]}' `
if [ $file_num -lt 2 ];then
subread-buildindex -o $index/subread/$name $genome &> $index/subread/index.log
fi
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::建立索引耗時%.2f分鐘\n" $(echo `date`) $dur
source deactivate RNA
}
mapping(){
printf "[%s %s %s %s %s %s]::與參考基因組比對subjunc\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/1_subjunc
find $samples/cleandata -name *1?f*q.gz|sort > $dir/1
find $samples/cleandata -name *2?f*q.gz|sort > $dir/2
paste -d ":" $dir/1 $dir/2 > $dir/config && rm $dir/1 $dir/2
file_num=`ls -l $dir|wc -l`
index_prefix=`awk -v each=$(basename $genome) 'BEGIN{split(each,arr,".");print arr[1]}' `
source activate RNA
if [ $file_num -lt 3 ];then
for id in $(cat $dir/config);do
fq1=$(echo $id|cut -d":" -f1)
fq2=$(echo $id |cut -d":" -f2)
name=`awk -v each=$(basename $fq1) 'BEGIN{split(each,arr,"_");print arr[1]}' `
read -u$1
{
subjunc -T 5 -i $index/subread/$index_prefix -r $fq1 -R $fq2 -o $dir/${name}.subjunc.bam &> $dir/${name}.log
printf "[%s %s %s %s %s %s]::%s比對完成\n" $(echo `date`) $name
echo >&$1
} &
done && wait
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::比對耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
count(){
printf "[%s %s %s %s %s %s]:: 轉錄本定量featureCounts\n" $(echo `date`)
start=$(date +%s.%N)
dir=$output/2_featurecounts
file_num=`ls -l $dir|wc -l`
source activate RNA
if [ $file_num -lt 3 ];then
featureCounts -T 5 -p -t exon -g gene_id -a $gene -o $dir/all.id.txt $output/1_subjunc/*.bam 1>$dir/counts.log 2>&1
# -g 默認gene_id, 可以選擇gene_name,後面可以用R將ID轉換成gene symbol
# 這樣得到的all.id.txt文件就是表達矩陣,但是,這個featureCounts有非常多的參數可以調整
dur=$(echo "($(date +%s.%N) - $start)/60" | bc -l)
printf "[%s %s %s %s %s %s]::轉錄本定量耗時%.2f分鐘\n" $(echo `date`) $dur
fi
source deactivate RNA
}
settings
index
thread 3 mapping
count
DESeq2差異分析
差異分析需要的包:
- limma
- edgeR
- gplots
- org.Mm.eg.db
- RColorBrewer
- Glimma
- DESeq2
DESeq2差異分析的三個主要數據:
- 表達矩陣(原始的count數據)
- 分組矩陣(實驗設計的分組情況)
- 差異比較矩陣(指定差異比較對象)
DESeq2差異分析的三個主要步驟:
- 構建
DESeqDataSet
(dds對象) DESeq()
函數進行差異分析results()
函數提取差異分析結果
構建
DESeqDataSet
對象有兩種方式,一種是通過DESeqDataSet()
函數將SummarizedExperiment
->DESeqDataSet
,如果我們只有表達矩陣和樣本表型數據,如何構建DESeqDataSet數據集,可以通過另一種方式,使用DESeqDataSetFromMatrix()
函數將表達矩陣和樣本表型數據轉化爲DESeqDataSet
數據對象。
colData
裏面的信息,就相當於我們自己準備的表型數據,建議是csv,tsv格式(comma-separated value;tab- separated value)文件,可以把樣本表型的數據賦值給colData
對象
suppressMessages(library(edgeR))
suppressMessages(library(limma))
suppressMessages(library(Glimma))
suppressMessages(library(gplots))
suppressMessages(library(DESeq2))
suppressMessages(library(org.Hs.eg.db))
suppressMessages(library(RColorBrewer))
讀取數據,提取表達矩陣
#讀取count table數據
data <- read.table("all.id.txt",stringsAsFactors = F, header = T)
meta = data[,1:6]
exprSet = data[,7:ncol(data)]
rownames(exprSet) <- data[,1]
colnames(exprSet)
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188044.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188104.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188234.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188245.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188257.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188273.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188337.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188383.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188401.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188428.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR188454.subjunc.bam'
- 'X.data.Data_base.test_tmp.RNA_seq_practice.chrX_data.subread.featurecounts.1_subjunc.ERR204916.subjunc.bam'
class(exprSet)
‘data.frame’
修改列名
# suppressMessages(library(dplyr))
# name <- lapply(strsplit(as.character(colnames(exprSet)),'[.]'),function(x){x[length(x)-2]}) %>% unlist
name <- unlist(lapply(strsplit(as.character(colnames(exprSet)),'[.]'),function(x){x[length(x)-2]}))
colnames(exprSet) <- name
colnames(exprSet)
- 'ERR188044'
- 'ERR188104'
- 'ERR188234'
- 'ERR188245'
- 'ERR188257'
- 'ERR188273'
- 'ERR188337'
- 'ERR188383'
- 'ERR188401'
- 'ERR188428'
- 'ERR188454'
- 'ERR204916'
head(exprSet,6)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
ENSG00000182378 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
ENSG00000178605 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
ENSG00000226179 | 13 | 13 | 9 | 4 | 1 | 2 | 6 | 3 | 8 | 4 | 7 | 10 |
ENSG00000167393 | 243 | 173 | 236 | 282 | 171 | 76 | 279 | 60 | 327 | 111 | 96 | 258 |
ENSG00000275287 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
exprSet[rownames(exprSet)=="ENSG00000205755",]
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000205755 | 9 | 10 | 5 | 3 | 4 | 2 | 8 | 4 | 6 | 3 | 2 | 5 |
查看數據維度,即幾行幾列
dim(exprSet)
- 2450
- 12
表型數據讀取,樣本分組信息
# Read the sample information into R
coldata <- read.csv("geuvadis_phenodata.csv")
coldata
ids | sex | population |
---|---|---|
ERR188044 | male | YRI |
ERR188104 | male | YRI |
ERR188234 | female | YRI |
ERR188245 | female | GBR |
ERR188257 | male | GBR |
ERR188273 | female | YRI |
ERR188337 | female | GBR |
ERR188383 | male | GBR |
ERR188401 | male | GBR |
ERR188428 | female | GBR |
ERR188454 | male | YRI |
ERR204916 | female | YRI |
coldata$group <- factor(paste0(coldata$sex, coldata$population))
coldata
ids | sex | population | group |
---|---|---|---|
ERR188044 | male | YRI | maleYRI |
ERR188104 | male | YRI | maleYRI |
ERR188234 | female | YRI | femaleYRI |
ERR188245 | female | GBR | femaleGBR |
ERR188257 | male | GBR | maleGBR |
ERR188273 | female | YRI | femaleYRI |
ERR188337 | female | GBR | femaleGBR |
ERR188383 | male | GBR | maleGBR |
ERR188401 | male | GBR | maleGBR |
ERR188428 | female | GBR | femaleGBR |
ERR188454 | male | YRI | maleYRI |
ERR204916 | female | YRI | femaleYRI |
suppressMessages(library(pheatmap))
suppressMessages(library(corrplot))
可視化樣本間的相似性
options(repr.plot.width = 5, repr.plot.height = 5)
# png('cor.png')
corrplot(cor(log2(exprSet+1)))
# dev.off()
m=cor(log2(exprSet+1))
#cor矩陣用scale歸一化
pheatmap(scale(cor(log2(exprSet+1))))
構建DESeqDataSet(dds)對象
#將讀取數據由data.frame轉換成matrix類型
countdata <- as.matrix(exprSet)
#將countdata轉換成DESeq2的數據格式(構建dds)
dds <- DESeqDataSetFromMatrix(countData = countdata,colData = coldata, design = ~ group )
dds
class: DESeqDataSet
dim: 2450 12
metadata(1): version
assays(1): counts
rownames(2450): ENSG00000228572 ENSG00000182378 ... ENSG00000276543
ENSG00000227159
rowData names(0):
colnames(12): ERR188044 ERR188104 ... ERR188454 ERR204916
colData names(4): ids sex population group
#將所有樣本中count值均爲0的基因所在行去除
dds <- dds[rowSums(counts(dds) > 0) != 0 ,] #dds[rowSums(assay(dds) > 0) != 0 , ]
dds
class: DESeqDataSet
dim: 1412 12
metadata(1): version
assays(1): counts
rownames(1412): ENSG00000228572 ENSG00000182378 ... ENSG00000182484
ENSG00000227159
rowData names(0):
colnames(12): ERR188044 ERR188104 ... ERR188454 ERR204916
colData names(4): ids sex population group
# DESeq分析,大小因子的估計,離差估計,負二項分佈的擬合以及計算相應的統計量
dds <- DESeq(dds, parallel = T) #parallel = T程序並行,提高速度
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 1 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 1 workers
plotDispEsts(dds, main="Dispersion plot")
resultsNames(dds)
- 'Intercept'
- 'group_femaleYRI_vs_femaleGBR'
- 'group_maleGBR_vs_femaleGBR'
- 'group_maleYRI_vs_femaleGBR'
使用rlog-transformed歸一化數據
rld <- rlog(dds)
countdata_rlog <- assay(rld)
head(assay(rld))
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | -1.716339 | -1.743507 | -1.743821 | -1.737338 | -1.739732 | -1.731162 | -1.716953 | -1.740015 | -1.744537 | -1.736857 | -1.713780 | -1.740743 |
ENSG00000182378 | 11.461408 | 10.989926 | 11.138604 | 10.963768 | 10.327285 | 11.675548 | 11.168207 | 10.169948 | 10.611627 | 10.418765 | 10.644815 | 11.174434 |
ENSG00000178605 | 10.007361 | 9.640540 | 10.095996 | 10.269333 | 10.421237 | 9.647242 | 10.331475 | 10.256303 | 10.378303 | 9.782669 | 10.204764 | 10.367591 |
ENSG00000226179 | 2.753014 | 2.747507 | 2.620197 | 2.538756 | 2.352026 | 2.503809 | 2.517179 | 2.450266 | 2.573627 | 2.546540 | 2.612662 | 2.723157 |
ENSG00000167393 | 7.540045 | 7.216756 | 7.476484 | 8.109476 | 7.458899 | 7.303154 | 7.627266 | 6.576983 | 7.734151 | 7.262772 | 6.894270 | 7.775687 |
ENSG00000275287 | -2.216337 | -2.207251 | -2.216517 | -2.214314 | -2.215128 | -2.212215 | -2.216546 | -2.215224 | -2.216760 | -2.214150 | -2.215468 | -2.215471 |
使用rlog-transformed 與 正常 區別
options(repr.plot.width = 9, repr.plot.height = 9)
par(cex = 0.7)
n.sample=ncol(countdata)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
par(mfrow=c(2,2))
boxplot(countdata, col = cols,main="expression value",las=2)
boxplot(countdata_rlog, col = cols,main="expression value",las=2)
hist(countdata)
hist(countdata_rlog)
熱圖呈現樣品間的距離
mycols <- brewer.pal(8, "Dark2")[1:length(unique(coldata$group))]
sampleDists <- as.matrix(dist(t(countdata_rlog)))
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[coldata$group], RowSideColors=mycols[coldata$group],
margin=c(10, 10), main="Sample Distance Matrix")
差異分析及結果提取
#提取性別間差異分析的結果
sex_Y_result <- results(dds, contrast = c("group","maleYRI","femaleYRI"), parallel = T)
#?results 查看用法
sex_G_result <- results(dds, contrast = c("group","maleGBR","femaleGBR"), parallel = T)
sex_G_result
log2 fold change (MLE): group maleGBR vs femaleGBR
Wald test p-value: group maleGBR vs femaleGBR
DataFrame with 1412 rows and 6 columns
baseMean log2FoldChange lfcSE
<numeric> <numeric> <numeric>
ENSG00000228572 0.212064658186015 -1.18134449838418 4.40721523394822
ENSG00000182378 2080.8358537599 -0.759383223494156 0.369671551206202
ENSG00000178605 1146.43865301777 0.288751709902595 0.319524288527945
ENSG00000226179 6.08058463520939 -0.493246072149153 0.820198203867892
ENSG00000167393 183.376192726023 -0.562850597829534 0.484234566315877
... ... ... ...
ENSG00000124334 60.2368766837945 0.489304618353469 0.581968571149684
ENSG00000270726 0.209832834505338 -0.22070381980218 4.40735632395141
ENSG00000185203 1.03705472979752 1.36980197787972 2.33437609518548
ENSG00000182484 493.281659491792 0.264865107532117 0.359069049586416
ENSG00000227159 3.18504610277688 -0.645544611222065 1.25294052192237
stat pvalue padj
<numeric> <numeric> <numeric>
ENSG00000228572 -0.268047834216135 0.788662499091892 0.992519295917494
ENSG00000182378 -2.05421061214032 0.0399553117393148 0.992519295917494
ENSG00000178605 0.903692521256771 0.366158465847581 0.992519295917494
ENSG00000226179 -0.601374240790949 0.547590751585147 0.992519295917494
ENSG00000167393 -1.16235113513638 0.245092863468186 0.992519295917494
... ... ... ...
ENSG00000124334 0.840774987877512 0.400474001859852 0.992519295917494
ENSG00000270726 -0.0500762369955846 0.960061636105264 0.992519295917494
ENSG00000185203 0.58679575270877 0.557340889682515 0.992519295917494
ENSG00000182484 0.737643937390858 0.460730848276013 0.992519295917494
ENSG00000227159 -0.515223667785613 0.606396732014109 0.992519295917494
結果說明
- 第一列基因名
- baseMean 經過矯正的reads count均值
- log2FoldChange 對差異倍數取以2爲底的對數
- lfcSE 差異倍數取對數後的標準差
- stat Wald統計量,由log2FoldChange除以標準差所得
- pvalue 原始的p值
- padj 校正後的p值
summary(sex_G_result)
out of 1412 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 2, 0.14%
LFC < 0 (down) : 13, 0.92%
outliers [1] : 1, 0.071%
low counts [2] : 0, 0%
(mean count < 0)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
#在羣體間差異分析的基因分析結果
population_F_result <- results(dds, contrast = c("group","femaleYRI","femaleGBR"), parallel = T)
population_M_result <- results(dds, contrast = c("group","maleYRI","maleGBR"), parallel = T)
#提取性別間差異表達的基因,並存入文件
sex_Y_result_0.01 <- sex_Y_result[which(sex_Y_result$pvalue < 0.01),]
write.csv(sex_Y_result_0.01, file="DE/sex_Y_result_0.01_DESeq2.csv")
#提取種羣間差異表達的基因,並存入文件
population_F_result_0.01 <- subset(population_F_result,pvalue < 0.01)
write.csv(population_F_result_0.01, file="DE/population_F_result_0.01_DESeq2.csv")
流程參考文獻RNA-Seq workflow: gene-level exploratory analysis and differential expression
edgeR差異分析
edgeR差異分析的三個主要數據:
- 表達矩陣(原始的count數據)
- 分組矩陣(實驗設計的分組情況)
- 差異比較矩陣(指定差異比較對象)
edgeR差異分析的五個主要步驟:
- DGElist()函數構造DGEList對象,轉化counts 成DGEList對象
- model.matrix()函數構建設計矩陣
- estimateDisp()函數進行差異分析
- glmQLFit()函數構建DGEGLM對象
- glmQLFTest()函數進行QL F-test
讀取數據,提取表達矩陣
上面已經做過了
表型數據讀取,樣本分組信息
上面已經做過了
#再轉換爲數據框
class(countdata)
head(countdata,3)
countdata <- as.data.frame(countdata)
dim(countdata)
class(countdata)
‘matrix’
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
ENSG00000182378 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
ENSG00000178605 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
- 2450
- 12
‘data.frame’
coldata
ids | sex | population | group |
---|---|---|---|
ERR188044 | male | YRI | maleYRI |
ERR188104 | male | YRI | maleYRI |
ERR188234 | female | YRI | femaleYRI |
ERR188245 | female | GBR | femaleGBR |
ERR188257 | male | GBR | maleGBR |
ERR188273 | female | YRI | femaleYRI |
ERR188337 | female | GBR | femaleGBR |
ERR188383 | male | GBR | maleGBR |
ERR188401 | male | GBR | maleGBR |
ERR188428 | female | GBR | femaleGBR |
ERR188454 | male | YRI | maleYRI |
ERR204916 | female | YRI | femaleYRI |
過濾表達量低的基因
在這個數據集中,我們選擇保留至少在兩個樣本中的CPM(counts-per-million)值大於0.5的基因(根據實際情況作調整)
使用edgeR
包中的cpm
函數
myCPM <- cpm(countdata)
# Have a look at the output
head(myCPM)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | 0.8414109 | 0.0000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.9116842 | 0.000000 | 0.0000 | 0.00000 | 1.058151 | 0.00000 |
ENSG00000182378 | 3473.3441034 | 2263.3743096 | 2255.726656 | 2125.22542 | 1185.821950 | 3899.517816 | 2963.8854523 | 976.066093 | 1629.1449 | 1167.61147 | 1605.214567 | 2623.31454 |
ENSG00000178605 | 1039.1424340 | 730.5070523 | 1010.692058 | 1348.62557 | 1692.180876 | 649.603988 | 1633.7381515 | 1404.362993 | 1648.4198 | 788.82113 | 1320.572036 | 1489.62082 |
ENSG00000226179 | 10.9383414 | 11.1201308 | 6.469579 | 5.21208 | 1.177579 | 3.787778 | 5.4701054 | 3.381291 | 6.7043 | 5.20674 | 7.407055 | 10.14037 |
ENSG00000167393 | 204.4628433 | 147.9832787 | 169.646747 | 367.45161 | 201.365992 | 143.935577 | 254.3599019 | 67.625826 | 274.0383 | 144.48704 | 101.582464 | 261.62163 |
ENSG00000275287 | 0.0000000 | 0.8553947 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0000000 | 0.000000 | 0.0000 | 0.00000 | 0.000000 | 0.00000 |
# Which values in myCPM are greater than 0.5?
thresh <- myCPM > 0.5
# This produces a logical matrix with TRUEs and FALSEs
head(thresh)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 | TRUE | FALSE | FALSE | FALSE | FALSE | FALSE | TRUE | FALSE | FALSE | FALSE | TRUE | FALSE |
ENSG00000182378 | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE |
ENSG00000178605 | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE |
ENSG00000226179 | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE |
ENSG00000167393 | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE | TRUE |
ENSG00000275287 | FALSE | TRUE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE |
# Summary of how many TRUEs there are in each row
# There are 672 genes that have TRUEs in all 12 samples.
table(rowSums(thresh))
0 1 2 3 4 5 6 7 8 9 10 11 12
1038 208 106 79 58 45 40 44 28 34 40 58 672
# we would like to keep genes that have at least 2 TRUES in each row of thresh
keep <- rowSums(thresh) >= 2
# Subset the rows of countdata to keep the more highly expressed genes
counts.keep <- countdata[keep,]
summary(keep)
Mode FALSE TRUE
logical 1246 1204
dim(counts.keep)
- 1204
- 12
ID轉換,基因註釋
# rm(list=ls())
ls('package:org.Hs.eg.db')
- 'org.Hs.eg'
- 'org.Hs.eg_dbconn'
- 'org.Hs.eg_dbfile'
- 'org.Hs.eg_dbInfo'
- 'org.Hs.eg_dbschema'
- 'org.Hs.eg.db'
- 'org.Hs.egACCNUM'
- 'org.Hs.egACCNUM2EG'
- 'org.Hs.egALIAS2EG'
- 'org.Hs.egCHR'
- 'org.Hs.egCHRLENGTHS'
- 'org.Hs.egCHRLOC'
- 'org.Hs.egCHRLOCEND'
- 'org.Hs.egENSEMBL'
- 'org.Hs.egENSEMBL2EG'
- 'org.Hs.egENSEMBLPROT'
- 'org.Hs.egENSEMBLPROT2EG'
- 'org.Hs.egENSEMBLTRANS'
- 'org.Hs.egENSEMBLTRANS2EG'
- 'org.Hs.egENZYME'
- 'org.Hs.egENZYME2EG'
- 'org.Hs.egGENENAME'
- 'org.Hs.egGO'
- 'org.Hs.egGO2ALLEGS'
- 'org.Hs.egGO2EG'
- 'org.Hs.egMAP'
- 'org.Hs.egMAP2EG'
- 'org.Hs.egMAPCOUNTS'
- 'org.Hs.egOMIM'
- 'org.Hs.egOMIM2EG'
- 'org.Hs.egORGANISM'
- 'org.Hs.egPATH'
- 'org.Hs.egPATH2EG'
- 'org.Hs.egPFAM'
- 'org.Hs.egPMID'
- 'org.Hs.egPMID2EG'
- 'org.Hs.egPROSITE'
- 'org.Hs.egREFSEQ'
- 'org.Hs.egREFSEQ2EG'
- 'org.Hs.egSYMBOL'
- 'org.Hs.egSYMBOL2EG'
- 'org.Hs.egUCSCKG'
- 'org.Hs.egUNIGENE'
- 'org.Hs.egUNIGENE2EG'
- 'org.Hs.egUNIPROT'
entrezID2ensemblID <- toTable(org.Hs.egENSEMBL)
head(entrezID2ensemblID)
gene_id | ensembl_id |
---|---|
1 | ENSG00000121410 |
2 | ENSG00000175899 |
3 | ENSG00000256069 |
9 | ENSG00000171428 |
10 | ENSG00000156006 |
12 | ENSG00000196136 |
entrezID2symbol <- toTable(org.Hs.egSYMBOL)
head(entrezID2symbol)
gene_id | symbol |
---|---|
1 | A1BG |
2 | A2M |
3 | A2MP1 |
9 | NAT1 |
10 | NAT2 |
11 | NATP |
columns(org.Hs.eg.db)
- 'ACCNUM'
- 'ALIAS'
- 'ENSEMBL'
- 'ENSEMBLPROT'
- 'ENSEMBLTRANS'
- 'ENTREZID'
- 'ENZYME'
- 'EVIDENCE'
- 'EVIDENCEALL'
- 'GENENAME'
- 'GO'
- 'GOALL'
- 'IPI'
- 'MAP'
- 'OMIM'
- 'ONTOLOGY'
- 'ONTOLOGYALL'
- 'PATH'
- 'PFAM'
- 'PMID'
- 'PROSITE'
- 'REFSEQ'
- 'SYMBOL'
- 'UCSCKG'
- 'UNIGENE'
- 'UNIPROT'
help('select')
ann <- select(org.Hs.eg.db,keys = rownames(counts.keep), keytype="ENSEMBL",columns = c("ENSEMBL","SYMBOL","GENENAME"))
'select()' returned 1:many mapping between keys and columns
head(ann)
dim(ann)
ENSEMBL | SYMBOL | GENENAME |
---|---|---|
ENSG00000228572 | NA | NA |
ENSG00000182378 | PLCXD1 | phosphatidylinositol specific phospholipase C X domain containing 1 |
ENSG00000178605 | GTPBP6 | GTP binding protein 6 (putative) |
ENSG00000226179 | NA | NA |
ENSG00000167393 | PPP2R3B | protein phosphatase 2 regulatory subunit B''beta |
ENSG00000237531 | NA | NA |
- 1213
- 3
symbol <- ann[match(rownames(counts.keep),ann$ENSEMBL),2]
rownames(counts.keep) <- paste(rownames(counts.keep) ,':', symbol)
head(counts.keep)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000228572 : NA | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
ENSG00000182378 : PLCXD1 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
ENSG00000178605 : GTPBP6 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
ENSG00000226179 : NA | 13 | 13 | 9 | 4 | 1 | 2 | 6 | 3 | 8 | 4 | 7 | 10 |
ENSG00000167393 : PPP2R3B | 243 | 173 | 236 | 282 | 171 | 76 | 279 | 60 | 327 | 111 | 96 | 258 |
ENSG00000237531 : NA | 0 | 1 | 4 | 0 | 0 | 0 | 4 | 0 | 0 | 6 | 2 | 0 |
counts.keep <- counts.keep[grep(".*: NA$",rownames(counts.keep),invert = T), ]
head(counts.keep)
dim(counts.keep)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000182378 : PLCXD1 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
ENSG00000178605 : GTPBP6 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
ENSG00000167393 : PPP2R3B | 243 | 173 | 236 | 282 | 171 | 76 | 279 | 60 | 327 | 111 | 96 | 258 |
ENSG00000205755 : CRLF2 | 9 | 10 | 5 | 3 | 4 | 2 | 8 | 4 | 6 | 3 | 2 | 5 |
ENSG00000198223 : CSF2RA | 7 | 2 | 3 | 1 | 1 | 2 | 1 | 6 | 2 | 0 | 0 | 7 |
ENSG00000185291 : IL3RA | 651 | 664 | 195 | 202 | 455 | 216 | 283 | 413 | 693 | 221 | 593 | 620 |
- 697
- 12
構建DGEList對象(edgeR的數據分析格式)
edgeR_data <- DGEList(counts = counts.keep, group = coldata$group)
# See what slots are stored in edgeR_data
names(edgeR_data)
head(edgeR_data$counts)
- 'counts'
- 'samples'
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000182378 : PLCXD1 | 4128 | 2646 | 3138 | 1631 | 1007 | 2059 | 3251 | 866 | 1944 | 897 | 1517 | 2587 |
ENSG00000178605 : GTPBP6 | 1235 | 854 | 1406 | 1035 | 1437 | 343 | 1792 | 1246 | 1967 | 606 | 1248 | 1469 |
ENSG00000167393 : PPP2R3B | 243 | 173 | 236 | 282 | 171 | 76 | 279 | 60 | 327 | 111 | 96 | 258 |
ENSG00000205755 : CRLF2 | 9 | 10 | 5 | 3 | 4 | 2 | 8 | 4 | 6 | 3 | 2 | 5 |
ENSG00000198223 : CSF2RA | 7 | 2 | 3 | 1 | 1 | 2 | 1 | 6 | 2 | 0 | 0 | 7 |
ENSG00000185291 : IL3RA | 651 | 664 | 195 | 202 | 455 | 216 | 283 | 413 | 693 | 221 | 593 | 620 |
# Library size information is stored in the samples slot
edgeR_data$samples
group | lib.size | norm.factors | |
---|---|---|---|
ERR188044 | maleYRI | 1062212 | 1 |
ERR188104 | maleYRI | 1037630 | 1 |
ERR188234 | femaleYRI | 1236643 | 1 |
ERR188245 | femaleGBR | 688237 | 1 |
ERR188257 | maleGBR | 769281 | 1 |
ERR188273 | femaleYRI | 476704 | 1 |
ERR188337 | femaleGBR | 1009898 | 1 |
ERR188383 | maleGBR | 786937 | 1 |
ERR188401 | maleGBR | 1087875 | 1 |
ERR188428 | femaleGBR | 697609 | 1 |
ERR188454 | maleYRI | 836018 | 1 |
ERR204916 | femaleYRI | 876941 | 1 |
可以自己手動計算一下,看一致否
apply(counts.keep,2,sum)
- ERR188044
- 1062212
- ERR188104
- 1037630
- ERR188234
- 1236643
- ERR188245
- 688237
- ERR188257
- 769281
- ERR188273
- 476704
- ERR188337
- 1009898
- ERR188383
- 786937
- ERR188401
- 1087875
- ERR188428
- 697609
- ERR188454
- 836018
- ERR204916
- 876941
質控
文庫大小及其分佈圖
edgeR_data$samples$lib.size
- 1062212
- 1037630
- 1236643
- 688237
- 769281
- 476704
- 1009898
- 786937
- 1087875
- 697609
- 836018
- 876941
# The names argument tells the barplot to use the sample names on the x-axis
# The las argument rotates the axis names
options(repr.plot.width = 6, repr.plot.height = 4)
barplot(edgeR_data$samples$lib.size,names=colnames(edgeR_data),las=2)
title("Barplot of library sizes")
# Get log2 counts per million
logcounts <- cpm(edgeR_data,log=TRUE)
# Check distributions of samples using boxplots
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2)
# Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(logcounts),col="blue")
title("Boxplots of logCPMs (unnormalised)")
Multidimensional scaling plots
A principle components analysis is an example of an unsupervised analysis, where we don’t need to specify the groups.
plotMDS(edgeR_data)
按組添加顏色
# We specify the option to let us plot two plots side-by-sde
par(mfrow=c(1,2))
options(repr.plot.width = 9, repr.plot.height = 5)
# Let's set up colour schemes for sex
levels(coldata$sex)
## Let's choose purple for basal and orange for luminal
col.sex <- c("purple","orange")[coldata$sex]
data.frame(coldata$sex,col.sex)
# Redo the MDS with cell type colouring
plotMDS(edgeR_data,col=col.sex)
# Let's add a legend to the plot so we know which colours correspond to which sex
legend("topleft",fill=c("purple","orange"),legend=levels(coldata$sex))
# Add a title
title("sex")
# Similarly for population
levels(coldata$population)
col.population <- c("blue","red")[coldata$population]
plotMDS(edgeR_data,col=col.population)
legend("topleft",fill=c("blue","red"),legend=levels(coldata$population),cex=0.8)
title("population")
- 'female'
- 'male'
coldata.sex | col.sex |
---|---|
male | orange |
male | orange |
female | purple |
female | purple |
male | orange |
female | purple |
female | purple |
male | orange |
male | orange |
female | purple |
male | orange |
female | purple |
- 'GBR'
- 'YRI'
整合成一張圖
cols <- c("#1B9E77", "#D95F02")
col.population <- c("#1B9E77", "#D95F02")[coldata$population]
char.sex <- c(1,4)[coldata$sex]
char.sex
- 4
- 4
- 1
- 1
- 4
- 1
- 1
- 4
- 4
- 1
- 4
- 1
options(repr.plot.width = 6, repr.plot.height = 5)
plotMDS(edgeR_data,dim=c(1,2),col=col.population,pch=char.sex,cex=2)
legend("top",legend=levels(coldata$population),col=cols,pch=16)
legend("right",legend=levels(coldata$sex),pch=c(1,4))
標準化,歸一化
Normalisation for composition bias
# Apply normalisation to DGEList object
edgeR_data <- calcNormFactors(edgeR_data)
This will update the normalisation factors in the DGEList object (their default values are 1). Take a look at the normalisation factors for these samples.
edgeR_data$samples
group | lib.size | norm.factors | |
---|---|---|---|
ERR188044 | maleYRI | 1062212 | 1.0091479 |
ERR188104 | maleYRI | 1037630 | 1.0063310 |
ERR188234 | femaleYRI | 1236643 | 0.8810990 |
ERR188245 | femaleGBR | 688237 | 0.9781766 |
ERR188257 | maleGBR | 769281 | 1.0374696 |
ERR188273 | femaleYRI | 476704 | 0.9219804 |
ERR188337 | femaleGBR | 1009898 | 1.0665011 |
ERR188383 | maleGBR | 786937 | 1.0556198 |
ERR188401 | maleGBR | 1087875 | 1.0580396 |
ERR188428 | femaleGBR | 697609 | 0.9437640 |
ERR188454 | maleYRI | 836018 | 1.0681350 |
ERR204916 | femaleYRI | 876941 | 0.9947293 |
ERR188234樣本的normalisation factors
最小,ERR188337樣本的normalisation factors
最大,
If we plot mean difference plots using the plotMD function for these samples, we should be able to see the composition bias problem. We will use the logcounts, which have been normalised for library size, but not for composition bias.
options(repr.plot.width = 9, repr.plot.height = 5)
par(mfrow=c(1,2))
plotMD(logcounts,column = 3)
abline(h=0,col="grey")
plotMD(logcounts,column = 7)
abline(h=0,col="grey")
The mean-difference plots show average expression (mean: x-axis) against log-fold-changes (difference: y-axis). Because our DGEList object contains the normalisation factors, if we redo these plots using edgeR_data
, we should see the composition bias problem has been solved.
par(mfrow=c(1,2))
plotMD(edgeR_data,column = 3)
abline(h=0,col="grey")
plotMD(edgeR_data,column = 7)
abline(h=0,col="grey")
熱圖層次聚類
# We estimate the variance for each row in the logcounts matrix
var_genes <- apply(logcounts, 1, var)
head(var_genes)
- ENSG00000182378 : PLCXD1
- 0.411283212250483
- ENSG00000178605 : GTPBP6
- 0.233040789399784
- ENSG00000167393 : PPP2R3B
- 0.440930844051291
- ENSG00000205755 : CRLF2
- 0.267770197328411
- ENSG00000198223 : CSF2RA
- 2.65972252131491
- ENSG00000185291 : IL3RA
- 0.468356036950437
# Get the gene names for the top 50 most variable genes
select_var <- names(sort(var_genes, decreasing=TRUE))[1:50]
head(select_var)
- 'ENSG00000229807 : XIST'
- 'ENSG00000198759 : EGFL6'
- 'ENSG00000146938 : NLGN4X'
- 'ENSG00000269096 : CT45A3'
- 'ENSG00000175556 : LONRF3'
- 'ENSG00000003096 : KLHL13'
# Subset logcounts matrix
highly_variable_lcpm <- logcounts[select_var,]
dim(highly_variable_lcpm)
- 50
- 12
options(repr.plot.width = 9, repr.plot.height = 6)
my_heatmap <- function(countdata,col_label,row_lable,group){
mypalette <- brewer.pal(11,"RdYlBu")
morecols <- colorRampPalette(mypalette)
col.sex <- c("purple","orange")[group]
# Plot the heatmap
heatmap.2(countdata,labCol=col_label,srtCol=15, labRow=row_lable,
col=rev(morecols(50)),trace="none", main="Top 50 most variable genes across samples",
ColSideColors=col.sex,colCol=col.sex,scale="row")
}
col_label <- paste(coldata$sex ,'.', coldata$population)
row_lable <- unlist(lapply(strsplit(rownames(highly_variable_lcpm),":"),function(x) x[2]))
group <- coldata$sex
my_heatmap(highly_variable_lcpm, col_label, row_lable, group)
建立設計矩陣
design <- model.matrix(~0 + group , data = coldata)
design
groupfemaleGBR | groupfemaleYRI | groupmaleGBR | groupmaleYRI | |
---|---|---|---|---|
1 | 0 | 0 | 0 | 1 |
2 | 0 | 0 | 0 | 1 |
3 | 0 | 1 | 0 | 0 |
4 | 1 | 0 | 0 | 0 |
5 | 0 | 0 | 1 | 0 |
6 | 0 | 1 | 0 | 0 |
7 | 1 | 0 | 0 | 0 |
8 | 0 | 0 | 1 | 0 |
9 | 0 | 0 | 1 | 0 |
10 | 1 | 0 | 0 | 0 |
11 | 0 | 0 | 0 | 1 |
12 | 0 | 1 | 0 | 0 |
#修改設計矩陣的列名,行名
colnames(design)
- 'groupfemaleGBR'
- 'groupfemaleYRI'
- 'groupmaleGBR'
- 'groupmaleYRI'
levels(coldata$group)
- 'femaleGBR'
- 'femaleYRI'
- 'maleGBR'
- 'maleYRI'
colnames(design) <- levels(coldata$group)
colnames(design)
rownames(design) <- coldata$id
rownames(design)
design
- 'femaleGBR'
- 'femaleYRI'
- 'maleGBR'
- 'maleYRI'
- 'ERR188044'
- 'ERR188104'
- 'ERR188234'
- 'ERR188245'
- 'ERR188257'
- 'ERR188273'
- 'ERR188337'
- 'ERR188383'
- 'ERR188401'
- 'ERR188428'
- 'ERR188454'
- 'ERR204916'
femaleGBR | femaleYRI | maleGBR | maleYRI | |
---|---|---|---|---|
ERR188044 | 0 | 0 | 0 | 1 |
ERR188104 | 0 | 0 | 0 | 1 |
ERR188234 | 0 | 1 | 0 | 0 |
ERR188245 | 1 | 0 | 0 | 0 |
ERR188257 | 0 | 0 | 1 | 0 |
ERR188273 | 0 | 1 | 0 | 0 |
ERR188337 | 1 | 0 | 0 | 0 |
ERR188383 | 0 | 0 | 1 | 0 |
ERR188401 | 0 | 0 | 1 | 0 |
ERR188428 | 1 | 0 | 0 | 0 |
ERR188454 | 0 | 0 | 0 | 1 |
ERR204916 | 0 | 1 | 0 | 0 |
估計離散度
edgeR_data <- estimateDisp(edgeR_data, design)
names(edgeR_data)
- 'counts'
- 'samples'
- 'design'
- 'common.dispersion'
- 'trended.dispersion'
- 'tagwise.dispersion'
- 'AveLogCPM'
- 'trend.method'
- 'prior.df'
- 'prior.n'
- 'span'
構建DGEGLM對象
fit <- glmQLFit(edgeR_data, design)
建立對照
?makeContrasts
my.contrasts <- makeContrasts(YRI.f.vs.m = femaleYRI - maleYRI, GBR.f.vs.m = femaleGBR - maleGBR, f.YRI.vs.GBR = femaleYRI - femaleGBR, m.YRI.vs.GBR = maleYRI - maleGBR, levels = design)
差異分析(用quasi-likelihood(QL) F-test)
qlf_YRI.f.vs.m <- glmQLFTest(fit, contrast = my.contrasts[, "YRI.f.vs.m"])
qlf_GBR.f.vs.m <- glmQLFTest(fit, contrast = my.contrasts[, "GBR.f.vs.m"])
qlf_f.YRI.vs.GBR <- glmQLFTest(fit, contrast = my.contrasts[, "f.YRI.vs.GBR"])
qlf_m.YRI.vs.GBR <- glmQLFTest(fit, contrast = my.contrasts[, "m.YRI.vs.GBR"])
names(qlf_m.YRI.vs.GBR)
- 'coefficients'
- 'fitted.values'
- 'deviance'
- 'method'
- 'unshrunk.coefficients'
- 'df.residual'
- 'design'
- 'offset'
- 'dispersion'
- 'prior.count'
- 'AveLogCPM'
- 'df.residual.zeros'
- 'df.prior'
- 'var.post'
- 'var.prior'
- 'samples'
- 'table'
- 'comparison'
- 'df.test'
- 'df.total'
對每個差異分析分析結果的pvalue進行校正
YRI.f.vs.m_padjust <- p.adjust(qlf_YRI.f.vs.m$table$PValue, method = "BH")
GBR.f.vs.m_padjust <- p.adjust(qlf_GBR.f.vs.m$table$PValue, method = "BH")
f.YRI.vs.GBR_padjust <- p.adjust(qlf_f.YRI.vs.GBR$table$PValue, method = "BH")
m.YRI.vs.GBR_padjust <- p.adjust(qlf_m.YRI.vs.GBR$table$PValue, method = "BH")
提取差異表達的基因
YRI.f.vs.m.sig <- qlf_YRI.f.vs.m$table[which(YRI.f.vs.m_padjust < 0.05),]
write.csv(YRI.f.vs.m.sig, file = "DE/YRI.f.vs.m_0.05_edgeR.csv")
GBR.f.vs.m.sig <- qlf_GBR.f.vs.m$table[which(qlf_GBR.f.vs.m$table$PValue < 0.05),]
write.csv(GBR.f.vs.m.sig, file = "DE/GBR.f.vs.m_0.05_edgeR.csv")
write.csv(qlf_GBR.f.vs.m$table, file = "DE/GBR.f.vs.m_edgeR.csv")
結果可視化
火山圖
write.csv(counts.keep, file = "DE/counts.keep.csv")
DE <- read.csv('DE/GBR.f.vs.m_edgeR.csv',row.names = 1)
tail(DE)
logFC | logCPM | F | PValue | |
---|---|---|---|---|
ENSG00000224533 : TMLHE-AS1 | -0.056469327 | 2.052160 | 4.307370e-03 | 0.9486785 |
ENSG00000185973 : TMLHE | 0.221289532 | 8.365665 | 1.286092e+00 | 0.2774115 |
ENSG00000168939 : SPRY3 | -0.002682364 | 4.420135 | 3.410488e-05 | 0.9954298 |
ENSG00000124333 : VAMP7 | -0.137382838 | 10.762397 | 9.979491e-01 | 0.3361840 |
ENSG00000124334 : IL9R | -0.488450923 | 6.182421 | 7.899469e-01 | 0.3903941 |
ENSG00000185203 : WASIR1 | -1.144990880 | 1.771968 | 5.055304e-01 | 0.4897413 |
DEG <- data.frame(DE$PValue,DE$logFC)
colnames(DEG) <- c('p','logFC')
rownames(DEG) <- rownames(DE)
tail(DEG)
p | logFC | |
---|---|---|
ENSG00000224533 : TMLHE-AS1 | 0.9486785 | -0.056469327 |
ENSG00000185973 : TMLHE | 0.2774115 | 0.221289532 |
ENSG00000168939 : SPRY3 | 0.9954298 | -0.002682364 |
ENSG00000124333 : VAMP7 | 0.3361840 | -0.137382838 |
ENSG00000124334 : IL9R | 0.3903941 | -0.488450923 |
ENSG00000185203 : WASIR1 | 0.4897413 | -1.144990880 |
示例數據量少,篩選出的差異基因也少,畫圖的效果不太好
options(repr.plot.width = 8, repr.plot.height = 6)
my_volcano <- function(DEG, t_p, t_FC = 0, t_marker){
## example: print(my_volcano(a[,c(5,1)], 0.01, 0.6, 0))
library(ggplot2)
DEG = na.omit(DEG)
colnames(DEG) = c('p','logFC')
DEG$gene <- unlist(lapply(strsplit(rownames(DEG),":"),function(x) x[2]))
DEG[DEG$p < 1e-10, 'p'] = 1e-10
if (t_FC == 0){
t_FC <- with(DEG, mean(abs(logFC)) + 2 * sd(abs(logFC)))
}
DEG$change = as.factor(ifelse(DEG$p < t_p & abs(DEG$logFC) > t_FC,ifelse(DEG$logFC > t_FC, "UP", "DOWN"),"NOT"))
this_title <- paste0("Cutoff for logFC is ", round(t_FC, 3),
"\nThe number of up gene is ",
nrow(DEG[DEG$change == "UP", ]),
"\nThe number of down gene is ",
nrow(DEG[DEG$change == "DOWN", ])
)
p = ggplot(data = DEG, aes(x = logFC, y = -log10(p), color = change)) +
geom_point() +
scale_color_manual(values = c('blue', 'black', 'red')) +
geom_hline(yintercept = -log10(t_p), lty = 4, lwd = 0.6, alpha = 0.8) +
geom_vline(xintercept = c(t_FC, -t_FC), lty = 4, lwd = 0.6, alpha = 0.8) +
theme_bw() +
ylim(0,5) +
xlim(-10,10) +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")) + ggtitle(this_title) +
labs(x = "log2(fold change)", y = "-log10(p-value)") +
theme(plot.title = element_text(hjust = 0.5))
if (t_marker != 0) {
DE <- subset(DEG, abs(logFC) > t_marker)
p = p + geom_text(data = DE , aes(label = gene), col = "green",alpha = 0.5)
}
return(p)
}
print(my_volcano(DEG, 0.05, 1, 4.2))
Warning message:
“Removed 1 rows containing missing values (geom_point).”Warning message:
“Removed 1 rows containing missing values (geom_text).”
熱圖
suppressMessages(library(pheatmap))
suppressMessages(library(gplots))
suppressMessages(library(ComplexHeatmap))
DE_0.05 <- read.csv('DE/GBR.f.vs.m_0.05_edgeR.csv',row.names = 1)
coldata <- read.csv("geuvadis_phenodata.csv")
counts.keep <- read.csv("DE/counts.keep.csv",row.names = 1)
choose_gene=head(rownames(DE_0.05),50)
choose_matrix=counts.keep[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
head(choose_matrix)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000182378 : PLCXD1 | 1.9215927 | 0.4896378 | 0.9650236 | -0.4910871 | -1.0940155 | -0.07754007 | 1.0742078 | -1.2302541 | -0.1886567 | -1.2003010 | -0.60123750 | 0.4326301 |
ENSG00000185291 : IL3RA | 1.0684102 | 1.1323672 | -1.1750052 | -1.1405668 | 0.1041351 | -1.07169002 | -0.7420654 | -0.1024952 | 1.2750405 | -1.0470912 | 0.78306348 | 0.9158973 |
ENSG00000196433 : ASMT | 2.1365053 | 0.6224464 | 0.5215092 | -0.6897379 | -0.1850516 | -0.89161243 | -0.8916124 | 1.5308817 | -0.3869261 | -0.7906752 | -0.68973792 | -0.2859889 |
ENSG00000183943 : PRKX | -0.2822280 | -0.1640627 | 1.8727853 | -0.8229841 | -1.3016534 | -1.46788583 | 1.3420432 | -0.2101271 | -0.1300151 | 0.2505170 | -0.05190591 | 0.9655167 |
ENSG00000130021 : PUDP | -0.7621578 | 0.2543147 | 2.1551890 | -0.4343396 | -0.9649807 | -0.57820227 | 0.3675181 | -0.8682861 | -0.2739680 | 0.1057352 | -0.67961368 | 1.6787912 |
ENSG00000101846 : STS | -0.3061919 | 1.1965013 | 1.4125252 | -1.2271362 | -0.7609792 | -0.79508827 | 1.5792806 | -0.7041308 | -0.9807931 | 0.1467005 | -0.36872515 | 0.8080371 |
rownames(choose_matrix) <- unlist(lapply(strsplit(rownames(choose_matrix),":"),function(x) x[2]))
head(choose_matrix)
ERR188044 | ERR188104 | ERR188234 | ERR188245 | ERR188257 | ERR188273 | ERR188337 | ERR188383 | ERR188401 | ERR188428 | ERR188454 | ERR204916 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
PLCXD1 | 1.9215927 | 0.4896378 | 0.9650236 | -0.4910871 | -1.0940155 | -0.07754007 | 1.0742078 | -1.2302541 | -0.1886567 | -1.2003010 | -0.60123750 | 0.4326301 |
IL3RA | 1.0684102 | 1.1323672 | -1.1750052 | -1.1405668 | 0.1041351 | -1.07169002 | -0.7420654 | -0.1024952 | 1.2750405 | -1.0470912 | 0.78306348 | 0.9158973 |
ASMT | 2.1365053 | 0.6224464 | 0.5215092 | -0.6897379 | -0.1850516 | -0.89161243 | -0.8916124 | 1.5308817 | -0.3869261 | -0.7906752 | -0.68973792 | -0.2859889 |
PRKX | -0.2822280 | -0.1640627 | 1.8727853 | -0.8229841 | -1.3016534 | -1.46788583 | 1.3420432 | -0.2101271 | -0.1300151 | 0.2505170 | -0.05190591 | 0.9655167 |
PUDP | -0.7621578 | 0.2543147 | 2.1551890 | -0.4343396 | -0.9649807 | -0.57820227 | 0.3675181 | -0.8682861 | -0.2739680 | 0.1057352 | -0.67961368 | 1.6787912 |
STS | -0.3061919 | 1.1965013 | 1.4125252 | -1.2271362 | -0.7609792 | -0.79508827 | 1.5792806 | -0.7041308 | -0.9807931 | 0.1467005 | -0.36872515 | 0.8080371 |
options(repr.plot.width = 8, repr.plot.height = 6)
morecols <- colorRampPalette(c("green","black","red"))
col.sex <- c("purple","orange")[coldata$sex]
label_col = paste(coldata$sex ,'.', coldata$population)
annotation_col = data.frame(population = coldata$population,sex = coldata$sex)
rownames(annotation_col) = coldata$ids
ann_colors = list(population = c(GBR = "#7570B3", YRI = "#E7298A"),
sex = c(male = "#1B9E77", female = "#D95F02")
)
pheatmap(choose_matrix, color = morecols(100), labels_col = label_col, annotation_col = annotation_col,
annotation_colors = ann_colors , fontsize_row = 6, scale="row",
annotation_legend = T, border_color=NA,fontsize = 8, main = "genes GBR.f.vs.m")
# pheatmap(choose_matrix, color = morecols(100), labels_col = label_col, annotation_col = annotation_col,
# annotation_colors = ann_colors , fontsize_row = 6, scale="row",
# annotation_legend = T, fontsize = 8, main = "genes GBR.f.vs.m",filename = 'DEG_top50_heatmap.pdf')
pheatmap參數用法 :使用pheatmap包繪製熱圖
my_heatmap <- function(countdata,col_label,row_lable,group){
morecols <- colorRampPalette(c("red","black","green"))
col.sex <- c("purple","orange")[group]
# Plot the heatmap
heatmap.2(countdata,labCol=col_label,srtCol=15,
col=rev(morecols(50)),trace="none", main="genes GBR.f.vs.m",
ColSideColors=col.sex,colCol=col.sex,scale="row")
}
col_label <- paste(coldata$sex ,'.', coldata$population)
group <- coldata$sex
my_heatmap(choose_matrix, col_label, row_lable, group)
library("GetoptLong")
expr <- choose_matrix
colnames(expr) <- paste(coldata$sex ,'.', coldata$population)
annotation_col = data.frame(population = coldata$population,sex = coldata$sex)
rownames(annotation_col) = coldata$ids
ann_colors = list(population = c(GBR = "#7570B3", YRI = "#E7298A"),
sex = c(male = "#1B9E77", female = "#D95F02")
)
ann <- HeatmapAnnotation(annotation_col, col = ann_colors)
morecols <- colorRampPalette(c("green","black","red"))
Heatmap(expr,heatmap_legend_param = list(title= "legend", title_position = "topcenter",
legend_height=unit(2,"cm"), legend_direction="vertical"), col = morecols(100), column_title = "genes GBR.f.vs.m",
top_annotation = ann,row_names_gp = gpar(fontsize = 6, fontface = "bold") )
#註釋名稱可以使用下面的R代碼添加
for(an in colnames(annotation_col)) {
seekViewport(qq("annotation_@{an}"))
grid.text(an, unit(1, "npc") + unit(2, "mm"), 0.5, default.units = "npc", just = "left")}