R包安装
使用R3.6.0,进行
Copy if ( ! requireNamespace( "BiocManager" , quietly = TRUE ))
install.packages( "BiocManager" )
BiocManager::install( "ballgown" )
##也可以直接使用library加载
library(ballgown )
报错
BiocManager在安装过程中没有权限
https://stackoverflow.com/questions/41839214/installation-path-not-writable-r-unable-to-update-packages 还是没有解决
Copy # Installation path not writeable
installed.packages ()[, c ( "Package" , "LibPath" )]
#查看这些包的安装路径,直接改变权限
1.加载表达文件
当stringtie生成的*.ctab
文件都在同一个目录下的时候,直接使用样本所在的父目录作为参数。samplePattern
样本的前缀,这个例子中包含有20个样本的数据
Copy # make the ballgown object:
#根据目录来提取表达
bg = ballgown( dataDir = data_directory, samplePattern = 'sample' , meas = 'all' )
bg
#直接指定样本文件夹所在路径
ballgown( samples = c (文件目录 ) ,meas = 'all' )
## ballgown instance with 100 transcripts and 20 samples
2.使用插槽访问里面的各种数据
Copy * expr( ballgown_object_name, < EXPRESSION_MEASUREMENT > )
#例如
transcript_fpkm = texpr( bg, 'FPKM' )
transcript_cov = texpr( bg, 'cov' )
whole_tx_table = texpr( bg, 'all' )
gene_expression = gexpr( bg ) ##制作多个样本的表达谱
差异表达分析
Ballgown provides a wide selection of simple, fast statistical methods for testing whether transcripts are differentially expressed between experimental conditions or across a continuous covariate (such as time).
总共15个分类,每类里有两个重复
Copy ##指定每个样本的分组信息,总共15个样,每个样有两个重复
pData(bg ) = data.frame(id = sampleNames ( bg ) , group = rep ( seq(0,14 ) , each = 2 ))
##进行两两间的差异表达分析,对第一组和第二组进行差异表达分析
##这里将所有的样品随机分成两类提,进行差异表达分析
sex = sample ( c( 'M' , 'F' ) , size=nrow ( pData(bg )), replace = TRUE )
age = sample ( 21:52, size=nrow ( pData(bg )) , replace=TRUE )
mod = model.matrix ( ~ sex + age + pData ( bg )$group + pData ( bg )$time)
mod0 = model.matrix ( ~ pData ( bg )$group + pData ( bg )$time)
adjusted_results = stattest ( bg , feature= 'transcript' , meas= 'FPKM' , mod0=mod0, mod=mod )
也可以得到matrix用于到其他模型中,例如DEseq2等。
分析两个重复间的重复性和差异表达基因
Copy ##使用ballgown读取表达量
bg = ballgown( samples = c ( "1-1-1" , "1-1-2" ) ,,meas = 'all' )
##合并成一个表达谱
gene_expression = gexpr( bg )
##计算相关性
cor.test (gene_expression[, 1 ],gene_expression[, 2 ])
##计算重复间差异表达的基因
pData( bg ) = data.frame (id = sampleNames( bg ) ,group = rep ( seq ( 0 , 1 ),each = 2 ))
stat_results = stattest( bg, feature = 'transcript' , meas = 'FPKM' , covariate = 'group' )
使用DESeq2进行差异表达分析
htseq统计比对到gene的read数目
Copy for k in ${all_sample[ @ ]};
do
bsub -q smp -n 10 -R span[hosts=1] -J ${k} -e ${k} .err -o ${k} .out "
htseq-count -f bam -n 10 -r pos -i gene_id -m union -q ${inputDir}/${k}/${k}_sort.bam /data/cotton/zhenpingliu/genome/genome_data/Ghirsutum_genome_HAU_v1.1/Ghirsutum_gene_model.gtf >${k}_count.txt"
sleep 1
done
进行差异表达分析
Copy gene_expression = as.matrix (filter_gene[, - 1 ])
row.names (gene_expression)=filter_gene[, 1 ]
cond <- factor ( rep ( 1 : 2 , each = 2 ))
dds = DESeqDataSetFromMatrix( gene_expression, DataFrame( cond ) , ~ cond )
# 指定哪一组作为control
dds $ cond <- relevel (dds $ cond, ref = "2" )
##进行差异表达分析
dds <- DESeq( dds )
res <- results( dds )
##输出到文件
res $ geneId = rownames (res)
res = res[ c ( 'geneId' , 'baseMean' , 'log2FoldChange' , 'lfcSE' , 'stat' , 'pvalue' , 'padj' )]
write.table (res,file = "test" ,quote = FALSE ,sep = "\t" ,row.names = F)
##统计差异表达基因的数目
awk '$3>=1||$3<=-1{print $0}' difference_expression.txt | awk '$6<=0.05{print $0}' | awk '{print $1"\t"$3}' | wc - l
绘制火山图
Copy ##将DESeq2数据转为data.frame
res = as.data.frame ( res )
##对上调、下调和没有变化的基因进行分类
res = mutate(res,group =pvalue )
resDown = subset ( res,log2FoldChange <=-1 & pvalue <=0.05)
resDown$group = 'down'
resUp = subset ( res,log2FoldChange > =1 & pvalue <=0.05)
resUp$group = 'up'
resnoDiffer = subset ( res,pvalue > 0.05 )
resnoDiffer$group = 'noDiff'
res2 = rbind ( resDown,resUp,resnoDiffer )
##绘制火山图和指明突变基因的位置
比较分类
参考