统计各个棉种中剪切数据
awk '$3~/IntronR/{a+=1}$3~/ExonS/{b+=1}$3~/AltA/{c+=1}$3~/AltD/{d+=1}$3~/AltP/{e+=1}$3~/Other/{f+=1}END{print a,b,c,d,e,f}' OFS="\t" end_third
TM-1去除了scaffold
对应的基因数
for i in IntronR ExonS AltA AltD AltP Other
do
awk '$3=="'"$i"'"{print $2}' end_third|sort |uniq|wc -l
done
统计每个isform发生IR的次数
提取各个棉种中同源基因的剪切事件
cut -f4 ../A2_vs_D5_collinearity.txt|xargs -I {} grep {} ../../D5/end_third|awk '{print $1,$4,$5,$2"-"$1"-"$4"-"$5,$3}' OFS="\t"
从文件夹中static.txt文件中提取
提取独有的基因对应的剪切事件
awk '{print $1"\t"$3}' At_vs_Dt_collinerity.txt|cat - ~/work/Alternative/result/duplicate_gene/At_Dt/At_vs_Dt.gff|awk '$2~/Ghir_A/{print $2}' |sort |uniq -u
进行Blast保守性分析
提取左右两端300bp序列
#提取5'端序列
awk -F "\t" '$4=="+"{print $1,$2-300,$2-1,$4,$5}$4=="-"{print $1,$3+1,$3+300,$4,$5}' OFS="\t" ../D5_IR.txt >5_sequence.bed
#提取3’端序列
awk -F "\t" '$4=="-"{print $1,$2-300,$2-1,$4,$5}$4=="+"{print $1,$3+1,$3+300,$4,$5}' OFS="\t" ../D5_IR.txt >3_sequence.bed
合并两端序列 脚本位置 ~/script/Alternative/merge.py
进行All-vs-All blast
file="A2_D5"
echo "blastn -query blastDB/${file}.fasta -db blastDB/${file} -evalue 1e-5 -num_threads 10 -outfmt 6 -out ${file}.blast"
file="A2_D5"
bsub -J Dt_Dt -q "normal" -n 10 -R span[hosts=1] -e Dt_Dt.err -o Dt_Dt.out "`echo blastn -query blastDB/${file}.fasta -db blastDB/${file} -evalue 1e-5 -num_threads 10 -outfmt \'6 qseqid sseqid qstart qend sstart send nident pident qcovs evalue bitscore\' -out ${file}.blast`"
提取保守的IR
awk '$7>200&&$8>90&&$9>85{print $0}'
使用intron比对到gene上
1.提取对应的同源基因序列
cut -f3 A2_vs_At_collinearity.txt|xargs -I {} grep {} ~/work/Alternative/data/Ga_genome/G.arboreum.Chr
.v1.0.gff|awk '$3~/gene/{print $1,$4,$5,$7,$9}' OFS="\t" |awk -F ";" '{print $1}' |sed 's/ID=//g' |awk '{print $1,$2,$3,$4,"gene-"$5"-"$1"-"$2"-"$3"-"$4}' OFS="\t" >A2_gene.bed
2.提取对应的IR序列
3.将intron序列比对到基因区域,进行筛选
python /public/home/zpliu/work/Alternative/result/duplicate_gene/IR_align_to_gene.py -homologous ../A2_vs_At_collinearity.txt -Blast 1 -out 2
#提取比对到基因区域的IR
awk '$4~/evm/{print $1,$2,$3,"-",$4}' OFS="\t" 2 >A2_IR_to_Atgene.bed
awk '$4~/Ghir/{print $1,$2,$3,"-",$4}' OFS="\t" 2 >At_IR_to_A2gene.bed
这里发现有些IR的intron不能够比对到对应的同源基因上,不知道是不是筛选指标太严格了
把指标松一些,比对上的片段占intron的0.8即可。
#与对应同源基因的IR取交集
~/software/bedtools2-2.29.0/bin/intersectBed -loj -a A2_IR_to_Atgene.bed -b ../At_IR.txt |cut -f5,10 >3
#类似
~/software/bedtools2-2.29.0/bin/intersectBed -loj -a At_IR_to_A2gene.bed -b ../A2_IR.txt|cut -f5,10 >4
#合并两个文件
awk '$2!="."{print $2"\t"$1}$2=="."{print $0}' 4|cat - 3|sort |uniq >A2_At_conserveIR.txt
evm.TU.Ga01G0020-Chr01-129583-130080-- .
evm.TU.Ga01G0021-Chr01-142321-143057-+ .
evm.TU.Ga01G0021-Chr01-142321-143808-+ Ghir_A01G000210-Ghir_A01-157898-158012-+
evm.TU.Ga01G0021-Chr01-142321-143808-+ Ghir_A01G000210-Ghir_A01-158029-158264-+
Ghir_A13G024820-Ghir_A13-108246222-108246433-- .
Ghir_A13G024830-Ghir_A13-108259086-108259204-+ .
其中点表示,对应同源基因序列类似,但是附近没有IR事件
不是点则代表保守的intronR
没有比对上的intron
感觉应该也不会保守,就不管了;直接统计出来
cut -f5 A2_IR_to_Atgene.bed|sort |uniq >666
cut -f5 ../A2_IR.txt|sort |uniq >777
cat 666 777|sort |uniq -u >A2_unmapping_IR
cut -f5 At_IR_to_A2_gene.bed|sort |uniq >666
cut -f5 ../At_IR.txt|sort |uniq >777
cat 666 777|sort |uniq -u >At_unmapping_IR
整套流程
file1=At
file2=Dt
cat ../${file1}_gene.bed.fa ../${file2}_gene.bed.fa >${file1}_${file2}_gene.fa
cat ../${file1}_IR.txt.fa ../${file2}_IR.txt.fa >${file1}_${file2}_IR.fa
makeblastdb -dbtype "nucl" -in ${file1}_${file2}_gene.fa -out ${file1}_${file2}
blastn -query ${file1}_${file2}_IR.fa -db ${file1}_${file2} -evalue 1e-5 -num_threads 10 -outfmt "6 qseqid sseqid qstart qend sstart send nident pident qcovs evalue bitscore" -out 1
python /public/home/zpliu/work/Alternative/result/duplicate_gene/IR_align_to_gene.py -homologous ../${file1}_vs_${file2}_collinearity.txt -Blast 1 -out 2
##这里要个性化一些,哪个命名在前,awk中就用哪个基因编号
awk '$4~/evm/{print $1,$2,$3,"-",$4}' OFS="\t" 2 >${file1}_IR_to_${file2}gene.bed
awk '$4~/Ghir/{print $1,$2,$3,"-",$4}' OFS="\t" 2 >${file2}_IR_to_${file1}gene.bed
###
~/software/bedtools2-2.29.0/bin/intersectBed -loj -a ${file1}_IR_to_${file2}gene.bed -b ../${file2}_IR.txt |cut -f5,10 >3
~/software/bedtools2-2.29.0/bin/intersectBed -loj -a ${file2}_IR_to_${file1}gene.bed -b ../${file1}_IR.txt|cut -f5,10 >4
awk '$2!="."{print $2"\t"$1}$2=="."{print $0}' 4|cat - 3|sort |uniq >${file1}_${file2}_conserveIR.txt
cut -f5 ${file1}_IR_to_${file2}gene.bed|sort |uniq >666
cut -f5 ../${file1}_IR.txt|sort |uniq >777
cat 666 777|sort |uniq -u >${file1}_unmapping_IR
cut -f5 ${file2}_IR_to_${file1}gene.bed|sort |uniq >666
cut -f5 ../${file2}_IR.txt|sort |uniq >777
cat 666 777|sort |uniq -u >${file2}_unmapping_IR