# python2
python ~/software/MatchAnnot/matchAnnot.py --gtf ~/work/Alternative/data/Gr_genome/Graimondii_221_v2.1.gene.gtf --format alt ./test.sam >test/annote.out
# 加入环境变量
export PATH=$PATH:~/software/cDNA_Cupcake/sequence/
# 必须切换cDNA_Cupcake目录才能运行
cd ~/software/cDNA_Cupcake/
sam_to_gff3.py -h
sam_to_gff3.py -s "标识符" ~/work/Alternative/result/Gr_result/CO41_42_result/06_Alignment/test.sam
# gff3转gtf
~/scripte/gff2gtf_cufflinks test.gff3 -T -o test.gtf
## 将所有的PacBioisform与基因编号提取出来
python classAltern.py splice.ascode.list.txt 11
awk '$6~/PB/{print $6"\t"$2}' 11 >2
awk '$7~/PB/{print $7"\t"$2}' 11 >>2
## 将每条PacBio序列的坐标提取
cut -f1 2|awk -F "|" '{print $2"\t"$1}' |sed -e 's/([^)])//g' -e 's/:/\t/g' -e 's/-/\t/g'|
## 提取对应每个基因的坐标,使用自己写的脚本
python $0 gfffile geneIDfile outfile
### 检查一下结果是否是对的,没有输出则是正确的
paste 2 3|awk '$2!=$3{print $0}'
### 提取出错的比对isfrom编号
cut -f1 2|awk -F "|" '{print $2"\t"$1}' |sed -e 's/([^)])//g' -e 's/:/\t/g' -e 's/-/\t/g'|paste - 3 |awk '{if($7-$2>3000||$3-$8>3000){print $0}}' |less
## 开始过滤比对出错的转录本对应的剪切事件
cut -f1 2|awk -F "|" '{print $2"\t"$1}' |sed -e 's/([^)])//g' -e 's/:/\t/g' -e 's/-/\t/g'|paste - 3 |awk '{if($7-$2>3000||$3-$8>3000){print $0}}' |cut -f4|sort |uniq |xargs -I {} grep "{}|" 11 >error_align.txt
## 过滤,最后文件为end文件
cat error_align.txt 11 |sort |uniq -u >end
## 只使用坐标不关心是哪两个isform直接形成的
cut -f 1,2,3,4,5 end |sort|uniq >end_second
## 改进了方法,使用sort加uniq,对于基因坐标相同的事件,保留了对应的转录本信息
awk '$1~/[^c]/{print $0}' end|awk '{print $8,$6,$7,$1,$2,$3,$4,$5}' OFS="\t"|sort -r -k7,8|uniq -f 3|awk '{print $4,$5,$6,$7,$8,$2,$3,$1}' OFS="\t"|sort -n -k4,5 >end_third
###########
for i in 1; do
grep ExonS end_second|wc -l;
grep ExonS end_second|awk '{print $2}' |sort|uniq |wc -l;
grep IntronR end_second|wc -l;
grep IntronR end_second|awk '{print $2}' |sort|uniq |wc -l;
grep AltD end_second|wc -l;
grep AltD end_second|awk '{print $2}' |sort|uniq |wc -l;
grep AltA end_second|wc -l;
grep AltA end_second|awk '{print $2}' |sort|uniq |wc -l;
grep AltP end_second|wc -l;
grep AltP end_second|awk '{print $2}' |sort|uniq |wc -l;
grep Other end_second|wc -l;
grep Other end_second|awk '{print $2}' |sort|uniq |wc -l; done
alternative_splice.py -i GMAP比对后的文件 -g 参考基因组gtf文件 -f 参考基因组序列 -as -ats T -op -os -t exon -c 0.95 -ave 3
awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t" $7 "\t" $8 "\t" $11 " " $12 " " $9 " " $10}' cufflinks转化后的gtf文件 >最后可以使用的gtf文件
awk -F "\t" '{print $9 }' splice.ascode.list.txt|cut -d \; -f 3|sed 's/structure //g' |sed 's/,/\t/g' >111
如果基因组注释的转录本不准确的话,会影响AS事件的鉴定,所以使用07/merge.gtf
文件取鉴定可变剪切事件
## merge.gtf文件需要改造一下,将PacBio的基因编号改成基因组的编号
sed 's/EVM_prediction_/evm\.TU\./g' ~/work/Alternative/result/Ga_result/CO11_12_result/07_annotation/merge.gtf|awk -F "\t" '$9~/evm/{print $9$1,$2,$3,$4,$5,$6,$7,$8}' OFS="\t" |awk -F ";" '{print $4,$3";"$2";"}' OFS="\t"|sed 's/orginal_//' >./../A2_merge_cahnge.gtf
awk -F "\t" '$9~/Gor/{print $9$1,$2,$3,$4,$5,$6,$7,$8}' OFS="\t" ~/work/Alternative/result/Gr_result/CO41_42_result/07_annotation/merge.gtf| awk -F ";" '{print $4,$3";"$2";"}' OFS="\t"|sed 's/orginal_//' >./../D5_merge_change.gtf
awk -F "\t" '$9~/Gh/{print $9$1,$2,$3,$4,$5,$6,$7,$8}' OFS="\t" ~/work/Alternative/result/Gh_result/CO31_32_result/07_annotation/merge.gtf| awk -F ";" '{print $4,$3";"$2";"}' OFS="\t"|sed 's/orginal_//' >./../TM-1_merge_change.gtf