参考 DNA-methylation effect on cotranscriptional splicing is dependent on GC architecture of the exon–intron structure
##统一保留内含子第一个和最后一个坐标
grep RI ~/work/Alternative/result/Gh_result/CO31_32_result/evolution2/A2_AS.txt |grep PB|cut -f3,4 >1
python ~/github/zpliuCode/Isoseq3/04ASconserved/extractAScoordinate.py 1 1 2
##RI坐标
awk '{OFS="\t";print $1,$2+1,$3,$4,$6}' 2 >RI.bed
##SE坐标
awk '{OFS="\t";print $1,$2,$3+1,$4,$6}' 2 >SE.bed
##对于组成型的exon和intron,根据基因的链,赋予对于的链,对于超出原有注释范围的不管
~/software/bedtools2-2.29.0/bin/intersectBed -a constitutive_intron.bed -loj -b ../TM1_gene.bed |awk '$4!="."{OFS="\t";print $1,$2,$3,$7,$9}' >1
mv 1 constitutive_intron.bed
##分割文件
##提取每个坐标处的是否甲基化
python ~/github/zpliuCode/Isoseq3/07methylation/methylationByLocation.py ~/work/Alternative/data/Ga_genome/G.arboreum.Chr.v1.0.fasta /data/cotton/zhenpingliu/QingxinSong_GB_DNAmethlation/A2/Rep1/02deduplicate_methylation/CpG_fdr_sorted.bam 1 3
##合并所有的任务的结果
python ~/github/zpliuCode/Isoseq3/07methylation/gatherAlleventLocal.py conExon_methylation.txt
##由于基数太大可能会造成差异,只对AS基因进行比较
方法参考: Asymmetric epigenome maps of subgenomes reveal imbalanced transcription and distinct evolutionary trends in Brassica napus
##提取上游的2kb坐标
~/software/bedtools2-2.29.0/bin/flankBed -b 2000 -i TM1_gene.bed -g TM1_chromsome.bed |awk '$NF=="+"&&NR%2==1{print $0}$NF=="-"&&NR%2==0{print $0}' >Upstream2K_gene.bed
##做成40个bin的窗口
##统一用远离基因的一端做起始bin,区分正负链
awk '$NF=="-"{print $0}' Upstream2K_gene.bed |~/software/bedtools2-2.29.0/bin/windowMaker -b - -n 40 -i srcwinnum -reverse >Upstream2k_40bin.bed
#正链则不需要反向
awk '$NF=="+"{print $0}' Upstream2K_gene.bed |~/software/bedtools2-2.29.0/bin/windowMaker -b - -n 40 -i srcwinnum >>Upstream2k_40bin.bed
##提取基因某一个区域的甲基化程度
python ~/github/zpliuCode/Isoseq3/07methylation/extractMethylationRegion.py gene_40bin.bed /data/cotton/zhenpingliu/QingxinSong_GB_DNAmethlation/TM1/Rep1/02deduplicate_methylation/CpG_fdr_sorted.bam gene_40bin_CpG.txt
##合并所有的bin,求平均的甲基化水平
cat Downstream/*|sed 's/_/\t/g' |awk '{a[$2][1]+=$3;a[$2][2]+=1;}END{for(i in a){print i "\t"a[i][1]/a[i][2]}}'|sort -k1,1n|less
##区分AS基因和非AS基因的甲基化程度
##筛选保守的AS事件的坐标
python ~/github/zpliuCode/Isoseq3/07methylation/filterConstitutiveintronByKmer.py ~/work/Alternative/result/Gh_result/06ASconserved/test2/A2_vs_At/AS_kmer.txt A2RI_2_Atintron.txt A2RI_2_Atintron_filter.txt
##统计序列变异情况
python ~/github/zpliuCode/Isoseq3/07methylation/sequence/sequenceAligment.py ~/work/Alternative/data/Ga_genome/G.arboreum.Chr.v1.0.fasta ~/work/Alternative/data//Ghirsutum_genome_HAU_v1.0/Ghirsutum_genome_HAU_v1.0.fasta A2_CI.txt At_RI.txt Donor_variant.txt Acceptor_variant.txt SequenceIdentity
##序列完全保守的区域,甲基化程度的差异
python ~/github/zpliuCode/Isoseq3/07methylation/sequence/extractSequenceMthylation.py /data/cotton/zhenpingliu/QingxinSong_GB_DNAmethlation/D5/Rep1/02deduplicate_methylation/CpG_fdr_sorted.bam /data/cotton/zhenpingliu/QingxinSong_GB_DNAmethlation/TM1/Rep1/02deduplicate_methylation/CpG_fdr_sorted.bam D5RI_2_DtCI_identity.txt 1
##筛选胞嘧啶保守的位置
python ~/github/zpliuCode/Isoseq3/07methylation/sequence/conservedCytosine.py ~/work/Alternative/data/Ga_genome/G.arboreum.Chr.v1.0.fasta ~/work/Alternative/data/Ghirsutum_genome_HAU_v1.0/Ghirsutum_genome_HAU_v1.0.fasta ../coordinate/A2_homolog_gene.bed ../coordinate/At_homolog_gene.bed A2_At_conservedCystoin.txt
##获取区域内CG碱基对数目,以及满足read数大于3的CG碱基对数目
python ~/github/zpliuCode/Isoseq3/07methylation/sequence/geneWithMethylatedCytosinesCount.py CpG1.bam CpG2.bam geneLocation.txt genome.fa
##筛选同源基因长度CG满足40%的覆盖度
##对每个保守的胞嘧啶位点进行差异分析
python ~/github/zpliuCode/Isoseq3/07methylation/sequence/DmCG.py conservedCytosine.txt conservedCytosineRatio.txt CpG_Bam
CpG_Bam CpG_Bam CpG_Bam
##获取每对同源基因CG甲基化的差异程度
awk '{a[$5][1]+=1}$11<=0.01{a[$5][2]+=1}END{for(i in a){print i"\t"a[i][1]"\t"a[i][2]}}' conservedCytosineCpG.txt|awk '$3==""{print $1"\t"$2"\t0\t0.0"}$3!=""{print $0"\t"$3/$2}'
# result/Gh_result/06ASconserved/02conservedAS/quantifyASconserveRatio
##获取同源基因AS数目的差值
awk '{print $1"\t"$2"\t"sqrt(($3-$4)*($3-$4))}' D5_vs_Dt_AScount_DmCG.txt
##提取变异位点左右各10bp的序列进行motif富集分析
python ~/github/zpliuCode/Isoseq3/07methylation/sequence/extractNearSequenceVarant.py genome1.fa genom2.fa sequenceVariant.txt motif20bp.fa
##进行motif 富集分析
A2RI : AAGMCCTGSKAAYMTG
A2CI : GGMCHAGCCTRTCACT
AtRI : TCTTYCYTTYHC
AtCI : AARARGAARRWRRAAR
D5RI : VCCHYCYCCYCCCCC
D5CI : NDCCCSMCCCCCCCC
DtRI : MTTTTTTTTTTT
DtCI : YGTTYGRGRMAWGAA