20200102计算共线性区间保守的boundary
给TAD赋编号
## A2
awk '$1<10{print "Chr"0$1,$2,$3,"A2_TAD_Chr"0$1"_"$2"_"$3"_"NR}$1>=10{print "Chr"$1,$2,$3,"A2_TAD_Chr"$1"_"$2"_"$3"_"NR}' OFS="\t" ../A2-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >A2_TAD.txt
## D5
awk '$1<10{print "Chr"0$1,$2,$3,"D5_TAD_Chr"0$1"_"$2"_"$3"_"NR}$1>=10{print "Chr"$1,$2,$3,"D5_TAD_Chr"$1"_"$2"_"$3"_"NR}' OFS="\t" ../D5-Rep1-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >D5_TAD.txt
## K
######################
Chr01 0 2300000 A2_TAD_Chr01_1
Chr01 2300000 3950000 A2_TAD_Chr01_2
Chr01 3950000 4600000 A2_TAD_Chr01_3
Chr01 4600000 5900000 A2_TAD_Chr01_4
Chr01 5900000 6350000 A2_TAD_Chr01_5
取交集
以A2的TAD出发,获得对应的D5的共线性区域
将共线性文件做成 A2开始的bed坐标文件
共线性文件存在倒位现象,把它翻转过来
cat ../*coords.txt|awk '$8>=$7{print $6,$7,$8,$1"_"$2"_"$4"_"$5}$8<$7{print $6,$8,$7,$1"_"$2"_"$4"_"$5}' OFS="\t" |sort -k1,2n >A2_D5_cord.bed
####A2 坐标和D5坐标
Chr01 29416 30542 D5_Chr01_4139_5266
Chr01 123390792 123397958 D5_Chr01_28156_35367
Chr01 123393355 123390642 D5_Chr01_35364_38044
## Bedtools 取交集
~/software/bedtools2-2.29.0/bin/intersectBed -a A2_TAD.txt -b A2_D5_cord.bed -loj -nonamecheck >A2TAD_coord.txt
得到的结果
Chr01 0 2300000 A2_TAD_Chr01_1 Chr01 2213 2430 D5_Chr01_52415615_52415832
Chr01 0 2300000 A2_TAD_Chr01_1 Chr01 29416 30542 D5_Chr01_4139_5266
Chr01 0 2300000 A2_TAD_Chr01_1 Chr01 112396 112814 D5_Chr01_20806218_20806636
Chr01 0 2300000 A2_TAD_Chr01_1 Chr01 1617320 1619583 D5_Chr01_21845491_21847770
Chr01 0 2300000 A2_TAD_Chr01_1 Chr01 2000204 2003301 D5_Chr01_20914167_20917268
Chr01 0 2300000 A2_TAD_Chr01_1 Chr01 589678 591118 D5_Chr01_10813400_10814848
Chr01 2300000 3950000 A2_TAD_Chr01_2 Chr01 2526431 2527455 D5_Chr01_35407230_35408251
将D5基因坐标与D5的TAD取交集
## 将D5 TAD向右延伸150kb boundary
awk '{print $1,$2,$3+150000,$4}' OFS="\t" D5_TAD.txt >D5_boundary.bed
## 将上一步取的交集的结果,换成以D5开头的bed文件
awk '$5!="."{print $5,$6,$7,$4}' OFS="\t" A2TAD_coord.txt >D5.bed
## 同样的取两个文件的交集
~/software/bedtools2-2.29.0/bin/intersectBed -a D5.bed -b D5_boundary.bed -loj -nonamecheck >A2_D5_boundary.txt
## 提取保守的boundary
cut -f4,8 A2_D5_boundary.txt |sort |uniq|awk '$2!="."{print $0}'
输出结果
####################
A2_TAD_Chr01_0_2300000_1 D5_TAD_Chr01_0_1000000_1
A2_TAD_Chr01_0_2300000_1 D5_TAD_Chr01_1650000_2500000_3
Test 2
获取D5和A2两个基因组的boundary坐标,从第3列左右各延伸150kb
## D5 boundary坐标
awk '$1<10{print "Chr"0$1,$3-150000,$3+150000,"D5_TAD_Chr"0$1"_"$2"_"$3"_"NR}$1>=10{print "Chr"$1,$3-150000,$3+150000,"D5_TAD_Chr"$1"_"$2"_"$3"_"NR}' OFS="\t" ../D5-Rep1-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >D5_boundary.txt
## A2 boundary坐标
awk '$1<10{print "Chr"0$1,$3-150000,$3+150000,"A2_TAD_Chr"0$1"_"$2"_"$3"_"NR}$1>=10{print "Chr"$1,$3-150000,$3+150000,"A2_TAD_Chr"$1"_"$2"_"$3"_"NR}' OFS="\t" ../A2-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >A2_boundary.txt
将共线性区块做成bed文件
## 防止存在到位现象
cat ../*coords.txt|awk '$8>=$7{print $2,$4,$5,"A2_"$6"_"$7"_"$8}$8<$7{print $2,$4,$5,"A2_"$6"_"$8"_"$7}' OFS="\t" |sort -k1,2n >A2_D5_coords.bed
## 使用D5 boundary与共线性文件取交集
~/software/bedtools2-2.29.0/bin/intersectBed -a D5_boundary.txt -b A2_D5_coords.bed -loj -nonamecheck >D5_boundary_corrds.txt
###结果
Chr01 850000 1150000 D5_TAD_Chr01_0_1000000_1 Chr01 857844 860674 A2_Chr01_122554434_122557272
Chr01 850000 1150000 D5_TAD_Chr01_0_1000000_1 Chr01 861384 864893 A2_Chr01_122547219_122550714
将A2的共线性区域与A2的boundary文件取交集
##
awk '$5!="."{split($8,a,"_");print a[2]"\t"a[3]"\t"a[4] "\t"$4}' D5_boundary_corrds.txt >A2_coords_boundary.bed
####得到结果
Chr01 122554434 122557272 D5_TAD_Chr01_0_1000000_1
Chr01 122547219 122550714 D5_TAD_Chr01_0_1000000_1
Chr01 122539009 122540323 D5_TAD_Chr01_0_1000000_1
### 与A2 boundary文件取交集
~/software/bedtools2-2.29.0/bin/intersectBed -a A2_coords_boundary.bed -b A2_boundary.txt -loj -nonamecheck >11
### 提取最终结果
cut -f4,8 11|sort |uniq >tmp
awk '$2=="."{a[$1]+=0}$2!="."{a[$1]+=1}END{for(i in a){if(a[i]==0){print i}}}' tmp|xargs -I {} grep {} tmp|awk '{print $1"\tNone"}' >none
## 没有共线性的片段与D5的TAD有交集
awk '$5=="."{print $4"\tNone"}' D5_boundary_corrds.txt >>none
awk '$2=="."{a[$1]+=0}$2!="."{a[$1]+=1}END{for(i in a){if(a[i]!=0){print i}}}' tmp|xargs -I {} grep {} tmp|awk '$2!="."{print $0}' >yes
## 按照boundary序号排序
cat yes none |awk '{split($1,a,"_");print a[1]"\t"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[6]"\t"$2}'|sort -k3,3 -k4,5n|awk '{print $1"_"$2"_"$3"_"$4"_"$5"_"$6"\t"$7}' >D5_A2_converseBoundary.txt
## 统计数目
cut -f1 D5_A2_converseBoundary.txt |sort |uniq -c |awk '$1>=3{print $0}'|wc -l
cut -f1 D5_A2_converseBoundary.txt |sort |uniq -c |awk '$1==2{print $0}'|wc -l
cut -f1 D5_A2_converseBoundary.txt |sort |uniq -c |awk '$1==1{print $0}'|wc -l
grep "None" D5_A2_converseBoundary.txt |wc -l
Test3
将K基因组的TAD染色体名字对应起来
## K12 TAD文件
awk '$1==1{print "Chr05\t"$2,$3,"K12_TAD_Chr05_"$2"_"$3"_"NR}$1==2{print "Chr03\t"$2,$3,"K12_TAD_Chr03_"$2"_"$3"_"NR}$1==3{print "Chr08\t"$2,$3,"K12_TAD_Chr08_"$2"_"$3"_"NR}$1==4{print "Chr10\t"$2,$3,"K12_TAD_Chr10_"$2"_"$3"_"NR}$1==5{print "Chr06\t"$2,$3,"K12_TAD_Chr06_"$2"_"$3"_"NR}$1==6{print "Chr13\t"$2,$3,"K12_TAD_Chr13_"$2"_"$3"_"NR}$1==7{print "Chr12\t"$2,$3,"K12_TAD_Chr12_"$2"_"$3"_"NR}$1==8{print "Chr07\t"$2,$3,"K12_TAD_Chr07_"$2"_"$3"_"NR}$1==9{print "Chr01\t"$2,$3,"K12_TAD_Chr01_"$2"_"$3"_"NR}$1==10{print "Chr04\t"$2,$3,"K12_TAD_Chr04_"$2"_"$3"_"NR}$1==11{print "Chr09\t"$2,$3,"K12_TAD_Chr09_"$2"_"$3"_"NR}$1==12{print "Chr02\t"$2,$3,"K12_TAD_Chr02_"$2"_"$3"_"NR}$1==13{print "Chr11\t"$2,$3,"K12_TAD_Chr11_"$2"_"$3"_"NR}' OFS="\t" K-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >test/K12_TAD.txt
## D5与K12的共线性文件
python ../converse_boundary.py D5_K12_collinearity.txt D5_TAD.txt K12_TAD.txt D5_K12_conserve_boundarit.txt >err
## 对结果进行排序
awk '{split($1,a,"_");print a[1]"\t"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[6]"\t"$2}'|sort -k3,3 -k4,5n D5_K12_conserve_boundarit.txt >1
mv 1 D5_K12_conserve_boundarit.txt
awk '{split($1,a,"_");print a[1]"\t"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[6]"\t"$2}'|sort -k3,3 -k4,5n D5_A2_conserve_boundarit.txt >1
mv 1 D5_A2_conserve_boundarit.txt
D5的chr02与A2的chr01
使用基因的共线性block进行矫正
基因共线性区域文件
Chr01 33390691 37852369 4461679 Chr02 28159874 48601001 20441128
Chr01 38236062 39789170 1553109 Chr02 16544466 21159835 4615370
Chr01 40078980 41572241 1493262 Chr02 48835312 52514427 3679116
Chr01 41723036 52363387 10640352 Chr02 21351 17683543 17662193
## D5Chr01 && A2Chr02
awk -F "|" '{print $5,$1,$2}' D5Chr01_vs_A2_filter.delta.coords|awk '$1~/D5Chr01/&&$2~/Chr02/{print substr($1,3),$3,$4,$2,$5,$6}' OFS="\t" |sort -k1,1 -k2,3n|awk '{array[NR]=$0}END{
print array[1];
for(i=2;i<=NR-1;i++){
split(array[i-1],tmp1,"\t");
split(array[i],tmp,"\t");
split(array[i+1],tmp2,"\t");
if((tmp2[5]-tmp1[5]<2000000||tmp2[5]-tmp1[5]>-2000000)&&(tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000)){
print "#"array[i]
}else if((tmp2[5]-tmp1[5]>2000000||tmp2[5]-tmp1[5]<-2000000)&&((tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000))){
print "#"array[i]
}else{
print array[i]
}}
print array[NR]
}'
## D5Chr02 $$ A2Chr01
awk -F "|" '{print $5,$1,$2}' D5Chr02_vs_A2_filter.delta.coords|awk '$1~/D5Chr02/&&$2~/Chr01/{print substr($1,3),$3,$4,$2,$5,$6}' OFS="\t" |sort -k1,1 -k2,3n|awk '{array[NR]=$0}END{
print array[1];
for(i=2;i<=NR-1;i++){
split(array[i-1],tmp1,"\t");
split(array[i],tmp,"\t");
split(array[i+1],tmp2,"\t");
if((tmp2[5]-tmp1[5]<2000000||tmp2[5]-tmp1[5]>-2000000)&&(tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000)){
print "#"array[i]
}else if((tmp2[5]-tmp1[5]>2000000||tmp2[5]-tmp1[5]<-2000000)&&((tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000))){
print "#"array[i]
}else{
print array[i]
}}
print array[NR]
}'
挑出异常区域
## 共线性文件,假如D5与A2中染色体不一一对应的地方
cat D5_A2_collinearity.txt|awk '{array[NR]=$0}END{
print array[1];
for(i=2;i<=NR-1;i++){
split(array[i-1],tmp1,"\t");
split(array[i],tmp,"\t");
split(array[i+1],tmp2,"\t");
if((tmp2[5]-tmp1[5]<2000000||tmp2[5]-tmp1[5]>-2000000)&&(tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000)){
print "#"array[i]
}else if((tmp2[5]-tmp1[5]>2000000||tmp2[5]-tmp1[5]<-2000000)&&((tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000))){
print "#"array[i]
}else{
print array[i]
}}
print array[NR]
}'
批量提交任务
for i in 50000 75000 100000 150000; do bsub -J D5_A2 -n 1 -o test2/%J.${index}.out -e test2/%J.${index}.err -R span[hosts=1] python ../converse_boundary4.py Block/D5_A2_collinearitEnd.txt TAD/D5_TAD.txt TAD/A2_TAD.txt test2/D5_A2_${i}_conserve.txt ${i} ; bsub -J D5_K12 -n 1 -o test2/%J.${index}.out -e test2/%J.${index}.err -R span[hosts=1] python ../converse_boundary4.py Block/D5_K12_collinearityEnd.txt TAD/D5_TAD.txt TAD/K12_TAD.txt test2/D5_K12_${i}_conserve.txt ${i} ; bsub -J A2_K12 -n 1 -o test2/%J.${index}.out -e test2/%J.${index}.err -R span[hosts=1] python ../converse_boundary4.py Block/A2_K12_collinearitEnd.txt TAD/A2_TAD.txt TAD/K12_TAD.txt test2/A2_K12_${i}_conserve.txt ${i} ; sleep 5; done
间隔距离
conserve
None
percentage
150Kb D5 vs A2
1029
172
85%
D5 vs K12
834
367
69%
100Kb D5 vs A2
939
262
78%
D5 vs K12
681
520
56%
75Kb D5 vs A2
880
321
73%
D5 vs K12
547
654
45%
50KB D5 vs A2
750
451
62%
D5 vs K12
381
820
31%
有重组的染色体之间保守的TAD的数目
类型
保守
不保守
D5Chr01 vs A2Chr02
28
0
D5Chr02 vs A2Chr01
25
A2Chr01 vs K12Chr02
21
A2Chr02 vs K12Chr01
21
Test4
将染色体编号相同的文件合并
## 合并A和K基因组
cat *coords.txt|awk '$1~/#/{print "#"$2,$4,$5,$6,$7,$8}$1!~/#/{print $2,$4,$5,$6,$7,$8}' OFS="\t" |cat - AChr01_KChr02_collineriat.txt AChr02_KChr01_collineriat.txt >../test/Block/A2_K12_collinearitEnd.txt
## 合并D和A基因组
提取上下游150KB进行Blast
## 提取上下游各150kb的boundarit bed文件
windowSize=150000
awk '{a[NR]=$0}END{split(a[1],tmp0,"\t");print tmp0[1]"\t"tmp0[2]"\t"tmp0[2]+"'$windowSize'""\t"tmp0[4]"_left";print tmp0[1]"\t"tmp0[3]-"'$windowSize'""\t"tmp0[3]+"'$windowSize'""\t"tmp0[4];for(i=2;i<=NR;i++){split(a[i-1],tmp1,"\t");split(a[i],tmp2,"\t");if(tmp2[2]==0){print tmp2[1]"\t"tmp2[2]"\t"tmp2[2]+"'$windowSize'""\t"tmp2[4]"_left";}else if(tmp2[2]!=tmp1[3]){print tmp2[1]"\t"tmp2[2]-"'$windowSize'""\t"tmp2[2]+"'$windowSize'""\t"tmp2[4]"_left";} print tmp2[1]"\t"tmp2[3]-"'$windowSize'""\t"tmp2[3]+"'$windowSize'""\t"tmp2[4]; }}' ../test/TAD/D5_TAD.txt
Blast建库方式可以单独每条染色体进行建库
单个染色体进行建库
## 单个染色体序列进行建库
grep ">Chr03" -A1 /public/home/cotton/public_data/Three_Genomes/A_assembly_changed.fa >A2Chr03.fa
## makeblast
makeblastdb -in A2Chr03.fa -dbtype "nucl" -parse_seqids -out A2Chr03
## 使用blast比对
bsub -J blast -n 10 -o %J.out -e %J.err -R span[hosts=1] blastn -query D5_Chr03TAD.fasta -db blastDB/A2Chr03 -outfmt "6 qseqid sseqid qstart qend sstart send nident pident qcovs evalue bitscore" -evalue 1e-5 -out DChr03_A2.blast_out -num_threads 10"
使用染色体坐标去筛选匹配度最高的区域
## 将染色体分成150000片段
~/software/bedtools2-2.29.0/bin/windowMaker -b A2_Chr03.bed -w 150000 >1
## 提取一个TAD的比对bed文件
grep D5_TAD_Chr03_0_750000_151_left DChr03_A2.blast_out|sort -k5,6n|awk '$5<=$6{print $2,$5,$6}$5>$6{print $2,$6,$5}' OFS="\t" >oneTAD.bed
## 取交集统计数目
查看对应TAD附近的共线性区域
## TAD Boundrit 为 Chr01 7825000 7975000
awk '$1=="Chr01"{print $0}' D5_A2_collinearitEnd.txt|awk '{if($2>7975000 ||$3<7825000){}else{print $0}}'|sort -k5,6 -n |less
Last updated