20200102计算共线性区间保守的boundary

给TAD赋编号

## A2
awk '$1<10{print "Chr"0$1,$2,$3,"A2_TAD_Chr"0$1"_"$2"_"$3"_"NR}$1>=10{print "Chr"$1,$2,$3,"A2_TAD_Chr"$1"_"$2"_"$3"_"NR}' OFS="\t" ../A2-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >A2_TAD.txt
## D5 
awk '$1<10{print "Chr"0$1,$2,$3,"D5_TAD_Chr"0$1"_"$2"_"$3"_"NR}$1>=10{print "Chr"$1,$2,$3,"D5_TAD_Chr"$1"_"$2"_"$3"_"NR}' OFS="\t" ../D5-Rep1-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >D5_TAD.txt 
## K

######################
Chr01    0    2300000    A2_TAD_Chr01_1
Chr01    2300000    3950000    A2_TAD_Chr01_2
Chr01    3950000    4600000    A2_TAD_Chr01_3
Chr01    4600000    5900000    A2_TAD_Chr01_4
Chr01    5900000    6350000    A2_TAD_Chr01_5

取交集

以A2的TAD出发,获得对应的D5的共线性区域

将共线性文件做成 A2开始的bed坐标文件

  • 共线性文件存在倒位现象,把它翻转过来

cat  ../*coords.txt|awk '$8>=$7{print $6,$7,$8,$1"_"$2"_"$4"_"$5}$8<$7{print $6,$8,$7,$1"_"$2"_"$4"_"$5}' OFS="\t" |sort -k1,2n >A2_D5_cord.bed
####A2 坐标和D5坐标
Chr01    29416    30542    D5_Chr01_4139_5266
Chr01    123390792    123397958    D5_Chr01_28156_35367
Chr01    123393355    123390642    D5_Chr01_35364_38044
## Bedtools 取交集
~/software/bedtools2-2.29.0/bin/intersectBed  -a A2_TAD.txt  -b A2_D5_cord.bed  -loj  -nonamecheck  >A2TAD_coord.txt

得到的结果

Chr01   0       2300000 A2_TAD_Chr01_1  Chr01   2213    2430    D5_Chr01_52415615_52415832
Chr01   0       2300000 A2_TAD_Chr01_1  Chr01   29416   30542   D5_Chr01_4139_5266
Chr01   0       2300000 A2_TAD_Chr01_1  Chr01   112396  112814  D5_Chr01_20806218_20806636
Chr01   0       2300000 A2_TAD_Chr01_1  Chr01   1617320 1619583 D5_Chr01_21845491_21847770
Chr01   0       2300000 A2_TAD_Chr01_1  Chr01   2000204 2003301 D5_Chr01_20914167_20917268
Chr01   0       2300000 A2_TAD_Chr01_1  Chr01   589678  591118  D5_Chr01_10813400_10814848
Chr01   2300000 3950000 A2_TAD_Chr01_2  Chr01   2526431 2527455 D5_Chr01_35407230_35408251

将D5基因坐标与D5的TAD取交集

## 将D5 TAD向右延伸150kb boundary
awk '{print $1,$2,$3+150000,$4}' OFS="\t" D5_TAD.txt >D5_boundary.bed
## 将上一步取的交集的结果,换成以D5开头的bed文件
awk '$5!="."{print $5,$6,$7,$4}' OFS="\t"  A2TAD_coord.txt  >D5.bed
## 同样的取两个文件的交集
 ~/software/bedtools2-2.29.0/bin/intersectBed -a D5.bed  -b D5_boundary.bed -loj -nonamecheck >A2_D5_boundary.txt

## 提取保守的boundary
cut -f4,8 A2_D5_boundary.txt |sort |uniq|awk '$2!="."{print $0}'

输出结果

####################
A2_TAD_Chr01_0_2300000_1    D5_TAD_Chr01_0_1000000_1
A2_TAD_Chr01_0_2300000_1    D5_TAD_Chr01_1650000_2500000_3

Test 2

获取D5和A2两个基因组的boundary坐标,从第3列左右各延伸150kb

## D5 boundary坐标
awk '$1<10{print "Chr"0$1,$3-150000,$3+150000,"D5_TAD_Chr"0$1"_"$2"_"$3"_"NR}$1>=10{print "Chr"$1,$3-150000,$3+150000,"D5_TAD_Chr"$1"_"$2"_"$3"_"NR}' OFS="\t" ../D5-Rep1-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >D5_boundary.txt 
## A2 boundary坐标
awk '$1<10{print "Chr"0$1,$3-150000,$3+150000,"A2_TAD_Chr"0$1"_"$2"_"$3"_"NR}$1>=10{print "Chr"$1,$3-150000,$3+150000,"A2_TAD_Chr"$1"_"$2"_"$3"_"NR}' OFS="\t" ../A2-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >A2_boundary.txt

将共线性区块做成bed文件

## 防止存在到位现象
cat  ../*coords.txt|awk '$8>=$7{print $2,$4,$5,"A2_"$6"_"$7"_"$8}$8<$7{print $2,$4,$5,"A2_"$6"_"$8"_"$7}' OFS="\t" |sort -k1,2n >A2_D5_coords.bed
## 使用D5 boundary与共线性文件取交集 
~/software/bedtools2-2.29.0/bin/intersectBed  -a D5_boundary.txt -b A2_D5_coords.bed -loj  -nonamecheck >D5_boundary_corrds.txt
###结果
Chr01   850000  1150000 D5_TAD_Chr01_0_1000000_1        Chr01   857844  860674  A2_Chr01_122554434_122557272
Chr01   850000  1150000 D5_TAD_Chr01_0_1000000_1        Chr01   861384  864893  A2_Chr01_122547219_122550714

将A2的共线性区域与A2的boundary文件取交集

## 
awk '$5!="."{split($8,a,"_");print a[2]"\t"a[3]"\t"a[4] "\t"$4}' D5_boundary_corrds.txt >A2_coords_boundary.bed
####得到结果
Chr01    122554434    122557272    D5_TAD_Chr01_0_1000000_1
Chr01    122547219    122550714    D5_TAD_Chr01_0_1000000_1
Chr01    122539009    122540323    D5_TAD_Chr01_0_1000000_1
### 与A2 boundary文件取交集
~/software/bedtools2-2.29.0/bin/intersectBed  -a A2_coords_boundary.bed  -b A2_boundary.txt  -loj -nonamecheck >11
### 提取最终结果
cut -f4,8 11|sort |uniq >tmp
awk '$2=="."{a[$1]+=0}$2!="."{a[$1]+=1}END{for(i in a){if(a[i]==0){print i}}}' tmp|xargs  -I {} grep {} tmp|awk '{print $1"\tNone"}' >none
## 没有共线性的片段与D5的TAD有交集
awk '$5=="."{print $4"\tNone"}' D5_boundary_corrds.txt  >>none
awk '$2=="."{a[$1]+=0}$2!="."{a[$1]+=1}END{for(i in a){if(a[i]!=0){print i}}}' tmp|xargs  -I {} grep {} tmp|awk '$2!="."{print $0}' >yes
## 按照boundary序号排序
cat yes none |awk '{split($1,a,"_");print a[1]"\t"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[6]"\t"$2}'|sort -k3,3 -k4,5n|awk '{print $1"_"$2"_"$3"_"$4"_"$5"_"$6"\t"$7}' >D5_A2_converseBoundary.txt
## 统计数目
cut -f1 D5_A2_converseBoundary.txt |sort |uniq -c |awk '$1>=3{print $0}'|wc -l
cut -f1 D5_A2_converseBoundary.txt |sort |uniq -c |awk '$1==2{print $0}'|wc -l
cut -f1 D5_A2_converseBoundary.txt |sort |uniq -c |awk '$1==1{print $0}'|wc -l
grep "None" D5_A2_converseBoundary.txt |wc -l

Test3

将K基因组的TAD染色体名字对应起来

## K12 TAD文件
awk '$1==1{print "Chr05\t"$2,$3,"K12_TAD_Chr05_"$2"_"$3"_"NR}$1==2{print "Chr03\t"$2,$3,"K12_TAD_Chr03_"$2"_"$3"_"NR}$1==3{print "Chr08\t"$2,$3,"K12_TAD_Chr08_"$2"_"$3"_"NR}$1==4{print "Chr10\t"$2,$3,"K12_TAD_Chr10_"$2"_"$3"_"NR}$1==5{print "Chr06\t"$2,$3,"K12_TAD_Chr06_"$2"_"$3"_"NR}$1==6{print "Chr13\t"$2,$3,"K12_TAD_Chr13_"$2"_"$3"_"NR}$1==7{print "Chr12\t"$2,$3,"K12_TAD_Chr12_"$2"_"$3"_"NR}$1==8{print "Chr07\t"$2,$3,"K12_TAD_Chr07_"$2"_"$3"_"NR}$1==9{print "Chr01\t"$2,$3,"K12_TAD_Chr01_"$2"_"$3"_"NR}$1==10{print "Chr04\t"$2,$3,"K12_TAD_Chr04_"$2"_"$3"_"NR}$1==11{print "Chr09\t"$2,$3,"K12_TAD_Chr09_"$2"_"$3"_"NR}$1==12{print "Chr02\t"$2,$3,"K12_TAD_Chr02_"$2"_"$3"_"NR}$1==13{print "Chr11\t"$2,$3,"K12_TAD_Chr11_"$2"_"$3"_"NR}' OFS="\t"  K-HindIII-AllChrs_TAD.level0-level1-uniq_300K-3M.txt >test/K12_TAD.txt
## D5与K12的共线性文件

python ../converse_boundary.py D5_K12_collinearity.txt  D5_TAD.txt  K12_TAD.txt  D5_K12_conserve_boundarit.txt >err
## 对结果进行排序
awk '{split($1,a,"_");print a[1]"\t"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[6]"\t"$2}'|sort -k3,3 -k4,5n D5_K12_conserve_boundarit.txt >1
mv 1 D5_K12_conserve_boundarit.txt
awk '{split($1,a,"_");print a[1]"\t"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[6]"\t"$2}'|sort -k3,3 -k4,5n D5_A2_conserve_boundarit.txt >1
mv 1 D5_A2_conserve_boundarit.txt

D5的chr02与A2的chr01

使用基因的共线性block进行矫正

基因共线性区域文件

Chr01    33390691    37852369    4461679    Chr02    28159874    48601001    20441128
Chr01    38236062    39789170    1553109    Chr02    16544466    21159835    4615370
Chr01    40078980    41572241    1493262    Chr02    48835312    52514427    3679116
Chr01    41723036    52363387    10640352    Chr02    21351    17683543    17662193
## D5Chr01 && A2Chr02
awk -F "|" '{print $5,$1,$2}' D5Chr01_vs_A2_filter.delta.coords|awk '$1~/D5Chr01/&&$2~/Chr02/{print substr($1,3),$3,$4,$2,$5,$6}' OFS="\t" |sort -k1,1 -k2,3n|awk '{array[NR]=$0}END{
print array[1];
for(i=2;i<=NR-1;i++){
split(array[i-1],tmp1,"\t");
split(array[i],tmp,"\t");
split(array[i+1],tmp2,"\t");
if((tmp2[5]-tmp1[5]<2000000||tmp2[5]-tmp1[5]>-2000000)&&(tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000)){
print "#"array[i]
}else if((tmp2[5]-tmp1[5]>2000000||tmp2[5]-tmp1[5]<-2000000)&&((tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000))){
print "#"array[i]
}else{
print array[i]
}}
print array[NR]
}'
## D5Chr02 $$ A2Chr01
awk -F "|" '{print $5,$1,$2}' D5Chr02_vs_A2_filter.delta.coords|awk '$1~/D5Chr02/&&$2~/Chr01/{print substr($1,3),$3,$4,$2,$5,$6}' OFS="\t" |sort -k1,1 -k2,3n|awk '{array[NR]=$0}END{
print array[1];
for(i=2;i<=NR-1;i++){
split(array[i-1],tmp1,"\t");
split(array[i],tmp,"\t");
split(array[i+1],tmp2,"\t");
if((tmp2[5]-tmp1[5]<2000000||tmp2[5]-tmp1[5]>-2000000)&&(tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000)){
print "#"array[i]
}else if((tmp2[5]-tmp1[5]>2000000||tmp2[5]-tmp1[5]<-2000000)&&((tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000))){
print "#"array[i]
}else{
print array[i]
}}
print array[NR]
}'

挑出异常区域

## 共线性文件,假如D5与A2中染色体不一一对应的地方
cat D5_A2_collinearity.txt|awk '{array[NR]=$0}END{
print array[1];
for(i=2;i<=NR-1;i++){
split(array[i-1],tmp1,"\t");
split(array[i],tmp,"\t");
split(array[i+1],tmp2,"\t");
if((tmp2[5]-tmp1[5]<2000000||tmp2[5]-tmp1[5]>-2000000)&&(tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000)){
print "#"array[i]
}else if((tmp2[5]-tmp1[5]>2000000||tmp2[5]-tmp1[5]<-2000000)&&((tmp[5]-tmp1[5]>2000000||tmp[5]-tmp1[5]<-2000000)&&(tmp[5]-tmp2[5]>2000000||tmp[5]-tmp2[5]<-2000000))){
print "#"array[i]
}else{
print array[i]
}}
print array[NR]
}'

批量提交任务

for i in 50000 75000 100000 150000; do bsub  -J D5_A2 -n 1 -o test2/%J.${index}.out -e test2/%J.${index}.err -R span[hosts=1] python  ../converse_boundary4.py Block/D5_A2_collinearitEnd.txt  TAD/D5_TAD.txt  TAD/A2_TAD.txt test2/D5_A2_${i}_conserve.txt ${i} ; bsub  -J D5_K12 -n 1 -o test2/%J.${index}.out -e test2/%J.${index}.err -R span[hosts=1] python ../converse_boundary4.py Block/D5_K12_collinearityEnd.txt  TAD/D5_TAD.txt  TAD/K12_TAD.txt test2/D5_K12_${i}_conserve.txt ${i} ; bsub  -J A2_K12 -n 1 -o test2/%J.${index}.out -e test2/%J.${index}.err -R span[hosts=1] python ../converse_boundary4.py Block/A2_K12_collinearitEnd.txt  TAD/A2_TAD.txt TAD/K12_TAD.txt test2/A2_K12_${i}_conserve.txt ${i} ; sleep 5; done

间隔距离

conserve

None

percentage

150Kb D5 vs A2

1029

172

85%

D5 vs K12

834

367

69%

100Kb D5 vs A2

939

262

78%

D5 vs K12

681

520

56%

75Kb D5 vs A2

880

321

73%

D5 vs K12

547

654

45%

50KB D5 vs A2

750

451

62%

D5 vs K12

381

820

31%

有重组的染色体之间保守的TAD的数目

类型

保守

不保守

D5Chr01 vs A2Chr02

28

0

D5Chr02 vs A2Chr01

25

A2Chr01 vs K12Chr02

21

A2Chr02 vs K12Chr01

21

Test4

将染色体编号相同的文件合并

## 合并A和K基因组
 cat *coords.txt|awk '$1~/#/{print "#"$2,$4,$5,$6,$7,$8}$1!~/#/{print $2,$4,$5,$6,$7,$8}' OFS="\t" |cat - AChr01_KChr02_collineriat.txt AChr02_KChr01_collineriat.txt >../test/Block/A2_K12_collinearitEnd.txt
## 合并D和A基因组

提取上下游150KB进行Blast

## 提取上下游各150kb的boundarit bed文件
windowSize=150000
awk '{a[NR]=$0}END{split(a[1],tmp0,"\t");print tmp0[1]"\t"tmp0[2]"\t"tmp0[2]+"'$windowSize'""\t"tmp0[4]"_left";print tmp0[1]"\t"tmp0[3]-"'$windowSize'""\t"tmp0[3]+"'$windowSize'""\t"tmp0[4];for(i=2;i<=NR;i++){split(a[i-1],tmp1,"\t");split(a[i],tmp2,"\t");if(tmp2[2]==0){print tmp2[1]"\t"tmp2[2]"\t"tmp2[2]+"'$windowSize'""\t"tmp2[4]"_left";}else if(tmp2[2]!=tmp1[3]){print tmp2[1]"\t"tmp2[2]-"'$windowSize'""\t"tmp2[2]+"'$windowSize'""\t"tmp2[4]"_left";} print tmp2[1]"\t"tmp2[3]-"'$windowSize'""\t"tmp2[3]+"'$windowSize'""\t"tmp2[4]; }}' ../test/TAD/D5_TAD.txt

Blast建库方式可以单独每条染色体进行建库

单个染色体进行建库

## 单个染色体序列进行建库
grep ">Chr03" -A1 /public/home/cotton/public_data/Three_Genomes/A_assembly_changed.fa >A2Chr03.fa
## makeblast
makeblastdb -in A2Chr03.fa  -dbtype "nucl" -parse_seqids -out A2Chr03
## 使用blast比对
bsub -J blast -n 10 -o %J.out -e %J.err -R span[hosts=1] blastn -query D5_Chr03TAD.fasta  -db blastDB/A2Chr03  -outfmt "6  qseqid sseqid qstart qend sstart send nident pident qcovs evalue bitscore" -evalue 1e-5 -out DChr03_A2.blast_out -num_threads 10"

使用染色体坐标去筛选匹配度最高的区域

## 将染色体分成150000片段
~/software/bedtools2-2.29.0/bin/windowMaker -b A2_Chr03.bed -w 150000 >1
## 提取一个TAD的比对bed文件
grep D5_TAD_Chr03_0_750000_151_left  DChr03_A2.blast_out|sort -k5,6n|awk '$5<=$6{print $2,$5,$6}$5>$6{print $2,$6,$5}' OFS="\t"  >oneTAD.bed
## 取交集统计数目

查看对应TAD附近的共线性区域

## TAD Boundrit 为 Chr01 7825000 7975000
awk '$1=="Chr01"{print $0}' D5_A2_collinearitEnd.txt|awk '{if($2>7975000 ||$3<7825000){}else{print $0}}'|sort -k5,6 -n |less

Last updated