Bash练习
2019-11-19
awk 入门
1.提取gff文件中exon对应的染色体坐标和对应的exon编号
Ghir_A11 Ghir_EVM gene 48563862 48571977 . - . ID=Ghir_A11G021610;Name=Ghir_A11G021610
Ghir_A11 Ghir_EVM mRNA 48563862 48571977 . - . ID=Ghir_A11G021610.1;Parent=Ghir_A11G021610
Ghir_A11 Ghir_EVM three_prime_UTR 48563862 48565187 . - . ID=Ghir_A11G021610.1.utr3p1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM CDS 48565188 48565874 . - 0 ID=cds.Ghir_A11G021610.1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM exon 48563862 48565874 . - . ID=Ghir_A11G021610.1.exon8;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM CDS 48565967 48566053 . - 0 ID=cds.Ghir_A11G021610.1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM exon 48565967 48566053 . - . ID=Ghir_A11G021610.1.exon7;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM CDS 48566135 48566329 . - 0 ID=cds.Ghir_A11G021610.1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM exon 48566135 48566329 . - . ID=Ghir_A11G021610.1.exon6;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM CDS 48566416 48566538 . - 0 ID=cds.Ghir_A11G021610.1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM exon 48566416 48566538 . - . ID=Ghir_A11G021610.1.exon5;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM CDS 48566625 48566711 . - 0 ID=cds.Ghir_A11G021610.1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM exon 48566625 48566711 . - . ID=Ghir_A11G021610.1.exon4;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM CDS 48566811 48567275 . - 0 ID=cds.Ghir_A11G021610.1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM exon 48566811 48567275 . - . ID=Ghir_A11G021610.1.exon3;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM CDS 48568656 48568738 . - 1 ID=cds.Ghir_A11G021610.1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM exon 48568656 48568738 . - . ID=Ghir_A11G021610.1.exon2;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM CDS 48571600 48571894 . - 0 ID=cds.Ghir_A11G021610.1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM exon 48571600 48571977 . - . ID=Ghir_A11G021610.1.exon1;Parent=Ghir_A11G021610.1
Ghir_A11 Ghir_EVM five_prime_UTR 48571895 48571977 . - . ID=Ghir_A11G021610.1.utr5p1;Parent=Ghir_A11G021610.1
结果如下:
Ghir_A11 48566135 48566329 - Ghir_A11G021610.1.exon6
参考:
使用awk正则表达式匹配 exon https://www.w3cschool.cn/awk/8xet1k8a.html
2019-11-30
sed入门
使用sed删除1~10行内容
输出gff文件中exon所在的行
紧接上一个结果,将逗号
;
后的字符串全部删除提示使用正则表达式将;好后面的替换为空
将一行中所有的
Ghir
全都换成TM-1
参考:
正则表达式比较通俗有趣的版本 https://www.zhihu.com/question/19676915
sed 中常用的几种模式
d
p
s
进行全局匹配模式的关键字 g
2019-12-9
使用awk将改变fasta格式
awk 'NR==1&&$1~/^>/{print $0}NR>=2&&$1~/^>/{print "\n"$0}$1~/^[^>]/{printf $0}'
原始的fasta格式,基因序列在多行
>Ghir_A01G000010.1
ATGTTGAAACAAGATGGAACTCTGTGTTCCTTCTCACCGTGCATGGAGCAAGTGCAACGTTCATGTGAAA
CTCTGAGATCTGACTTTATAGAGATATATGGACCTTTGAAATACTGCTCCGCATGTATGAAATCTGTGAA
TGGAAAATGGATCACTCGAAGGTCAATGATGGGAATTCCATTGCATGCTCTCCACACAAGAGGAGGCCGC
CTTCAAGTGAAGCAAGTGTGGGGGACAATGCAAGTTCTCCGAGAATCATGGCTTGGCCATCTGCTGAAAC
TCGAGGGCATACTGGATATTTGA
修改后的fasta格式,基因序列在同一行
>Ghir_A01G000010.1
ATGTTGAAACAAGATGGAACTCTGTGTTCCTTCTCACCGTGCATGGAGCAAGTGCAACGTTCATGTGAAACTCTGAGATCTGACTTTATAGAGATATATGGACCTTTGAAATACTGCTCCGCATGTATGAAATCTGTGAATGGAAAATGGATCACTCGAAGGTCAATGATGGGAATTCCATTGCATGCTCTCCACACAAGAGGAGGCCGCCTTCAAGTGAAGCAAGTGTGGGGGACAATGCAAGTTCTCCGAGAATCATGGCTTGGCCATCTGCTGAAACTCGAGGGCATACTGGATATTTGA
将注释信息与基因坐标信息放一起
输入数据
awk '$0~/Alignment/{a=$0}$1~/^[^#]/{print $0"\t"a}' 2
输出数据
将注释信息单独放在一行
awk '{print "## "$7"\n"$0}' link.txt |awk 'NR==1{tmp=$0;print tmp}$1~/^#/{if(tmp!=$0){tmp=$0;print tmp}}$1~/^[^#]/{print $0}' |less
比较上下几行范围的内容
找出异常值
awk '{array[NR]=$0}END{
for(i=2;i<=NR-1;i++){
split(array[i-1],tmp1,"\t");
split(array[i],tmp,"\t");
split(array[i+1],tmp2,"\t");
if((tmp2[7]-tmp1[7]<2000000||tmp2[7]-tmp1[7]>-2000000)&&(tmp[7]-tmp1[7]>2000000||tmp[7]-tmp1[7]<-2000000)&&(tmp[7]-tmp2[7]>2000000||tmp[7]-tmp2[7]<-2000000)){
print array[i]
}else if((tmp2[7]-tmp1[7]>2000000||tmp2[7]-tmp1[7]<-2000000)&&(tmp[7]-tmp1[7]>2000000||tmp[7]-tmp1[7]<-2000000)&&(tmp[7]-tmp2[7]>2000000||tmp[7]-tmp2[7]<-2000000)){
print array[i]
}else{
}}
}' D5Chr01_vs_A2_coords.txt
除异常值以外的值
awk '{array[NR]=$0}END{
print array[1];
for(i=2;i<=NR-1;i++){
split(array[i-1],tmp1,"\t");
split(array[i],tmp,"\t");
split(array[i+1],tmp2,"\t");
if((tmp2[7]-tmp1[7]<2000000||tmp2[7]-tmp1[7]>-2000000)&&(tmp[7]-tmp1[7]>2000000||tmp[7]-tmp1[7]<-2000000)&&(tmp[7]-tmp2[7]>2000000||tmp[7]-tmp2[7]<-2000000)){
}else if((tmp2[7]-tmp1[7]>2000000||tmp2[7]-tmp1[7]<-2000000)&&((tmp[7]-tmp1[7]>2000000||tmp[7]-tmp1[7]<-2000000)&&(tmp[7]-tmp2[7]>2000000||tmp[7]-tmp2[7]<-2000000))){
}else{
print array[i]
}}
print array[NR]
}' D5Chr01_vs_A2_coords.txt
2020-01-19
根据gff文件提取gene body 区域坐标和上游 3Kb 的promoter 区域坐标
## 提取基因坐标
awk -F ";" '{print $1}' ../../Ghirsutum_gene_model.gff3|awk -F "\t" '$3~/gene/{print $1,$4,$5,$9}' OFS="\t" |sed 's/ID=//g' >gene.bed
## 提取promoter区域
awk -F ";" '{print $1}' ../../Ghirsutum_gene_model.gff3|awk -F "\t" '$3~/gene/&&$7=="-"{print $1,$5+1,$5+3000,$9}$3~/gene/&&$7=="+"{if($4>3000){print $1,$4-3000,$4-1,$9}else{print $1,1,$4-1,$9}}' OFS="\t" |sed 's/ID=//g' >gene_promter.bed
Last updated