01编辑效率检测
环境搭建
删除环境
conda remove -n py36 --all
安装环境
conda create -n py36 python=3.6
报错
ValueError: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216 from C header, got 192 from PyObject
原因是因为numpy版本太低了,卸载了安装旧的包;更新numpy包
conda update numpy=1.16.1`
pkg_resources.DistributionNotFound: The 'kiwisolver>=1.0.1' distribution was not found and is required by matplotlib
conda install kiwisolver=1.1.0
起步
1.数据准备
双端测序数据文件 R1.fastq R2.fastq
引物文件,均为5到3方向,名称必须按照此格式TAB键分割
A1-F gcttGCGTtggagtgagtacggtgtgcAAAGTATGCCCCTTATGGACCCT A1-R ctgtGCGTtgagttggatgctggatggTCCTCATCCCATTGTTCATTTCT # windows传进服务器会多一个\r字符,需要删除 sed -i 's/\r//g' 文件名
靶标文件和对应的sgRNA,TAB键分割
A1 AAAGTATGCCCCTTATGGACCCTCTTGTGACATTATTTGGTAGTGTTCATGAAAAGCTTCCCGAGACAGGGAGCACGCGTAGTATGCTTTTTCCGAATTTTGGAAGCATGTTTAGTACAGCAGAGCCTCATGCTAGAAATGAACAATGGGATGAGGA AGACAGGGAGCACGCGTAGT NA NA # windows传瑾服务器会多一个\r字符,需要删除 sed -i 's/\r//g' 文件名
懒得打NA的情况
# 输入文件格式 A1 参考序列 sgRNA # 运行命令 sed -i 's/$/\tNA\tNA/g' 靶标序列文件 # 命令直接对原文件进行了修改 A1 参考序列 sgRNA NA NA
列与列之间Tab键分隔;第二和第三个文件中对应样品名称必须相同;
2.合并双端测序数据
# 进入conda环境 conda activate CRISPResso2 # 合并的命令 flash -t 1 R1.fastq.gz R2.fastq.gz 2>&1 | tee flash.log # 输出文件会在R1和R2文件所在的目录,如下所示 . ├── flash.log ├── hi-tom-1-sml_BKDL190837910-1a_1.fq.gz ├── hi-tom-1-sml_BKDL190837910-1a_2.fq.gz ├── MD5_hi-tom-1-sml_BKDL190837910-1a.txt ├── out.extendedFrags.fastq # 下一步分析用到的文件 ├── out.hist ├── out.histogram ├── out.notCombined_1.fastq └── out.notCombined_2.fastq
3.反向互补R引物
# prefix变量为引物文件和靶标序列文件所在目录的名称 prefix="T1" # 进入引物文件和靶标序列文件所在目录 cd /public/home/zpxu/djw/${prefix} # 分别修改引物文件和靶标序列文件的文件名 mv 你的引物文件 ${prefix}.primer.txt mv 你的靶标文件 ${prefix}.txt ############## cat ${prefix}.primer.txt | sed 'N;s/\n/ /' | sed "s/-F//g" | awk '{print $1"\t"$2 > $1"-F.txt"}' ################### cat ${prefix}.primer.txt | sed 'N;s/\n/ /' | sed "s/-R//g" | awk '{print ">"$3"\n"$4 > $3"-R.txt"}' ############### for i in `ls | \grep "R" | sed "s/-R\.txt//g" | sort -V | uniq | xargs` do perl -nle'BEGIN { @map{ A, a, C, c, G, g, T, t } = ( T, t, G, g, C, c, A, a ) } print /^>/ ? $_ : join //, map $map{ $_ }, split //, scalar reverse ' ${i}-R.txt | awk '/^>/&&NR>1{print "";}{ printf "%s",/^>/ ? $0"\t":$0 }' | sed "s/^>//g" > ${i}-RC.txt cat ${i}-F.txt ${i}-RC.txt | sed 'N;s/\n/ /' | awk '{print $1,$2,$4}' | awk '{print $2".*"$3 > $1".txt"}' \rm ${i}-F.txt ${i}-R.txt ${i}-RC.txt done
4.按barcode拆分reads
for i in `ls | \grep -v "${prefix}" | sort -V | uniq | xargs` do for j in `cat ${i}` do cat ../out.extendedFrags.fastq | \grep -B1 -A2 -i "${j}" | grep -v "^--$" >> ${i%.*}.fastq gzip ${i%.*}.fastq done done
5.按样品拆分扩增区域和靶标文件
cat ${prefix}.txt | awk -F"\t" 'BEGIN{OFS="\t"} {print "-a "$2" -g "$3 > $1"_description.txt"}'
6.检测编辑情况
vim barcode.lsf
编写lsf作业,使用vim命令,将下面的代码复制到barcode.lsf文件中
文件路径不能包含 ‘-’
#BSUB -q smp #BSUB -W 168:00 #BSUB -n 1 #BSUB -R span[hosts=1] #BSUB -o T1_cas9.out #BSUB -e T1_cas9.err #BSUB -J T1_cas9 conda init bash conda activate CRISPResso2 # 进入引物文件和靶标序列文件所在目录 cd /public/home/zpxu/djw/T1 ############## Analysis ##################### for i in `ls | \grep "fastq.gz" | sed "s/\.fastq\.gz//g" | sort -V | xargs` do echo CRISPResso -r1 ${i}.fastq.gz | cat - ${i}_description.txt | sed 'N;s/\n/ /' > ${i}.sh bash ${i}.sh mv ${i}.sh ./CRISPResso_on_${i} mv CRISPResso_on_${i}.html ./CRISPResso_on_${i} tar -Ipigz -cf CRISPResso_on_${i}.tar.gz CRISPResso_on_${i} done echo "Finally! For more see http://crispresso.pinellolab.partners.org/static/demo/CRISPResso_on_hdr/CRISPResso2_report.html"
提交任务
bsub <barcode.lsf`
输出结果文件目录
.
├── 1a.Read_barplot.pdf
├── 1a.Read_barplot.png
├── 1b.Alignment_pie_chart.pdf
├── 1b.Alignment_pie_chart.png
├── 1c.Alignment_barplot.pdf
├── 1c.Alignment_barplot.png
├── 2a.Nucleotide_percentage_quilt.pdf
├── 2a.Nucleotide_percentage_quilt.png
├── 2b.Nucleotide_percentage_quilt_around_sgRNA_AGTTGGATCTACAGCCAACG.pdf
├── 2b.Nucleotide_percentage_quilt_around_sgRNA_AGTTGGATCTACAGCCAACG.png
├── 3a.Indel_size_distribution.pdf
├── 3a.Indel_size_distribution.png
├── 3b.Insertion_deletion_substitutions_size_hist.pdf
├── 3b.Insertion_deletion_substitutions_size_hist.png
├── 4a.Combined_insertion_deletion_substitution_locations.pdf
├── 4a.Combined_insertion_deletion_substitution_locations.png
├── 4b.Insertion_deletion_substitution_locations.pdf
├── 4b.Insertion_deletion_substitution_locations.png
├── 4c.Quantification_window_insertion_deletion_substitution_locations.pdf
├── 4c.Quantification_window_insertion_deletion_substitution_locations.png
├── 4d.Position_dependent_average_indel_size.pdf
├── 4d.Position_dependent_average_indel_size.png
├── 9.Alleles_frequency_table_around_sgRNA_AGTTGGATCTACAGCCAACG.pdf
├── 9.Alleles_frequency_table_around_sgRNA_AGTTGGATCTACAGCCAACG.png
├── A2.sh
├── Alleles_frequency_table_around_sgRNA_AGTTGGATCTACAGCCAACG.txt
├── Alleles_frequency_table.zip
├── CRISPResso2_info.pickle
├── CRISPResso_mapping_statistics.txt
├── CRISPResso_on_A2.html
├── CRISPResso_quantification_of_editing_frequency.txt
├── CRISPResso_RUNNING_LOG.txt
├── Deletion_histogram.txt
├── Effect_vector_combined.txt
├── Effect_vector_deletion.txt
├── Effect_vector_insertion.txt
├── Effect_vector_substitution.txt
├── Indel_histogram.txt
├── Insertion_histogram.txt
├── Modification_count_vectors.txt
├── Nucleotide_frequency_table.txt
├── Nucleotide_percentage_table.txt
├── Quantification_window_modification_count_vectors.txt
├── Quantification_window_nucleotide_frequency_table.txt
├── Quantification_window_nucleotide_percentage_table.txt
├── Quantification_window_substitution_frequency_table.txt
├── Substitution_frequency_table.txt
└── Substitution_histogram.txt
Last updated