01编辑效率检测

环境搭建

删除环境

conda remove -n py36 --all

安装环境

conda create -n py36 python=3.6

报错

ValueError: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216 from C header, got 192 from PyObject
原因是因为numpy版本太低了，卸载了安装旧的包;更新numpy包
conda update numpy=1.16.1`
pkg_resources.DistributionNotFound: The 'kiwisolver>=1.0.1' distribution was not found and is required by matplotlib
conda install kiwisolver=1.1.0

起步

1.数据准备

双端测序数据文件 R1.fastq R2.fastq

引物文件，均为5到3方向，名称必须按照此格式TAB键分割

A1-F  gcttGCGTtggagtgagtacggtgtgcAAAGTATGCCCCTTATGGACCCT
A1-R  ctgtGCGTtgagttggatgctggatggTCCTCATCCCATTGTTCATTTCT
# windows传进服务器会多一个\r字符,需要删除
sed -i 's/\r//g' 文件名

靶标文件和对应的sgRNA,TAB键分割

A1    AAAGTATGCCCCTTATGGACCCTCTTGTGACATTATTTGGTAGTGTTCATGAAAAGCTTCCCGAGACAGGGAGCACGCGTAGTATGCTTTTTCCGAATTTTGGAAGCATGTTTAGTACAGCAGAGCCTCATGCTAGAAATGAACAATGGGATGAGGA AGACAGGGAGCACGCGTAGT  NA  NA
# windows传瑾服务器会多一个\r字符,需要删除
sed -i 's/\r//g' 文件名

懒得打NA的情况

# 输入文件格式
A1    参考序列    sgRNA
# 运行命令
sed -i  's/$/\tNA\tNA/g'  靶标序列文件 
# 命令直接对原文件进行了修改
A1    参考序列    sgRNA    NA    NA

列与列之间Tab键分隔；第二和第三个文件中对应样品名称必须相同；

2.合并双端测序数据

# 进入conda环境
conda activate CRISPResso2
# 合并的命令
flash -t 1 R1.fastq.gz R2.fastq.gz 2>&1 | tee flash.log
# 输出文件会在R1和R2文件所在的目录，如下所示
.
├── flash.log
├── hi-tom-1-sml_BKDL190837910-1a_1.fq.gz
├── hi-tom-1-sml_BKDL190837910-1a_2.fq.gz
├── MD5_hi-tom-1-sml_BKDL190837910-1a.txt
├── out.extendedFrags.fastq  # 下一步分析用到的文件
├── out.hist
├── out.histogram
├── out.notCombined_1.fastq
└── out.notCombined_2.fastq

3.反向互补R引物

# prefix变量为引物文件和靶标序列文件所在目录的名称
prefix="T1"
# 进入引物文件和靶标序列文件所在目录
cd /public/home/zpxu/djw/${prefix}
# 分别修改引物文件和靶标序列文件的文件名
mv 你的引物文件  ${prefix}.primer.txt
mv 你的靶标文件  ${prefix}.txt
##############
cat ${prefix}.primer.txt | sed 'N;s/\n/ /' | sed "s/-F//g" | awk '{print $1"\t"$2 > $1"-F.txt"}'
###################
cat ${prefix}.primer.txt | sed 'N;s/\n/ /' | sed "s/-R//g" | awk '{print ">"$3"\n"$4 > $3"-R.txt"}'
###############
for i in `ls | \grep "R" | sed "s/-R\.txt//g" | sort -V | uniq | xargs`
do
perl -nle'BEGIN {
    @map{ A, a, C, c, G, g, T, t } = ( T, t, G, g, C, c, A, a )
    }
    print /^>/ ?
        $_ :
              join //, map $map{ $_ }, split //, scalar reverse
    ' ${i}-R.txt | awk '/^>/&&NR>1{print "";}{ printf "%s",/^>/ ? $0"\t":$0 }' | sed "s/^>//g" > ${i}-RC.txt
cat ${i}-F.txt ${i}-RC.txt | sed 'N;s/\n/ /' | awk '{print $1,$2,$4}' | awk '{print $2".*"$3 > $1".txt"}'
\rm ${i}-F.txt ${i}-R.txt ${i}-RC.txt
done

4.按barcode拆分reads

for i in `ls | \grep -v "${prefix}" | sort -V | uniq | xargs`
do
for j in `cat ${i}`
do
cat ../out.extendedFrags.fastq | \grep -B1 -A2 -i "${j}" | grep -v "^--$" >> ${i%.*}.fastq
gzip ${i%.*}.fastq
done
done

5.按样品拆分扩增区域和靶标文件

cat ${prefix}.txt | awk -F"\t" 'BEGIN{OFS="\t"} {print "-a "$2" -g "$3 > $1"_description.txt"}'

6.检测编辑情况

vim barcode.lsf

编写lsf作业，使用vim命令,将下面的代码复制到barcode.lsf文件中

文件路径不能包含 ‘-’

#BSUB -q smp
#BSUB -W 168:00
#BSUB -n 1
#BSUB -R span[hosts=1]
#BSUB -o T1_cas9.out
#BSUB -e T1_cas9.err
#BSUB -J T1_cas9
conda init bash
conda activate CRISPResso2
# 进入引物文件和靶标序列文件所在目录
cd /public/home/zpxu/djw/T1
############## Analysis #####################
for i in `ls | \grep "fastq.gz" | sed "s/\.fastq\.gz//g" | sort -V | xargs`
do
echo CRISPResso -r1 ${i}.fastq.gz | cat - ${i}_description.txt | sed 'N;s/\n/ /' > ${i}.sh
bash ${i}.sh
mv ${i}.sh ./CRISPResso_on_${i}
mv CRISPResso_on_${i}.html ./CRISPResso_on_${i}
tar -Ipigz -cf CRISPResso_on_${i}.tar.gz CRISPResso_on_${i}
done
echo "Finally! For more see http://crispresso.pinellolab.partners.org/static/demo/CRISPResso_on_hdr/CRISPResso2_report.html"

提交任务
bsub <barcode.lsf`

输出结果文件目录

.
├── 1a.Read_barplot.pdf
├── 1a.Read_barplot.png
├── 1b.Alignment_pie_chart.pdf
├── 1b.Alignment_pie_chart.png
├── 1c.Alignment_barplot.pdf
├── 1c.Alignment_barplot.png
├── 2a.Nucleotide_percentage_quilt.pdf
├── 2a.Nucleotide_percentage_quilt.png
├── 2b.Nucleotide_percentage_quilt_around_sgRNA_AGTTGGATCTACAGCCAACG.pdf
├── 2b.Nucleotide_percentage_quilt_around_sgRNA_AGTTGGATCTACAGCCAACG.png
├── 3a.Indel_size_distribution.pdf
├── 3a.Indel_size_distribution.png
├── 3b.Insertion_deletion_substitutions_size_hist.pdf
├── 3b.Insertion_deletion_substitutions_size_hist.png
├── 4a.Combined_insertion_deletion_substitution_locations.pdf
├── 4a.Combined_insertion_deletion_substitution_locations.png
├── 4b.Insertion_deletion_substitution_locations.pdf
├── 4b.Insertion_deletion_substitution_locations.png
├── 4c.Quantification_window_insertion_deletion_substitution_locations.pdf
├── 4c.Quantification_window_insertion_deletion_substitution_locations.png
├── 4d.Position_dependent_average_indel_size.pdf
├── 4d.Position_dependent_average_indel_size.png
├── 9.Alleles_frequency_table_around_sgRNA_AGTTGGATCTACAGCCAACG.pdf
├── 9.Alleles_frequency_table_around_sgRNA_AGTTGGATCTACAGCCAACG.png
├── A2.sh
├── Alleles_frequency_table_around_sgRNA_AGTTGGATCTACAGCCAACG.txt
├── Alleles_frequency_table.zip
├── CRISPResso2_info.pickle
├── CRISPResso_mapping_statistics.txt
├── CRISPResso_on_A2.html
├── CRISPResso_quantification_of_editing_frequency.txt
├── CRISPResso_RUNNING_LOG.txt
├── Deletion_histogram.txt
├── Effect_vector_combined.txt
├── Effect_vector_deletion.txt
├── Effect_vector_insertion.txt
├── Effect_vector_substitution.txt
├── Indel_histogram.txt
├── Insertion_histogram.txt
├── Modification_count_vectors.txt
├── Nucleotide_frequency_table.txt
├── Nucleotide_percentage_table.txt
├── Quantification_window_modification_count_vectors.txt
├── Quantification_window_nucleotide_frequency_table.txt
├── Quantification_window_nucleotide_percentage_table.txt
├── Quantification_window_substitution_frequency_table.txt
├── Substitution_frequency_table.txt
└── Substitution_histogram.txt

PrevioussgRNA设计 NextHi-TOM

Last updated 6 years ago

Was this helpful?