01编辑效率检测

环境搭建

删除环境

conda remove -n py36 --all

安装环境

conda create -n py36 python=3.6

报错

  • ValueError: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216 from C header, got 192 from PyObject

    原因是因为numpy版本太低了,卸载了安装旧的包;更新numpy包

    conda update numpy=1.16.1`

  • pkg_resources.DistributionNotFound: The 'kiwisolver>=1.0.1' distribution was not found and is required by matplotlib

    conda install kiwisolver=1.1.0

起步

  • 1.数据准备

    • 双端测序数据文件 R1.fastq R2.fastq

    • 引物文件,均为5到3方向,名称必须按照此格式TAB键分割

      A1-F  gcttGCGTtggagtgagtacggtgtgcAAAGTATGCCCCTTATGGACCCT
      A1-R  ctgtGCGTtgagttggatgctggatggTCCTCATCCCATTGTTCATTTCT
      # windows传进服务器会多一个\r字符,需要删除
      sed -i 's/\r//g' 文件名
    • 靶标文件和对应的sgRNA,TAB键分割

      A1    AAAGTATGCCCCTTATGGACCCTCTTGTGACATTATTTGGTAGTGTTCATGAAAAGCTTCCCGAGACAGGGAGCACGCGTAGTATGCTTTTTCCGAATTTTGGAAGCATGTTTAGTACAGCAGAGCCTCATGCTAGAAATGAACAATGGGATGAGGA AGACAGGGAGCACGCGTAGT  NA  NA
      # windows传瑾服务器会多一个\r字符,需要删除
      sed -i 's/\r//g' 文件名
      • 懒得打NA的情况

        # 输入文件格式
        A1    参考序列    sgRNA
        # 运行命令
        sed -i  's/$/\tNA\tNA/g'  靶标序列文件 
        # 命令直接对原文件进行了修改
        A1    参考序列    sgRNA    NA    NA
列与列之间Tab键分隔;第二和第三个文件中对应样品名称必须相同;
  • 2.合并双端测序数据

    # 进入conda环境
    conda activate CRISPResso2
    # 合并的命令
    flash -t 1 R1.fastq.gz R2.fastq.gz 2>&1 | tee flash.log
    # 输出文件会在R1和R2文件所在的目录,如下所示
    .
    ├── flash.log
    ├── hi-tom-1-sml_BKDL190837910-1a_1.fq.gz
    ├── hi-tom-1-sml_BKDL190837910-1a_2.fq.gz
    ├── MD5_hi-tom-1-sml_BKDL190837910-1a.txt
    ├── out.extendedFrags.fastq  # 下一步分析用到的文件
    ├── out.hist
    ├── out.histogram
    ├── out.notCombined_1.fastq
    └── out.notCombined_2.fastq
  • 3.反向互补R引物

    # prefix变量为引物文件和靶标序列文件所在目录的名称
    prefix="T1"
    # 进入引物文件和靶标序列文件所在目录
    cd /public/home/zpxu/djw/${prefix}
    # 分别修改引物文件和靶标序列文件的文件名
    mv 你的引物文件  ${prefix}.primer.txt
    mv 你的靶标文件  ${prefix}.txt
    ##############
    cat ${prefix}.primer.txt | sed 'N;s/\n/ /' | sed "s/-F//g" | awk '{print $1"\t"$2 > $1"-F.txt"}'
    ###################
    cat ${prefix}.primer.txt | sed 'N;s/\n/ /' | sed "s/-R//g" | awk '{print ">"$3"\n"$4 > $3"-R.txt"}'
    ###############
    for i in `ls | \grep "R" | sed "s/-R\.txt//g" | sort -V | uniq | xargs`
    do
    perl -nle'BEGIN {
        @map{ A, a, C, c, G, g, T, t } = ( T, t, G, g, C, c, A, a )
        }
        print /^>/ ?
            $_ :
                  join //, map $map{ $_ }, split //, scalar reverse
        ' ${i}-R.txt | awk '/^>/&&NR>1{print "";}{ printf "%s",/^>/ ? $0"\t":$0 }' | sed "s/^>//g" > ${i}-RC.txt
    cat ${i}-F.txt ${i}-RC.txt | sed 'N;s/\n/ /' | awk '{print $1,$2,$4}' | awk '{print $2".*"$3 > $1".txt"}'
    \rm ${i}-F.txt ${i}-R.txt ${i}-RC.txt
    done
  • 4.按barcode拆分reads

    for i in `ls | \grep -v "${prefix}" | sort -V | uniq | xargs`
    do
    for j in `cat ${i}`
    do
    cat ../out.extendedFrags.fastq | \grep -B1 -A2 -i "${j}" | grep -v "^--$" >> ${i%.*}.fastq
    gzip ${i%.*}.fastq
    done
    done
  • 5.按样品拆分扩增区域和靶标文件

    cat ${prefix}.txt | awk -F"\t" 'BEGIN{OFS="\t"} {print "-a "$2" -g "$3 > $1"_description.txt"}'
  • 6.检测编辑情况

    vim barcode.lsf

    编写lsf作业,使用vim命令,将下面的代码复制到barcode.lsf文件中

    文件路径不能包含 ‘-’

    #BSUB -q smp
    #BSUB -W 168:00
    #BSUB -n 1
    #BSUB -R span[hosts=1]
    #BSUB -o T1_cas9.out
    #BSUB -e T1_cas9.err
    #BSUB -J T1_cas9
    conda init bash
    conda activate CRISPResso2
    # 进入引物文件和靶标序列文件所在目录
    cd /public/home/zpxu/djw/T1
    ############## Analysis #####################
    for i in `ls | \grep "fastq.gz" | sed "s/\.fastq\.gz//g" | sort -V | xargs`
    do
    echo CRISPResso -r1 ${i}.fastq.gz | cat - ${i}_description.txt | sed 'N;s/\n/ /' > ${i}.sh
    bash ${i}.sh
    mv ${i}.sh ./CRISPResso_on_${i}
    mv CRISPResso_on_${i}.html ./CRISPResso_on_${i}
    tar -Ipigz -cf CRISPResso_on_${i}.tar.gz CRISPResso_on_${i}
    done
    echo "Finally! For more see http://crispresso.pinellolab.partners.org/static/demo/CRISPResso_on_hdr/CRISPResso2_report.html"
  • 提交任务

    bsub <barcode.lsf`

输出结果文件目录

.
├── 1a.Read_barplot.pdf
├── 1a.Read_barplot.png
├── 1b.Alignment_pie_chart.pdf
├── 1b.Alignment_pie_chart.png
├── 1c.Alignment_barplot.pdf
├── 1c.Alignment_barplot.png
├── 2a.Nucleotide_percentage_quilt.pdf
├── 2a.Nucleotide_percentage_quilt.png
├── 2b.Nucleotide_percentage_quilt_around_sgRNA_AGTTGGATCTACAGCCAACG.pdf
├── 2b.Nucleotide_percentage_quilt_around_sgRNA_AGTTGGATCTACAGCCAACG.png
├── 3a.Indel_size_distribution.pdf
├── 3a.Indel_size_distribution.png
├── 3b.Insertion_deletion_substitutions_size_hist.pdf
├── 3b.Insertion_deletion_substitutions_size_hist.png
├── 4a.Combined_insertion_deletion_substitution_locations.pdf
├── 4a.Combined_insertion_deletion_substitution_locations.png
├── 4b.Insertion_deletion_substitution_locations.pdf
├── 4b.Insertion_deletion_substitution_locations.png
├── 4c.Quantification_window_insertion_deletion_substitution_locations.pdf
├── 4c.Quantification_window_insertion_deletion_substitution_locations.png
├── 4d.Position_dependent_average_indel_size.pdf
├── 4d.Position_dependent_average_indel_size.png
├── 9.Alleles_frequency_table_around_sgRNA_AGTTGGATCTACAGCCAACG.pdf
├── 9.Alleles_frequency_table_around_sgRNA_AGTTGGATCTACAGCCAACG.png
├── A2.sh
├── Alleles_frequency_table_around_sgRNA_AGTTGGATCTACAGCCAACG.txt
├── Alleles_frequency_table.zip
├── CRISPResso2_info.pickle
├── CRISPResso_mapping_statistics.txt
├── CRISPResso_on_A2.html
├── CRISPResso_quantification_of_editing_frequency.txt
├── CRISPResso_RUNNING_LOG.txt
├── Deletion_histogram.txt
├── Effect_vector_combined.txt
├── Effect_vector_deletion.txt
├── Effect_vector_insertion.txt
├── Effect_vector_substitution.txt
├── Indel_histogram.txt
├── Insertion_histogram.txt
├── Modification_count_vectors.txt
├── Nucleotide_frequency_table.txt
├── Nucleotide_percentage_table.txt
├── Quantification_window_modification_count_vectors.txt
├── Quantification_window_nucleotide_frequency_table.txt
├── Quantification_window_nucleotide_percentage_table.txt
├── Quantification_window_substitution_frequency_table.txt
├── Substitution_frequency_table.txt
└── Substitution_histogram.txt

Last updated