Skip to content
Snippets Groups Projects
snakefile 13.9 KiB
Newer Older
"""
Author: Susheel Bhanu BUSI
Affiliation: ESB group LCSB UniLU
Date: [2019-09-13]
Run: snakemake -s binning_snakefile
Latest modification:
"""

configfile:"config.yaml"

DATA_DIR=config['data_dir']
RESULTS_DIR=config['results_dir']
SAMPLES=config['samples']
# DATABASE=config['database']

###########
rule all:
#    input: expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/metaspades.bwt"))
#    input: expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/{reads}/metaspades.bam"), reads=["sr"])
#    input: expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/{reads}/metaspades.bam"), reads=["sr", "lr"])
#    input: expand(os.path.join(DATA_DIR, "{sample}/dastool_output/{sample}/"), sample=SAMPLES),
#    input: expand(os.path.join(DATA_DIR, "checkm_output/{sample}/lineage.ms"), sample=SAMPLES)

######
# WRAPPER RULES
######
# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule.
rule MAPPING:
    input:
        expand(os.path.join(DATA_DIR, "metaspades.bwt")),
        expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/sr/metaspades.bam")),
        expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/lr/metaspades.bam")),
        expand(os.path.join(DATA_DIR, "metaspades/mmi/lr/lr_hybrid.mmi")),
        expand(os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades.bam")),
        expand(os.path.join(DATA_DIR, "metaspades/mmi/sr/sr_hybrid.mmi")),
        expand(os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades.bam")),
        expand(os.path.join(DATA_DIR, "metaspades/{mapper}/merged_bam/metaspades.bam"), mapper=["bwa-mem", "mmi"])
    output: "bwa_and_mmi_mapping.done"
    shell:
        """
        touch {output}
        """

rule BINNING:
    input:
        expand(os.path.join(DATA_DIR, "{sample}/dastool_output/{sample}"), sample=SAMPLES),
        expand(os.path.join(DATA_DIR, "checkm_output/{sample}/lineage.ms"), sample=SAMPLES)
#        expand(os.path.join(DATA_DIR, "gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv"), sample=SAMPLES)

#######################
## Mapping to hybrid ##
#######################
rule build_bwa_index:
    input: "metaspades.fna"
    output:
#           done=touch("bwa_index.done")
           index_amb=os.path.join(DATA_DIR, "{prefix}.amb"),
           index_ann=os.path.join(DATA_DIR, "{prefix}.ann"),
           index_bwt=os.path.join(DATA_DIR, "{prefix}.bwt"),
           index_pac=os.path.join(DATA_DIR, "{prefix}.pac"),
           index_sa=os.path.join(DATA_DIR, "{prefix}.sa")
    log: os.path.join(DATA_DIR, "metaspades/{prefix}.bwa_index")
    conda: "envs/bwa.yaml"
    shell: """
           (date &&\
           bwa index {input} -p {wildcards.prefix} &&\
           date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
           """

rule map_short_reads_to_hybrid_contigs_w_bwa_mem:
    input:
          query_r1="NEB2_MG_S17_R1_001.fastp.fq.gz",
          query_r2="NEB2_MG_S17_R2_001.fastp.fq.gz",
          index_amb=os.path.join(DATA_DIR, "metaspades.amb"),
          index_ann=os.path.join(DATA_DIR, "metaspades.ann"),
          index_bwt=os.path.join(DATA_DIR, "metaspades.bwt"),
          index_pac=os.path.join(DATA_DIR, "metaspades.pac"),
          index_sa=os.path.join(DATA_DIR, "metaspades.sa"),
          index_fa=os.path.join(DATA_DIR, "metaspades.fna")
    output:
          os.path.join(DATA_DIR, "metaspades/bwa-mem/sr/metaspades.bam")
    params:
          index_prefix=os.path.join(DATA_DIR, "metaspades"),
          out_prefix=os.path.join(DATA_DIR, "metaspades/bwa-mem/sr/metaspades")
    log: os.path.join(DATA_DIR, "metaspades/sr_contigs.bwa_mem_samtools")
    conda: "envs/bwa.yaml"
    threads:
          config["bwa"]["threads"]
    shell: """
           (date &&\
           bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
           date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
           """

rule map_long_reads_to_hybrid_contigs_w_bwa_mem:
    input:
          query_lr="barcode07.fastq.gz",
          index_amb=os.path.join(DATA_DIR, "metaspades.amb"),
          index_ann=os.path.join(DATA_DIR, "metaspades.ann"),
          index_bwt=os.path.join(DATA_DIR, "metaspades.bwt"),
          index_pac=os.path.join(DATA_DIR, "metaspades.pac"),
          index_sa=os.path.join(DATA_DIR, "metaspades.sa"),
          index_fa=os.path.join(DATA_DIR, "metaspades.fna")
    output:
          os.path.join(DATA_DIR, "metaspades/bwa-mem/lr/metaspades.bam")
    params:
          index_prefix=os.path.join(DATA_DIR, "metaspades"),
          out_prefix=os.path.join(DATA_DIR, "metaspades/bwa-mem/lr/metaspades")
    log:  os.path.join(DATA_DIR, "metaspades/lr_contigs.bwa_mem_samtools")
    conda: "envs/bwa.yaml"
    threads: config["bwa"]["threads"]
    shell: """
          (date &&\
          bwa mem -x ont2d -t {threads} {params.index_prefix} {input.query_lr} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
          date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
          """

rule build_minimap2_long_read_contigs_ont_index:
    input: "metaspades.fna"
    output: os.path.join(DATA_DIR, "metaspades/mmi/lr/lr_hybrid.mmi")
    conda: "envs/minimap2.yaml"
    shell:
           """
            date
            minimap2 -x map-ont -d {output} {input}
            date
            """

rule map_long_reads_to_long_read_contigs:
    input:
          query="barcode07.fastq.gz",
          index=os.path.join(DATA_DIR, "metaspades/mmi/lr/lr_hybrid.mmi"),
          index_fa="metaspades.fna"
    output: os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades.bam")
    params:
          prefix=os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades")
    conda: "envs/minimap2.yaml"
    threads: config["minimap2"]["threads"]
    log: os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades.minimap2_samtools")
    shell: """
            (date &&\
            minimap2 -a -t {threads} {input.index} {input.query} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\
            date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
            """

rule build_minimap2_long_read_contigs_sr_index:
    input: "metaspades.fna"
    output: os.path.join(DATA_DIR, "metaspades/mmi/sr/sr_hybrid.mmi")
    conda: "envs/minimap2.yaml"
    shell:
            """
            date
            minimap2 -x sr -d {output} {input}
            date
            """

rule map_short_reads_to_long_read_contigs_w_minimap2:
    input:
          query_r1="NEB2_MG_S17_R1_001.fastp.fq.gz",
          query_r2="NEB2_MG_S17_R2_001.fastp.fq.gz",
          index=os.path.join(DATA_DIR, "metaspades/mmi/sr/sr_hybrid.mmi"),
          index_fa="metaspades.fna"
    output: os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades.bam")
    params:
          prefix=os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades")
    conda: "envs/minimap2.yaml"
    threads: config["minimap2"]["threads"]
    log: os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades.minimap2_samtools")
    shell:
           """
            (date &&\
           minimap2 -a -t {threads} {input.index} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output}
           date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
            """
 
rule merge_bam_files_for_metaspades:
    input:
          bam_sr=os.path.join(DATA_DIR, "metaspades/{mapper}/sr/metaspades.bam"),
          bam_lr=os.path.join(DATA_DIR, "metaspades/{mapper}/lr/metaspades.bam")
    output: os.path.join(DATA_DIR, "metaspades/{mapper}/merged_bam/metaspades.bam")
    conda: "/home/users/sbusi/apps/environments/binning.yml"
    shell:
          """
          (date &&\
          samtools merge {output} {input.bam_sr} {input.bam_lr} &&\
          date)
          """

   
#####################################
########### MAXBIN2 #################
#####################################
rule maxbin2:
    input:
          bbref="{datadir}/{sample}.fna",
          bbam="{datadir}/{sample}/{sample}.bam"
    threads:config["threads"]
    conda:"/home/users/sbusi/apps/environments/binning.yml"
    output:
           map="{datadir}/{sample}/{sample}.sam",
           coverage="{datadir}/{sample}/{sample}_cov.txt",
           abund="{datadir}/{sample}/{sample}_abundance.txt",
           file="{datadir}/{sample}/maxbin_output/maxbin_output.001.fasta"
#        mx=directory("{datadir}/{sample}/maxbin_output/")
    shell:
           """
            date
            samtools view -h -o {output.map} {input.bbam}
            pileup.sh in={output.map} out={output.coverage}
            awk '{{print $1"\\t"$2}}' {output.coverage} | grep -v '^#' > {output.abund}
            run_MaxBin.pl -thread {threads} -contig {input.bbref} -out {wildcards.datadir}/{wildcards.sample}/maxbin_output/maxbin_output -abund {output.abund}
            touch maxbin.done
            date
            """


#####################################
########### METABAT2 ################
#####################################

rule depth_files:
    input:
          dep1="{datadir}/{sample}/{sample}.bam"
    threads:config["threads"]
    conda:"/home/users/sbusi/apps/environments/binning.yml"
    output:
          depout="{datadir}/{sample}/{sample}_depth.txt"
    shell:
           """
            date
            jgi_summarize_bam_contig_depths --outputDepth {output.depout} {input.dep1}
            touch depth_file.done 
            date
            """

rule metabat2:
    input:
           met1="{datadir}/{sample}.fna",
           met2="{datadir}/{sample}/{sample}_depth.txt"
    threads:config["threads"]
    conda:"/home/users/sbusi/apps/environments/binning.yml"
    output:
           outdir=directory("{datadir}/{sample}/metabat_output/")
    shell:
           """
            date
            metabat2 -i {input.met1} -a {input.met2} -o {wildcards.datadir}/{wildcards.sample}/metabat_output/{wildcards.sample}.metabat -t {threads} -m 1500 -v --unbinned
            touch metabat2.done
            date
            """

#####################################
########### DAS_Tool ################
#####################################
rule scaffold_list:
    input:
        sca1="{datadir}/{sample}/metabat_output/",
#        sca2="{datadir}/{sample}/maxbin_output/"
        file = "{datadir}/{sample}/maxbin_output/maxbin_output.001.fasta"
    threads:config["threads"]
    conda:"/home/users/sbusi/apps/environments/base.yml"
    output:
            scout1="{datadir}/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv",
            scout2="{datadir}/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv"
    shell:
            """
            date
            Fasta_to_Scaffolds2Bin.sh -i {input.sca1} -e fa > {output.scout1}
            Fasta_to_Scaffolds2Bin.sh -i {wildcards.datadir}/{wildcards.sample}/maxbin_output -e fasta > {output.scout2}
            date
            touch scaffold_list.done
            """

rule DAS_Tool:
    input:
            da1="{datadir}/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv",
            da2="{datadir}/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv",
            da3="{datadir}/{sample}.fna",
            db=config["dastool_database"]
    threads:config["threads"]
    conda:"/home/users/sbusi/apps/environments/base.yml"
    output:
            daout=directory("{datadir}/{sample}/dastool_output/{sample}")
    shell:
            """
            date
            DAS_Tool -i {input.da1},{input.da2} -c {input.da3} -o {output.daout} --search_engine diamond -l maxbin2,metabat2 --write_bins 1 --write_bin_evals 1 --threads {threads} --db_directory {input.db} --create_plots 1 &&\
            2> >(tee {log}.stderr) > >(tee {log}.stdout)
            touch das_tool.done
            date
            """
#checkpoint collect_bins:
#    input:
#        col1="{datadir}/{sample}/dastool_output"
#        col1="{datadir}/{sample}/dastool_output/{file_i}.fa"
#    output:
#        colout="{datadir}/{sample}/dastool_output/high_quality_bins/{file_i}.fa"
#    shell:    """
#        date
#        cp {input.col1} {output.colout}
#        date
#        """

#####################################
######## GTDBTK & CHECKM ############
#####################################

rule gtdbtk:
    input:
           gen="{datadir}/{sample}/dastool_output/{sample}/"
    threads:config["threads"]
    conda:"gtdbtk.yml"
    output:
           gtdb=directory("{datadir}/gtdbtk_output/{sample}")
    shell:"""
           date
           gtdbtk classify_wf --cpus {threads} -x fa --genome_dir {input.gen} --out_dir {output.gtdb}
           touch gtdbtk.done
           date
           """

rule checkm:
    input:
           chkm="{datadir}/{sample}/dastool_output/{sample}/"
    threads:config["threads"]
    conda:"checkm.yml"
    output:
           chkmdir=directory("{datadir}/checkm_output/{sample}/lineage.ms")
    shell:
           """
           date
           checkm lineage_wf -r -t {threads} -x fa {input.chkm} {output.chkmdir}
           touch checkm.done
           date
           """

rule plot_checkm:
    input:
           pl1="{datadir}/checkm_output/{sample}",
           pl2="{datadir}/{sample}/dastool_output/high_quality_bins"
    threads:config["threads"]
    conda:"/home/users/sbusi/apps/environments/checkm.yml"
    output:
           plt=directory("{datadir}/checkm_output/{sample}/plots")
    shell:
           """
           date
           checkm bin_qa_plot --image_type png --width 8 --dpi 200 --font_size 8 -x fa -q {input.pl1} {input.pl2} {output.plt}
           touch plot_checkm.done
           date
           """