From bf7f5ad2cc63be954c1ff8e71b440459f5383109 Mon Sep 17 00:00:00 2001 From: "Susheel Bhanu Busi susheel.busi@uni.lu" <susheel.busi@uni.lu> Date: Mon, 20 Apr 2020 22:02:12 +0200 Subject: [PATCH] use this for Binning FLYE and MEGAHIT assemblies. To be incorporated into the original Snakefile --- snakefile | 351 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100755 snakefile diff --git a/snakefile b/snakefile new file mode 100755 index 0000000..1fb685e --- /dev/null +++ b/snakefile @@ -0,0 +1,351 @@ +""" +Author: Susheel Bhanu BUSI +Affiliation: ESB group LCSB UniLU +Date: [2019-09-13] +Run: snakemake -s binning_snakefile +Latest modification: +""" + +configfile:"config.yaml" + +DATA_DIR=config['data_dir'] +RESULTS_DIR=config['results_dir'] +SAMPLES=config['samples'] +# DATABASE=config['database'] + +########### +rule all: +# input: expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/metaspades.bwt")) +# input: expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/{reads}/metaspades.bam"), reads=["sr"]) +# input: expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/{reads}/metaspades.bam"), reads=["sr", "lr"]) +# input: expand(os.path.join(DATA_DIR, "{sample}/dastool_output/{sample}/"), sample=SAMPLES), +# input: expand(os.path.join(DATA_DIR, "checkm_output/{sample}/lineage.ms"), sample=SAMPLES) + +###### +# WRAPPER RULES +###### +# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule. +rule MAPPING: + input: + expand(os.path.join(DATA_DIR, "metaspades.bwt")), + expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/sr/metaspades.bam")), + expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/lr/metaspades.bam")), + expand(os.path.join(DATA_DIR, "metaspades/mmi/lr/lr_hybrid.mmi")), + expand(os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades.bam")), + expand(os.path.join(DATA_DIR, "metaspades/mmi/sr/sr_hybrid.mmi")), + expand(os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades.bam")), + expand(os.path.join(DATA_DIR, "metaspades/{mapper}/merged_bam/metaspades.bam"), mapper=["bwa-mem", "mmi"]) + output: "bwa_and_mmi_mapping.done" + shell: + """ + touch {output} + """ + +rule BINNING: + input: + expand(os.path.join(DATA_DIR, "{sample}/dastool_output/{sample}"), sample=SAMPLES), + expand(os.path.join(DATA_DIR, "checkm_output/{sample}/lineage.ms"), sample=SAMPLES) +# expand(os.path.join(DATA_DIR, "gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv"), sample=SAMPLES) + +####################### +## Mapping to hybrid ## +####################### +rule build_bwa_index: + input: "metaspades.fna" + output: +# done=touch("bwa_index.done") + index_amb=os.path.join(DATA_DIR, "{prefix}.amb"), + index_ann=os.path.join(DATA_DIR, "{prefix}.ann"), + index_bwt=os.path.join(DATA_DIR, "{prefix}.bwt"), + index_pac=os.path.join(DATA_DIR, "{prefix}.pac"), + index_sa=os.path.join(DATA_DIR, "{prefix}.sa") + log: os.path.join(DATA_DIR, "metaspades/{prefix}.bwa_index") + conda: "envs/bwa.yaml" + shell: """ + (date &&\ + bwa index {input} -p {wildcards.prefix} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule map_short_reads_to_hybrid_contigs_w_bwa_mem: + input: + query_r1="NEB2_MG_S17_R1_001.fastp.fq.gz", + query_r2="NEB2_MG_S17_R2_001.fastp.fq.gz", + index_amb=os.path.join(DATA_DIR, "metaspades.amb"), + index_ann=os.path.join(DATA_DIR, "metaspades.ann"), + index_bwt=os.path.join(DATA_DIR, "metaspades.bwt"), + index_pac=os.path.join(DATA_DIR, "metaspades.pac"), + index_sa=os.path.join(DATA_DIR, "metaspades.sa"), + index_fa=os.path.join(DATA_DIR, "metaspades.fna") + output: + os.path.join(DATA_DIR, "metaspades/bwa-mem/sr/metaspades.bam") + params: + index_prefix=os.path.join(DATA_DIR, "metaspades"), + out_prefix=os.path.join(DATA_DIR, "metaspades/bwa-mem/sr/metaspades") + log: os.path.join(DATA_DIR, "metaspades/sr_contigs.bwa_mem_samtools") + conda: "envs/bwa.yaml" + threads: + config["bwa"]["threads"] + shell: """ + (date &&\ + bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule map_long_reads_to_hybrid_contigs_w_bwa_mem: + input: + query_lr="barcode07.fastq.gz", + index_amb=os.path.join(DATA_DIR, "metaspades.amb"), + index_ann=os.path.join(DATA_DIR, "metaspades.ann"), + index_bwt=os.path.join(DATA_DIR, "metaspades.bwt"), + index_pac=os.path.join(DATA_DIR, "metaspades.pac"), + index_sa=os.path.join(DATA_DIR, "metaspades.sa"), + index_fa=os.path.join(DATA_DIR, "metaspades.fna") + output: + os.path.join(DATA_DIR, "metaspades/bwa-mem/lr/metaspades.bam") + params: + index_prefix=os.path.join(DATA_DIR, "metaspades"), + out_prefix=os.path.join(DATA_DIR, "metaspades/bwa-mem/lr/metaspades") + log: os.path.join(DATA_DIR, "metaspades/lr_contigs.bwa_mem_samtools") + conda: "envs/bwa.yaml" + threads: config["bwa"]["threads"] + shell: """ + (date &&\ + bwa mem -x ont2d -t {threads} {params.index_prefix} {input.query_lr} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule build_minimap2_long_read_contigs_ont_index: + input: "metaspades.fna" + output: os.path.join(DATA_DIR, "metaspades/mmi/lr/lr_hybrid.mmi") + conda: "envs/minimap2.yaml" + shell: + """ + date + minimap2 -x map-ont -d {output} {input} + date + """ + +rule map_long_reads_to_long_read_contigs: + input: + query="barcode07.fastq.gz", + index=os.path.join(DATA_DIR, "metaspades/mmi/lr/lr_hybrid.mmi"), + index_fa="metaspades.fna" + output: os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades.bam") + params: + prefix=os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades") + conda: "envs/minimap2.yaml" + threads: config["minimap2"]["threads"] + log: os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades.minimap2_samtools") + shell: """ + (date &&\ + minimap2 -a -t {threads} {input.index} {input.query} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule build_minimap2_long_read_contigs_sr_index: + input: "metaspades.fna" + output: os.path.join(DATA_DIR, "metaspades/mmi/sr/sr_hybrid.mmi") + conda: "envs/minimap2.yaml" + shell: + """ + date + minimap2 -x sr -d {output} {input} + date + """ + +rule map_short_reads_to_long_read_contigs_w_minimap2: + input: + query_r1="NEB2_MG_S17_R1_001.fastp.fq.gz", + query_r2="NEB2_MG_S17_R2_001.fastp.fq.gz", + index=os.path.join(DATA_DIR, "metaspades/mmi/sr/sr_hybrid.mmi"), + index_fa="metaspades.fna" + output: os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades.bam") + params: + prefix=os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades") + conda: "envs/minimap2.yaml" + threads: config["minimap2"]["threads"] + log: os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades.minimap2_samtools") + shell: + """ + (date &&\ + minimap2 -a -t {threads} {input.index} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule merge_bam_files_for_metaspades: + input: + bam_sr=os.path.join(DATA_DIR, "metaspades/{mapper}/sr/metaspades.bam"), + bam_lr=os.path.join(DATA_DIR, "metaspades/{mapper}/lr/metaspades.bam") + output: os.path.join(DATA_DIR, "metaspades/{mapper}/merged_bam/metaspades.bam") + conda: "/home/users/sbusi/apps/environments/binning.yml" + shell: + """ + (date &&\ + samtools merge {output} {input.bam_sr} {input.bam_lr} &&\ + date) + """ + + +##################################### +########### MAXBIN2 ################# +##################################### +rule maxbin2: + input: + bbref="{datadir}/{sample}.fna", + bbam="{datadir}/{sample}/{sample}.bam" + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/binning.yml" + output: + map="{datadir}/{sample}/{sample}.sam", + coverage="{datadir}/{sample}/{sample}_cov.txt", + abund="{datadir}/{sample}/{sample}_abundance.txt", + file="{datadir}/{sample}/maxbin_output/maxbin_output.001.fasta" +# mx=directory("{datadir}/{sample}/maxbin_output/") + shell: + """ + date + samtools view -h -o {output.map} {input.bbam} + pileup.sh in={output.map} out={output.coverage} + awk '{{print $1"\\t"$2}}' {output.coverage} | grep -v '^#' > {output.abund} + run_MaxBin.pl -thread {threads} -contig {input.bbref} -out {wildcards.datadir}/{wildcards.sample}/maxbin_output/maxbin_output -abund {output.abund} + touch maxbin.done + date + """ + + +##################################### +########### METABAT2 ################ +##################################### + +rule depth_files: + input: + dep1="{datadir}/{sample}/{sample}.bam" + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/binning.yml" + output: + depout="{datadir}/{sample}/{sample}_depth.txt" + shell: + """ + date + jgi_summarize_bam_contig_depths --outputDepth {output.depout} {input.dep1} + touch depth_file.done + date + """ + +rule metabat2: + input: + met1="{datadir}/{sample}.fna", + met2="{datadir}/{sample}/{sample}_depth.txt" + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/binning.yml" + output: + outdir=directory("{datadir}/{sample}/metabat_output/") + shell: + """ + date + metabat2 -i {input.met1} -a {input.met2} -o {wildcards.datadir}/{wildcards.sample}/metabat_output/{wildcards.sample}.metabat -t {threads} -m 1500 -v --unbinned + touch metabat2.done + date + """ + +##################################### +########### DAS_Tool ################ +##################################### +rule scaffold_list: + input: + sca1="{datadir}/{sample}/metabat_output/", +# sca2="{datadir}/{sample}/maxbin_output/" + file = "{datadir}/{sample}/maxbin_output/maxbin_output.001.fasta" + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/base.yml" + output: + scout1="{datadir}/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv", + scout2="{datadir}/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv" + shell: + """ + date + Fasta_to_Scaffolds2Bin.sh -i {input.sca1} -e fa > {output.scout1} + Fasta_to_Scaffolds2Bin.sh -i {wildcards.datadir}/{wildcards.sample}/maxbin_output -e fasta > {output.scout2} + date + touch scaffold_list.done + """ + +rule DAS_Tool: + input: + da1="{datadir}/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv", + da2="{datadir}/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv", + da3="{datadir}/{sample}.fna", + db=config["dastool_database"] + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/base.yml" + output: + daout=directory("{datadir}/{sample}/dastool_output/{sample}") + shell: + """ + date + DAS_Tool -i {input.da1},{input.da2} -c {input.da3} -o {output.daout} --search_engine diamond -l maxbin2,metabat2 --write_bins 1 --write_bin_evals 1 --threads {threads} --db_directory {input.db} --create_plots 1 &&\ + 2> >(tee {log}.stderr) > >(tee {log}.stdout) + touch das_tool.done + date + """ +#checkpoint collect_bins: +# input: +# col1="{datadir}/{sample}/dastool_output" +# col1="{datadir}/{sample}/dastool_output/{file_i}.fa" +# output: +# colout="{datadir}/{sample}/dastool_output/high_quality_bins/{file_i}.fa" +# shell: """ +# date +# cp {input.col1} {output.colout} +# date +# """ + +##################################### +######## GTDBTK & CHECKM ############ +##################################### + +rule gtdbtk: + input: + gen="{datadir}/{sample}/dastool_output/{sample}/" + threads:config["threads"] + conda:"gtdbtk.yml" + output: + gtdb=directory("{datadir}/gtdbtk_output/{sample}") + shell:""" + date + gtdbtk classify_wf --cpus {threads} -x fa --genome_dir {input.gen} --out_dir {output.gtdb} + touch gtdbtk.done + date + """ + +rule checkm: + input: + chkm="{datadir}/{sample}/dastool_output/{sample}/" + threads:config["threads"] + conda:"checkm.yml" + output: + chkmdir=directory("{datadir}/checkm_output/{sample}/lineage.ms") + shell: + """ + date + checkm lineage_wf -r -t {threads} -x fa {input.chkm} {output.chkmdir} + touch checkm.done + date + """ + +rule plot_checkm: + input: + pl1="{datadir}/checkm_output/{sample}", + pl2="{datadir}/{sample}/dastool_output/high_quality_bins" + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/checkm.yml" + output: + plt=directory("{datadir}/checkm_output/{sample}/plots") + shell: + """ + date + checkm bin_qa_plot --image_type png --width 8 --dpi 200 --font_size 8 -x fa -q {input.pl1} {input.pl2} {output.plt} + touch plot_checkm.done + date + """ -- GitLab