diff --git a/config/config.yaml b/config/config.yaml index e969a89042eb2154be2741a71ea77ae551f374ab..9a0154e1177afc0020147e0d6b5b9fad99f38481 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,7 +3,7 @@ # Pipeline steps # steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"] -steps: ["preprocessing"] +steps: ["preprocessing", "assembly"] # Analysis sub-steps analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"] @@ -87,6 +87,7 @@ assemblers: sr: ["megahit", "metaspades"] lr: ["flye"] hy: ["metaspadeshybrid"] + # hy: ["metaspadeshybrid", "operams"] flye: threads: 10 @@ -98,6 +99,11 @@ metaspades: megahit: threads: 10 +operams: + threads: 10 + # bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl" + bin: "/home/users/sbusi/apps/miniconda3/envs/operams/OPERA-MS/OPERA-MS.pl" + # Mapping bwa: threads: 10 @@ -113,7 +119,7 @@ samtools: # Polishing medaka: - threads: 10 + threads: 10 # do NOT set to large value (e.g. using 30 did not work) model: r941_min_high # the MinION model, high accuarcy racon: @@ -160,9 +166,6 @@ racon: # threads: 27 # genome_size: "1g" -# operams: -# bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl" -# threads: 28 # megahit: # threads: 28 diff --git a/workflow/Snakefile b/workflow/Snakefile index 3bba6508a478db0e811dbb7db62013c2f22829fe..26fa597a69fa64fab9495ac9318cf60cb11162b3 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -86,16 +86,15 @@ if "preprocessing" in STEPS: "status/preprocessing_sr.done" ] -# # Assembly -# if "assembly" in STEPS: -# include: -# "steps/assembly.smk" -# TARGETS += [ -# "status/assembly.done", -# # "status/polishing.done", -# "status/mapping.done", -# "status/coverage.done" -# ] +# Assembly +if "assembly" in STEPS: + include: + "steps/assembly.smk" + TARGETS += [ + "status/assembly.done", + "status/mapping.done", + "status/coverage.done" + ] # # Assembly annotation # if 'assembly_annotation' in STEPS: diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index 987a384b2730250daa69f9f3b9d0e2dba4a2b119..fd4f2bafbf51ab1eae76771123ed22468569c351 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -3,7 +3,7 @@ # Long reads rule assembly_lr_flye: input: - os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz") + os.path.join(RESULTS_DIR, "preproc/metag/lr/lr.fastq.gz") output: # NOTE: NOT the name for "final" FASTA file which will be created AFTER polishing os.path.join(RESULTS_DIR, "assembly/lr/flye/assembly.fasta") @@ -23,9 +23,9 @@ rule assembly_lr_flye: rule polishing_lr_racon: input: - lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"), + lr=os.path.join(RESULTS_DIR, "preproc/metag/lr/lr.fastq.gz"), asm=os.path.join(RESULTS_DIR, "assembly/lr/{tool}/assembly.fasta"), - bam=os.path.join(RESULTS_DIR, "mapping/lr/{tool}/polishing/assembly_lr.bam") + bam=os.path.join(RESULTS_DIR, "mapping/metag/lr/{tool}/polishing/assembly_lr.bam") output: fasta=os.path.join(RESULTS_DIR, "assembly/lr/{tool}/polishing/racon/assembly.fasta"), # NOTE: Tmp files for input data to pass file extension checks @@ -49,10 +49,10 @@ rule polishing_lr_racon: rule polishing_lr_medaka: input: - lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"), + lr=os.path.join(RESULTS_DIR, "preproc/metag/lr/lr.fastq.gz"), asm=os.path.join(RESULTS_DIR, "assembly/lr/{tool}/polishing/racon/assembly.fasta") output: - os.path.join(RESULTS_DIR, "assembly/lr/{tool}/polishing/racon_medaka/assembly.fasta") + os.path.join(RESULTS_DIR, "assembly/lr/{tool}/polishing/racon_medaka/consensus.fasta") log: out="logs/polishing_lr.{tool}.racon.medaka.out.log", err="logs/polishing_lr.{tool}.racon.medaka.err.log" @@ -72,18 +72,17 @@ localrules: assembly_lr_flye_link rule assembly_lr_flye_link: input: - os.path.join(RESULTS_DIR, "assembly/lr/{tool}/polishing/racon_medaka/assembly.fasta") + os.path.join(RESULTS_DIR, "assembly/lr/{tool}/polishing/racon_medaka/consensus.fasta") output: os.path.join(RESULTS_DIR, "assembly/lr/{tool}/ASSEMBLY.fasta") shell: "cd $(dirname {output}) && ln -sf $(realpath --relative-to=$(basename {output}) {input}) $(basename {output})" - # Short reads rule assembly_sr_megahit: input: - r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"), - r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz") + r1=os.path.join(RESULTS_DIR, "preproc/metag/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/metag/sr/R2.fastp.fastq.gz") output: os.path.join(RESULTS_DIR, "assembly/sr/megahit/ASSEMBLY.fasta") log: @@ -104,8 +103,8 @@ rule assembly_sr_megahit: rule assembly_sr_metaspades: input: - r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"), - r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz") + r1=os.path.join(RESULTS_DIR, "preproc/metag/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/metag/sr/R2.fastp.fastq.gz") output: os.path.join(RESULTS_DIR, "assembly/sr/metaspades/ASSEMBLY.fasta") log: @@ -124,9 +123,9 @@ rule assembly_sr_metaspades: # Hybrid rule assemble_hy_metaspades: input: - lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"), - r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"), - r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz") + lr=os.path.join(RESULTS_DIR, "preproc/metag/lr/lr.fastq.gz"), + r1=os.path.join(RESULTS_DIR, "preproc/metag/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/metag/sr/R2.fastp.fastq.gz") output: os.path.join(RESULTS_DIR, "assembly/hy/metaspadeshybrid/ASSEMBLY.fasta") log: diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 5ece1d00d89f4a300f1f6e36e9489fb4c4d3b4a8..26e8f7e4a6a46a842d79850b9fcee4f22cd7ec7f 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -7,8 +7,8 @@ rule bwa_index_assembly_polishing: output: temp(expand(os.path.join(RESULTS_DIR, "assembly/lr/{{tool}}/polishing/assembly.{ext}"), ext=BWA_IDX_EXT)) log: - out="logs/mapping_bwa_idx_lr.{tool}.polishing.out.log", - err="logs/mapping_bwa_idx_lr.{tool}.polishing.err.log" + out="logs/bwa_index.polishing.{tool}.out.log", + err="logs/bwa_index.polishing.{tool}.err.log" wildcard_constraints: tool="|".join(config["assemblers"]["lr"]) threads: 1 @@ -21,17 +21,14 @@ rule bwa_index_assembly_polishing: rule bwa_mem_assembly_polishing: input: - # reads - lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"), - # assembly + lr=os.path.join(RESULTS_DIR, "preproc/metag/lr/lr.fastq.gz"), asm=os.path.join(RESULTS_DIR, "assembly/lr/{tool}/assembly.fasta"), - # assembly index idx=expand(os.path.join(RESULTS_DIR, "assembly/lr/{{tool}}/polishing/assembly.{ext}"), ext=BWA_IDX_EXT) output: - temp(os.path.join(RESULTS_DIR, "mapping/lr/{tool}/polishing/assembly_lr.bam")) + temp(os.path.join(RESULTS_DIR, "mapping/metag/lr/{tool}/polishing/assembly_lr.bam")) log: - out="logs/mapping_bwa_mem_lr.{tool}.lr.polishing.out.log", - err="logs/mapping_bwa_mem_lr.{tool}.lr.polishing.err.log" + out="logs/bwa_mem.polishing.metag.lr.{tool}.out.log", + err="logs/bwa_mem.polishing.metag.lr.{tool}.err.log" wildcard_constraints: tool="|".join(config["assemblers"]["lr"]) params: @@ -56,11 +53,11 @@ rule bwa_index_assembly: output: expand(os.path.join(RESULTS_DIR, "assembly/{{rtype}}/{{tool}}/ASSEMBLY.{ext}"), ext=BWA_IDX_EXT) log: - out="logs/mapping_bwa_idx_{rtype}.{tool}.out.log", - err="logs/mapping_bwa_idx_{rtype}.{tool}.err.log" + out="logs/bwa_index.{rtype}.{tool}.out.log", + err="logs/bwa_index.{rtype}.{tool}.err.log" wildcard_constraints: - rtype="|".join(config["assemblers"].keys()), - tool="|".join(["|".join(a) for a in config["assemblers"].values()]) + rtype="|".join(READ_TYPES), + tool="|".join(ASSEMBLERS) threads: 1 params: idx_prefix=lambda wildcards, output: os.path.splitext(output[0])[0] @@ -73,20 +70,21 @@ rule bwa_index_assembly: rule bwa_mem_assembly_sr: input: # reads - r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"), - r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz"), + r1=os.path.join(RESULTS_DIR, "preproc/{mtype}/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/{mtype}/sr/R2.fastp.fastq.gz"), # assembly asm=os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.fasta"), # assembly index idx=expand(os.path.join(RESULTS_DIR, "assembly/{{rtype}}/{{tool}}/ASSEMBLY.{ext}"), ext=BWA_IDX_EXT) output: - os.path.join(RESULTS_DIR, "mapping/{rtype}/{tool}/ASSEMBLY_sr.bam") + os.path.join(RESULTS_DIR, "mapping/{mtype}/{rtype}/{tool}/ASSEMBLY_sr.bam") log: - out="logs/mapping_bwa_mem_{rtype}.{tool}.sr.out.log", - err="logs/mapping_bwa_mem_{rtype}.{tool}.sr.err.log" + out="logs/bwa_mem.{mtype}.{rtype}.{tool}.sr.out.log", + err="logs/bwa_mem.{mtype}.{rtype}.{tool}.sr.err.log" wildcard_constraints: - rtype="|".join(["sr", "hy"]), - tool="|".join(config["assemblers"]["sr"] + config["assemblers"]["hy"]) + mtype="|".join(META_TYPES), + rtype="|".join(READ_TYPES), + tool="|".join(ASSEMBLERS) params: idx_prefix=lambda wildcards, input: os.path.splitext(input.idx[0])[0], bam_prefix=lambda wildcards, output: os.path.splitext(output[0])[0] @@ -105,16 +103,16 @@ rule bwa_mem_assembly_sr: rule bwa_mem_assembly_lr: input: # reads - lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"), + lr=os.path.join(RESULTS_DIR, "preproc/metag/lr/lr.fastq.gz"), # assembly asm=os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.fasta"), # assembly index idx=expand(os.path.join(RESULTS_DIR, "assembly/{{rtype}}/{{tool}}/ASSEMBLY.{ext}"), ext=BWA_IDX_EXT) output: - os.path.join(RESULTS_DIR, "mapping/{rtype}/{tool}/ASSEMBLY_lr.bam") + os.path.join(RESULTS_DIR, "mapping/metag/{rtype}/{tool}/ASSEMBLY_lr.bam") log: - out="logs/mapping_bwa_mem_{rtype}.{tool}.lr.out.log", - err="logs/mapping_bwa_mem_{rtype}.{tool}.lr.err.log" + out="logs/bwa_mem.metag.{rtype}.{tool}.lr.out.log", + err="logs/bwa_mem.metag.{rtype}.{tool}.lr.err.log" wildcard_constraints: rtype="|".join(["lr", "hy"]), tool="|".join(config["assemblers"]["lr"] + config["assemblers"]["hy"]) @@ -135,13 +133,13 @@ rule bwa_mem_assembly_lr: # Hybrid: short + long reads rule bwa_mem_assembly_hy: input: - sr=os.path.join(RESULTS_DIR, "mapping/hy/{tool}/ASSEMBLY_sr.bam"), - lr=os.path.join(RESULTS_DIR, "mapping/hy/{tool}/ASSEMBLY_lr.bam") + sr=os.path.join(RESULTS_DIR, "mapping/metag/hy/{tool}/ASSEMBLY_sr.bam"), + lr=os.path.join(RESULTS_DIR, "mapping/metag/hy/{tool}/ASSEMBLY_lr.bam") output: - os.path.join(RESULTS_DIR, "mapping/hy/{tool}/ASSEMBLY_hy.bam") + os.path.join(RESULTS_DIR, "mapping/metag/hy/{tool}/ASSEMBLY_hy.bam") log: - out="logs/mapping_bwa_mem_hy.{tool}.hy.out.log", - err="logs/mapping_bwa_mem_hy.{tool}.hy.err.log" + out="logs/bwa_mem.metag.hy.{tool}.hy.out.log", + err="logs/bwa_mem.metag.hy.{tool}.hy.err.log" wildcard_constraints: tool="|".join(config["assemblers"]["hy"]) threads: 1 @@ -154,18 +152,17 @@ rule bwa_mem_assembly_hy: # Compute the genome coverage histogram - https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html rule assembly_genomecov: input: - os.path.join(RESULTS_DIR, "mapping/{rtype}/{tool}/ASSEMBLY_{rtype}.bam") + os.path.join(RESULTS_DIR, "mapping/{mtype}/{rtype1}/{tool}/ASSEMBLY_{rtype2}.bam") output: - os.path.join(RESULTS_DIR, "mapping/{rtype}/{tool}/ASSEMBLY_{rtype}.cov.txt") + os.path.join(RESULTS_DIR, "mapping/{mtype}/{rtype1}/{tool}/ASSEMBLY_{rtype2}.cov.txt") log: - out="logs/mapping_cov_{rtype}.{tool}.{rtype}.out.log", - err="logs/mapping_cov_{rtype}.{tool}.{rtype}.err.log" + out="logs/bedtools_genomecov.{mtype}.{rtype1}.{tool}.{rtype2}.out.log", + err="logs/bedtools_genomecov.{mtype}.{rtype1}.{tool}.{rtype2}.err.log" wildcard_constraints: - rtype="|".join(config["assemblers"].keys()), - tool="|".join([ - "|".join(config["assemblers"]["sr"] + config["assemblers"]["hy"]), - "|".join(["%s(_.+)?" % a for a in config["assemblers"]["lr"]]) - ]) + mtype="|".join(META_TYPES), + rtype1="|".join(READ_TYPES), + rtype2="|".join(READ_TYPES), + tool="|".join(ASSEMBLERS) threads: 1 conda: os.path.join(ENV_DIR, "bedtools.yaml") @@ -174,18 +171,17 @@ rule assembly_genomecov: rule assembly_genomecov_average: input: - os.path.join(RESULTS_DIR, "mapping/{rtype}/{tool}/ASSEMBLY_{rtype}.cov.txt") + os.path.join(RESULTS_DIR, "mapping/{mtype}/{rtype1}/{tool}/ASSEMBLY_{rtype2}.cov.txt") output: - os.path.join(RESULTS_DIR, "mapping/{rtype}/{tool}/ASSEMBLY_{rtype}.cov.ave.txt") + os.path.join(RESULTS_DIR, "mapping/{mtype}/{rtype1}/{tool}/ASSEMBLY_{rtype2}.cov.ave.txt") log: - out="logs/mapping_cov_{rtype}.{tool}.{rtype}.out.log", - err="logs/mapping_cov_{rtype}.{tool}.{rtype}.err.log" + out="logs/ave_genomecov.{mtype}.{rtype1}.{tool}.{rtype2}.out.log", + err="logs/ave_genomecov.{mtype}.{rtype1}.{tool}.{rtype2}.err.log" wildcard_constraints: - rtype="|".join(config["assemblers"].keys()), - tool="|".join([ - "|".join(config["assemblers"]["sr"] + config["assemblers"]["hy"]), - "|".join(["%s(_.+)?" % a for a in config["assemblers"]["lr"]]) - ]) + mtype="|".join(META_TYPES), + rtype1="|".join(READ_TYPES), + rtype2="|".join(READ_TYPES), + tool="|".join(ASSEMBLERS) params: script=os.path.join(SRC_DIR, "coverage.awk") threads: 1 diff --git a/workflow/steps/assembly.smk b/workflow/steps/assembly.smk index 354e28ffc74b0e50cec54ca7f3bd89db8883464e..e718d6383ad257d2dcf930cc3ce608091e5eb741 100644 --- a/workflow/steps/assembly.smk +++ b/workflow/steps/assembly.smk @@ -22,9 +22,12 @@ rule ASSEMBLY: rule MAPPING: input: - lr=expand(os.path.join(RESULTS_DIR, "mapping/lr/{tool}/ASSEMBLY_lr.bam"), tool=config["assemblers"]["lr"]), - sr=expand(os.path.join(RESULTS_DIR, "mapping/sr/{tool}/ASSEMBLY_sr.bam"), tool=config["assemblers"]["sr"]), - hy=expand(os.path.join(RESULTS_DIR, "mapping/hy/{tool}/ASSEMBLY_hy.bam"), tool=config["assemblers"]["hy"]), + metag_lr=expand(os.path.join(RESULTS_DIR, "mapping/metag/lr/{tool}/ASSEMBLY_lr.bam"), tool=config["assemblers"]["lr"]), + metag_sr=expand(os.path.join(RESULTS_DIR, "mapping/metag/sr/{tool}/ASSEMBLY_sr.bam"), tool=config["assemblers"]["sr"]), + metag_hy=expand(os.path.join(RESULTS_DIR, "mapping/metag/hy/{tool}/ASSEMBLY_hy.bam"), tool=config["assemblers"]["hy"]), + metat_lr=expand(os.path.join(RESULTS_DIR, "mapping/metat/lr/{tool}/ASSEMBLY_sr.bam"), tool=config["assemblers"]["lr"]), + metat_sr=expand(os.path.join(RESULTS_DIR, "mapping/metat/sr/{tool}/ASSEMBLY_sr.bam"), tool=config["assemblers"]["sr"]), + metat_hy=expand(os.path.join(RESULTS_DIR, "mapping/metat/hy/{tool}/ASSEMBLY_sr.bam"), tool=config["assemblers"]["hy"]) output: "status/mapping.done" shell: @@ -32,9 +35,12 @@ rule MAPPING: rule COVERAGE: input: - lr=expand(os.path.join(RESULTS_DIR, "mapping/lr/{tool}/ASSEMBLY_lr.cov.ave.txt"), tool=config["assemblers"]["lr"]), - sr=expand(os.path.join(RESULTS_DIR, "mapping/sr/{tool}/ASSEMBLY_sr.cov.ave.txt"), tool=config["assemblers"]["sr"]), - hy=expand(os.path.join(RESULTS_DIR, "mapping/hy/{tool}/ASSEMBLY_hy.cov.ave.txt"), tool=config["assemblers"]["hy"]) + metag_lr=expand(os.path.join(RESULTS_DIR, "mapping/metag/lr/{tool}/ASSEMBLY_lr.cov.ave.txt"), tool=config["assemblers"]["lr"]), + metag_sr=expand(os.path.join(RESULTS_DIR, "mapping/metag/sr/{tool}/ASSEMBLY_sr.cov.ave.txt"), tool=config["assemblers"]["sr"]), + metag_hy=expand(os.path.join(RESULTS_DIR, "mapping/metag/hy/{tool}/ASSEMBLY_hy.cov.ave.txt"), tool=config["assemblers"]["hy"]), + metat_lr=expand(os.path.join(RESULTS_DIR, "mapping/metat/lr/{tool}/ASSEMBLY_sr.cov.ave.txt"), tool=config["assemblers"]["lr"]), + metat_sr=expand(os.path.join(RESULTS_DIR, "mapping/metat/sr/{tool}/ASSEMBLY_sr.cov.ave.txt"), tool=config["assemblers"]["sr"]), + metat_hy=expand(os.path.join(RESULTS_DIR, "mapping/metat/hy/{tool}/ASSEMBLY_sr.cov.ave.txt"), tool=config["assemblers"]["hy"]) output: "status/coverage.done" shell: