Updated/added messages to rules; the RGI DB is now loaded in a separate rule

10e83e92 · Valentina Galata · ecd585c8 · 10e83e92 · 10e83e92 · 10e83e92
Commit 10e83e92 authored 4 years ago by Valentina Galata
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -18,7 +18,7 @@ rule annotation_prodigal:
    conda:
        os.path.join(ENV_DIR, "prodigal.yaml")
    message:
-        "Call genes w/ Prodigal: {input}"
+        "Annotation: call genes w/ Prodigal"
    shell:
        "(date && prodigal -a {output} -p meta -i {input} && date) 2> {log.err} > {log.out}"

@@ -42,7 +42,7 @@ rule annotation_diamond_lr_daa:
    conda:
        os.path.join(ENV_DIR, "diamond.yaml")
    message:
-        "DIAMOND: blastp proteins {input.faa} to {input.db}"
+        "Annotation: protein search w/ DIAMOND in long reads"
    shell: 
        "(date && diamond blastx -q {input.reads} --db {input.db} --out {output} -p {threads} --long-reads --sensitive --outfmt 100 && date) 2> {log.err} > {log.out}"

@@ -65,7 +65,7 @@ rule annotation_diamond_daa:
    conda:
        os.path.join(ENV_DIR, "diamond.yaml")
    message:
-        "DIAMOND: blastp proteins {input.faa} to {input.db}"
+        "Annotation: protein search w/ DIAMOND"
    shell: 
        "(date && diamond blastp -q {input.faa} --db {input.db} --out {output} -p {threads} --outfmt 100 && date) 2> {log.err} > {log.out}"

@@ -87,7 +87,7 @@ rule annotation_diamond_tsv:
    conda:
        os.path.join(ENV_DIR, "diamond.yaml")
    message:
-        "DIAMOND: reformat {input} to {output}"
+        "Annotation: reformat DIAMOND output"
    shell:
        "(date && "
        "diamond view --daa {params.ibname} --max-target-seqs 1 --outfmt {params.outfmt} --out {output} && "
@@ -105,7 +105,7 @@ rule annotation_rgi_input:
        rtype="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    message:
-        "RGI: create input FAA: {input}"
+        "Annotation: RGI input"
    shell:
        # NOTE: remove stop codon symbol "*"
        "sed 's/\*$//' {input} > {output}"
@@ -114,7 +114,9 @@ rule annotation_rgi_input:
 rule annotation_rgi:
    input:
        faa=os.path.join(RESULTS_DIR, "annotation/rgi/{rtype}/{tool}/input.faa"),
-        db=os.path.join(DB_DIR, "rgi/card.json")
+        db=os.path.join(DB_DIR, "rgi/card.json"),
+        # NOTE: to make sure that the same DB is used for all targets
+        setup="status/rgi_setup.done"
    output:
        os.path.join(RESULTS_DIR, "annotation/rgi/{rtype}/{tool}/rgi.txt")
    log:
@@ -131,12 +133,9 @@ rule annotation_rgi:
    conda:
        os.path.join(ENV_DIR, "rgi.yaml")
    message:
-        "RGI: AMR prediction: {input}"
+        "Annotation: AMR prediction w/ RGI"
    shell:
        "(date && "
-        # NOTE: to make sure that the correct DB is used
-        "rgi clean --local && "
-        "rgi load --card_json {input.db} --local && "
        "rgi database --version --local && "
        # NOTE: https://github.com/arpcard/rgi/issues/93: KeyError: 'snp'
        #       need to run the CMD twice
@@ -163,10 +162,10 @@ rule annotation_casc:
        config["casc"]["threads"]
    conda:
        os.path.join(ENV_DIR, "casc.yaml")
+    message:
+        "Annotation: CRISPR detection w/ CASC"
    shell:
        "(date && "
-        # "export PATH={config[casc][path]} && "
-        # "export PERL5LIB={config[casc][perl5lib]} && "
        "export PATH=$PATH:$(dirname {input.bin}) && "
        "casc -i {input.asm} -o $(dirname {output}) -n {threads} --conservative && "
        "date) 2> {log.err} > {log.out}"
@@ -186,9 +185,10 @@ rule annotation_minced:
        tool="|".join(ASSEMBLERS)
    conda:
        os.path.join(ENV_DIR, "minced.yaml")
+    message:
+        "Annotation: CRISPR detection w/ MinCED"
    shell:
        "(date && "
-        # "export PATH={config[minced][path]} && "
        "export PATH=$PATH:$(dirname {input.jar}) && "
        "minced {input.asm} {output.txt} {output.gff} && "
        "date) 2> {log.err} > {log.out}"
@@ -211,7 +211,7 @@ rule annotation_plasflow_input:
        script=os.path.join(SRC_DIR, "filter_fasta_by_length.pl"),
        minlen=config["plasflow"]["minlen"]
    message:
-        "PlasFlow: create input FASTA: {input}"
+        "Annotation: PlasFlow input"
    shell:
        "{params.script} {params.minlen} {input} > {output} 2> {log}"

@@ -233,7 +233,7 @@ rule annotation_plasflow:
    conda:
        os.path.join(ENV_DIR, "plasflow.yaml")
    message:
-        "PlasFlow: predict: {input}"
+        "Annotation: plasmid prediction w/ PlasFlow"
    shell:
        "(date && "
        "PlasFlow.py --input {input} --output {output.tmp} --threshold {params.threshold} && "

--- a/workflow/rules/assembly.smk
+++ b/workflow/rules/assembly.smk
@@ -39,6 +39,8 @@ rule polishing_lr_racon:
        config["racon"]["threads"]
    conda:
        os.path.join(ENV_DIR, "racon.yaml")
+    message:
+        "Assembly: long reads: polishing w/ Racon"
    shell:
        "(date && "
        "samtools view -h {input.bam} > {output.sam} && "
@@ -61,6 +63,8 @@ rule polishing_lr_medaka:
        config["medaka"]["threads"]
    conda:
        os.path.join(ENV_DIR, "medaka.yaml")
+    message:
+        "Assembly: long reads: polishing w/ Medaka"
    shell:
        "(date && "
        "medaka_consensus -i {input.lr} -d {input.asm} -o $(dirname {output}) -t {threads} -m {config[medaka][model]} && "

--- a/workflow/rules/mapping.smk
+++ b/workflow/rules/mapping.smk
@@ -18,6 +18,8 @@ rule mapping_bwa_idx_polishing:
        idx_prefix=lambda wildcards, output: os.path.splitext(output[0])[0]
    conda:
        os.path.join(ENV_DIR, "bwa.yaml")
+    message:
+        "Mapping: BWA index for assembly polishing"
    shell:
        "(date && bwa index {input} -p {params.idx_prefix} && date) 2> {log.err} > {log.out}"

@@ -33,13 +35,15 @@ rule mapping_bwa_mem_polishing:
        err="logs/bwa_mem.polishing.metag.lr.{tool}.err.log"
    wildcard_constraints:
        tool="|".join(config["assemblers"]["lr"])
+    threads:
+        config["bwa"]["threads"]
    params:
        idx_prefix=lambda wildcards, input: os.path.splitext(input.idx[0])[0],
        bam_prefix=lambda wildcards, output: os.path.splitext(output[0])[0]
    conda:
        os.path.join(ENV_DIR, "bwa.yaml")
-    threads:
-        config["bwa"]["threads"]
+    message:
+        "Mapping long reads to assembly w/ BWA for polishing"
    shell:
        "(date && "
        "bwa mem -x ont2d -t {threads} {params.idx_prefix} {input.lr} | "
@@ -67,6 +71,8 @@ rule mapping_bwa_idx_assembly:
        idx_prefix=lambda wildcards, output: os.path.splitext(output[0])[0]
    conda:
        os.path.join(ENV_DIR, "bwa.yaml")
+    message:
+        "Mapping: BWA index for assembly mapping"
    shell:
        "(date && bwa index {input} -p {params.idx_prefix} && date) 2> {log.err} > {log.out}"

@@ -89,13 +95,15 @@ rule mapping_bwa_mem_assembly_sr:
        mtype="|".join(META_TYPES),
        rtype="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
+    threads:
+        config["bwa"]["threads"]
    params:
        idx_prefix=lambda wildcards, input: os.path.splitext(input.idx[0])[0],
        bam_prefix=lambda wildcards, output: os.path.splitext(output[0])[0]
    conda:
        os.path.join(ENV_DIR, "bwa.yaml")
-    threads:
-        config["bwa"]["threads"]
+    message:
+        "Mapping short reads to assembly w/ BWA"
    shell:
        "(date && "
        "bwa mem -t {threads} {params.idx_prefix} {input.r1} {input.r2} | "
@@ -120,13 +128,15 @@ rule mapping_bwa_mem_assembly_lr:
    wildcard_constraints:
        rtype="|".join(["lr", "hy"]),
        tool="|".join(config["assemblers"]["lr"] + config["assemblers"]["hy"])
+    threads:
+        config["bwa"]["threads"]
    params:
        idx_prefix=lambda wildcards, input: os.path.splitext(input.idx[0])[0],
        bam_prefix=lambda wildcards, output: os.path.splitext(output[0])[0]
    conda:
        os.path.join(ENV_DIR, "bwa.yaml")
-    threads:
-        config["bwa"]["threads"]
+    message:
+        "Mapping long reads to assembly w/ BWA"
    shell:
        "(date && "
        "bwa mem -x ont2d -t {threads} {params.idx_prefix} {input.lr} | "
@@ -149,6 +159,8 @@ rule mapping_bwa_mem_assembly_hy:
    threads: 1
    conda:
        os.path.join(ENV_DIR, "bwa.yaml")
+    message:
+        "Mapping: merging short-reads and long-reads mapping results"
    shell:
        "(date && samtools merge {output} {input.sr} {input.lr} && date) 2> {log.err} > {log.out}"

@@ -172,6 +184,8 @@ rule mapping_assembly_genomecov:
    threads: 1
    conda:
        os.path.join(ENV_DIR, "bedtools.yaml")
+    message:
+        "Mapping: compute assembly coverage"
    shell:
        "(date && bedtools genomecov -ibam {input} > {output} && date) 2> {log.err} > {log.out}"

@@ -188,9 +202,11 @@ rule mapping_assembly_genomecov_average:
        rtype1="|".join(READ_TYPES),
        rtype2="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
+    threads: 1
    params:
        script=os.path.join(SRC_DIR, "coverage.awk")
-    threads: 1
+    message:
+        "Mapping: compute average assembly coverage"
    shell:
        "(date && cat {input} | awk -f {params.script} | tail -n+2 > {output} && date) 2> {log.err} > {log.out}"

@@ -212,10 +228,9 @@ rule mapping_assembly_flagstat:
        rtype2="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    conda:
-        # os.path.join(ENV_DIR, "analysis.yaml")
        os.path.join(ENV_DIR, "bwa.yaml")
    message:
-        "Assembly mapping: samtools flagstat: {input}"
+        "Mapping: assembly coverage stats w/ samtools flagstat"
    shell:
        "(date && samtools flagstat {input} > {output} && date) 2> {log.err} > {log.out}"

@@ -234,10 +249,9 @@ rule mapping_assembly_idxstats:
        rtype2="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    conda:
-        # os.path.join(ENV_DIR, "analysis.yaml")
        os.path.join(ENV_DIR, "bwa.yaml")
    message:
-        "Assembly mapping: samtools idxstats: {input}"
+        "Mapping: assembly coverage stats w/ samtools idxstats"
    shell:
        "(date && samtools idxstats {input} > {output} && date) 2> {log.err} > {log.out}"

@@ -256,8 +270,7 @@ rule mapping_assembly_uniq:
        rtype2="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    conda:
-        # os.path.join(ENV_DIR, "analysis.yaml")
        os.path.join(ENV_DIR, "bwa.yaml")
    message:
-        "Assembly mapping: unique: {input}"
+        "Mapping: assembly coverage stats: number of uniquely mapped reads"
    shell:

--- a/workflow/rules/setup.smk
+++ b/workflow/rules/setup.smk
@@ -9,6 +9,8 @@ rule install_casc:
        path=os.path.join(MOD_DIR, "casc")
    conda:
        os.path.join(ENV_DIR, "casc.yaml")
+    message:
+        "Setup: install CASC"
    shell:
        "(cd {params.path} && "
        "perl Makefile.PL PREFIX=\"$(realpath .)\" && "
@@ -16,7 +18,7 @@ rule install_casc:
        "make test && "
        "make install) 2> {log.err} > {log.out}"

-# minced
+# MinCED
 rule install_minced:
    output:
        os.path.join(MOD_DIR, "minced/minced.jar")
@@ -27,13 +29,15 @@ rule install_minced:
        path=os.path.join(MOD_DIR, "minced")
    conda:
        os.path.join(ENV_DIR, "minced.yaml")
+    message:
+        "Setup: install MinCED"
    shell:
        "(cd {params.path} && "
        "make && "
        "make test) 2> {log.err} > {log.out}"

-# RGI DBs
-rule annotation_rgi_db:
+# Download RGI data
+rule download_rgi_db:
    output:
        archive=temp(os.path.join(DB_DIR, "rgi/card-data.tar.bz2")),
        json=os.path.join(DB_DIR, "rgi/card.json")
@@ -43,9 +47,28 @@ rule annotation_rgi_db:
    params:
        db_url=config["rgi"]["db_url"]
    message:
-        "RGI: Download DB data"
+        "Setup: download RGI data"
    shell:
        "(date && "
        "wget -O {output.archive} {params.db_url} --no-check-certificate && "
        "tar -C $(dirname {output.archive}) -xvf {output.archive} && "
        "date) 2> {log.err} > {log.out}"
+
+# Setup RGI: load the required DB once, so that the same DB is used for all targets
+# NOTE: the output is a flag file; touch() lets Snakemake create it once the shell CMD succeeded
+rule setup_rgi_db:
+    input:
+        os.path.join(DB_DIR, "rgi/card.json")
+    output:
+        touch("status/rgi_setup.done")
+    log:
+        out="logs/rgi_setup.out.log",
+        err="logs/rgi_setup.err.log"
+    conda:
+        os.path.join(ENV_DIR, "rgi.yaml")
+    message:
+        "Setup: load RGI DB"
+    shell:
+        "(rgi clean --local && "
+        "rgi load --card_json {input} --local && "
+        "rgi database --version --local) 2> {log.err} > {log.out}"