added rules for diamond analyses

17bb9496 · Susheel Busi · 802dba2d · 17bb9496 · 17bb9496 · 17bb9496
Commit 17bb9496 authored 4 years ago by Susheel Busi
--- a/config/Zymo/config.zymo.yaml
+++ b/config/Zymo/config.zymo.yaml
+# Main requirements
 work_dir: "/scratch/users/sbusi/zymo"
 results_dir: "results"
+data_dir: "/scratch/users/sbusi/zymo/Zymo/results"

+# Assembly
+# List of assemblers for different read types: assembler names MUST be UNIQUE
+assemblers:
+    sr: ["megahit", "metaspades"]
+    lr: ["flye", "canu"]
+    hy: ["metaspadeshybrid", "operamsmegahit", "operamsmetaspades"]
+
+
+# Additional params   
 single_fast5_dir: "data_single_fast5"
 multi_fast5_dir: "data_multi_fast5"
 ref_genomes_dir: "data_ref_genomes"

+# Annotation tools
+prodigal:
+    threads: 12
+    
 diamond:
    threads: 20
\ No newline at end of file
--- a/config/Zymo/sbatch.zymo.sh
+++ b/config/Zymo/sbatch.zymo.sh
@@ -36,7 +36,12 @@ ONTP_CLUSTER="{cluster.call} -t {cluster.runtime} -n {cluster.threads} -p {clust
 # activate the env
 conda activate ${ONTP_ENV}

-# run the pipeline
+# unlock the pipeline
 snakemake -s ${ONTP_SMK} -rp --jobs 10 --local-cores 1 \
+ --configfile ${ONTP_CONFIG} --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline \
+ --cluster-config ${ONTP_SLURM} --cluster "${ONTP_CLUSTER}" --unlock 
+ 
+# run the pipeline
+snakemake -s ${ONTP_SMK} -rpn --jobs 10 --local-cores 1 \
 --configfile ${ONTP_CONFIG} --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline \
 --cluster-config ${ONTP_SLURM} --cluster "${ONTP_CLUSTER}"
--- a/config/Zymo/slurm.zymo.yaml
+++ b/config/Zymo/slurm.zymo.yaml
@@ -17,12 +17,17 @@ concatenate:
  quality: "normal"
  runtime: "0-00:05:00"

-annotation_prodigal:
+zymo_prodigal:
  partition: "batch"
  qos: "normal"
-  runtime: "00-2:00:00"
+  runtime: "00-1:00:00"

-annotation_diamond:
+diamond_db:
+  partition: "batch"
+  quality: "normal"
+  runtime: "00-00:30:00"
+
+analysis_diamond:
  partition: "batch"
  quality: "normal"
  runtime: "00-2:00:00"
\ No newline at end of file
--- a/workflow_zymo/Snakefile
+++ b/workflow_zymo/Snakefile
@@ -2,6 +2,7 @@
 # Python modules
 ##################################################
 import os
+import re

 ##############################
 # CONFIG
@@ -29,6 +30,26 @@ SINGLE_FAST5_DIR = os.path.abspath(config["single_fast5_dir"])
 MULTI_FAST5_DIR  = os.path.abspath(config["multi_fast5_dir"])
 REF_GENOMES_DIR = os.path.abspath(config["ref_genomes_dir"])
 RESULTS_DIR = config["results_dir"]
+DATA_DIR = config["data_dir"]
+
+##################################################
+# FUNCTIONS
+##################################################
+def assembler_pairs(assemblers):
+    """
+    Create all possible combinations of two assemblers
+    """
+    from itertools import combinations
+    return list(combinations(assemblers, 2))
+
+##################################################
+# ADDITIONAL PARAMETERS
+##################################################
+READ_TYPES      = list(config["assemblers"].keys()) # list of read types
+ASSEMBLERS      = [y for x in config["assemblers"].values() for y in x] # list of all assemblers
+READ_ASSEMBLERS = [y for x in [[(k, vv) for vv in v] for k, v in config["assemblers"].items()] for y in x] # list of (read type, assembler)
+READ_ASSEMBLER_PAIRS = assembler_pairs(READ_ASSEMBLERS)
+
 ##################################################
 # RULES
 ##################################################
@@ -36,22 +57,20 @@ RESULTS_DIR = config["results_dir"]
 rule all:
    input:
        REF_GENOMES_DIR,
-        expand(os.path.join(RESULTS_DIR,"reference/mock_genomes.fasta"))
+        expand(os.path.join(RESULTS_DIR,"reference/mock_genomes.fasta")),
+        expand(os.path.join(RESULTS_DIR,"annotation/zymo_prodigal/proteins.faa")),
+        expand(os.path.join(RESULTS_DIR,"annotation/diamond/zymo.dmnd")),
+        expand(os.path.join(RESULTS_DIR, "analysis/diamond/{rtype_tool}.{ext}"), 
+            rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS],
+            ext=["daa", "tsv"]) 
+

 rule download_genomes:
-#    input:
-#        SINGLE_FAST5_DIR
    output:
        directory(REF_GENOMES_DIR)
    log:
        out="logs/ref_genomes.out.log",
        err="logs/ref_genomes.err.log"
-#    threads:
-#        config["ont_fast5_api"]["single_to_multi_fast5"]["threads"]
-#    params:
-#        batch_size=config["ont_fast5_api"]["single_to_multi_fast5"]["batch_size"]
-#    conda:
-#        os.path.join(ENV_DIR, "ont_fast5_api.yaml")
    shell:
       "(date && "
       "wget https://github.com/al-mcintyre/mCaller_analysis_scripts/raw/master/assemblies/bsubtilis_pb.fasta -P {output} && "
@@ -79,4 +98,65 @@ rule concatenate:
    shell:
        "(date && "
        "cat {input}/*.fasta > {output} && "
-        "date) 2> {log.err} > {log.out}"
\ No newline at end of file
+        "date) 2> {log.err} > {log.out}"
+
+rule zymo_prodigal:
+    input:
+        rules.concatenate.output
+    output:
+        os.path.join(RESULTS_DIR,"annotation/zymo_prodigal/proteins.faa")
+    log:
+        out="logs/zymo_prodigal.out.log",
+        err="logs/zymo_prodigal.err.log"
+    threads:
+        config["prodigal"]["threads"]
+    conda:
+        os.path.join(ENV_DIR, "prodigal.yaml")
+    shell:
+       "(date && "
+       "prodigal -a {output} -p meta -i {input} && "
+       "date) 2> {log.err} > {log.out}"
+
+rule diamond_db:
+    input:
+        rules.zymo_prodigal.output
+    output:
+        os.path.join(RESULTS_DIR,"annotation/diamond/zymo.dmnd")
+    log:
+        out="logs/diamond_db.out.log",
+        err="logs/diamond_db.err.log"
+    threads:
+        config["diamond"]["threads"]
+    conda:
+        os.path.join(ENV_DIR, "diamond.yaml")
+    shell:
+       "(date && "
+       "diamond makedb --in {input} -d $(echo {output} | sed 's/.dmnd$//') && "
+       "date) 2> {log.err} > {log.out}"
+
+rule analysis_diamond:
+    input: 
+        faa=os.path.join(DATA_DIR, "annotation/prodigal/{rtype}/{tool}/proteins.faa"),
+        db=rules.diamond_db.output
+    output:
+        daa=os.path.join(RESULTS_DIR, "analysis/diamond/{rtype}_{tool}.daa"),
+        tsv=os.path.join(RESULTS_DIR, "analysis/diamond/{rtype}_{tool}.tsv")
+    log:
+        "logs/analysis.diamond.{rtype}.{tool}.log",
+    wildcard_constraints:
+        rtype="|".join(READ_TYPES),
+        tool="|".join(ASSEMBLERS)
+    threads:
+        config["diamond"]["threads"]
+    params:
+        outfmt="6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
+    conda:
+        os.path.join(ENV_DIR, "diamond.yaml")
+    message:
+        "Analysis: DIAMOND search: {input}"
+    shell: 
+        "(date && "
+        "daa={output.daa} && "
+        "diamond blastp -q {input.faa} --db {input.db} --out {output.daa} -p {threads} --outfmt 100 && "
+        "diamond view --daa ${{daa%.*}} --max-target-seqs 1 -p {threads} --outfmt {params.outfmt} --out {output.tsv} && "
+        "date) &> {log}"