config: updated GDB and Zymo

778fc6a9 · Valentina Galata · f54b9e60 · 778fc6a9 · 778fc6a9 · 778fc6a9
Commit 778fc6a9 authored 4 years ago by Valentina Galata
--- a/config/GDB/config.yaml
+++ b/config/GDB/config.yaml
@@ -3,9 +3,9 @@

 # Steps to be done
 # steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
-steps: ["preprocessing", "assembly", "mapping"]
-steps_annotation: ["rgi", "plasflow", "minced", "barrnap"] # prodigal is run in any case
-steps_analysis: ["quast", "cdhit", "diamond", "mash"]
+steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis"]
+steps_annotation: ["rgi", "plasflow", "minced", "barrnap", "kegg"] # prodigal is run in any case
+steps_analysis: ["quast", "mash", "mashmap", "fastani", "mummer", "cdhit", "diamond"]
 steps_taxonomy: ["kraken2", "kaiju"]

 ############################################################
@@ -183,6 +183,11 @@ bbmap:
        # key: url of GZ archive
        GCF_000001405.38_GRCh38.p12: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"

+# HMMs
+hmm:
+    threads: 10
+    kegg: "KO_cdhitGe10000_160314.hmm"
+
 # Assembly quality
 # https://github.com/ablab/quast
 quast:

--- a/config/GDB/sbatch.sh
+++ b/config/GDB/sbatch.sh
@@ -15,7 +15,8 @@ SMK_ENV="/scratch/users/vgalata/miniconda3/ONT_pilot" # CHANGE as needed
 SMK_CONFIG="config/GDB/config.yaml"
 SMK_SLURM="config/GDB/slurm.yaml"
 # slurm cluster call
-SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} --ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"
+SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
+--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

 conda activate ${SMK_ENV} && \
 snakemake -s workflow/Snakefile -rp --jobs 7 --local-cores 1 \

--- a/config/GDB/slurm.yaml
+++ b/config/GDB/slurm.yaml
@@ -56,7 +56,7 @@ assembly_hy_metaspades:
    partition: "bigmem"

 assembly_hy_operams:
-    time: "00-12:00:00"
+    time: "00-16:00:00"
    partition: "bigmem"

 # Assembly polishing
@@ -95,7 +95,7 @@ annotation_prodigal:
    partition: "batch"

 annotation_hmm_kegg:
-    time: "00-6:00:00"
+    time: "00-8:00:00"
    partition: "batch"

 annotation_plasflow:

--- a/config/Zymo/config.yaml
+++ b/config/Zymo/config.yaml
@@ -4,8 +4,8 @@
 # Steps to be done
 # steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
 steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
-steps_annotation: ["diamond", "rgi", "plasflow", "minced", "barrnap"] # prodigal is run in any case
-steps_analysis: ["quast", "cdhit", "mash_dist"]
+steps_annotation: ["rgi", "plasflow", "minced", "barrnap", "kegg"] # prodigal is run in any case
+steps_analysis: ["quast", "mash", "mashmap", "fastani", "mummer", "cdhit", "diamond"]
 steps_taxonomy: ["kraken2", "kaiju"]

 ############################################################
@@ -16,8 +16,8 @@ work_dir: "/scratch/users/vgalata/Zymo"

 # Paths WITHIN the working directory
 # directory containing required DBs (should be writeable)
-db_dir: "dbs"
-# results directory
+db_dir: "/mnt/lscratch/users/vgalata/ONT_pilot_DBs"
+# results directory (will be created in work_dir)
 results_dir: "results"

 # Data paths: Use absolute paths or paths relative to the working directory !!!
@@ -49,9 +49,7 @@ data:
 ##############################
 # Preprocessing

-# TODO: installation ???
-# Preprocessing: LR: Basecalling
-# XXX
+# https://community.nanoporetech.com/protocols/Guppy-protocol/v/GPB_2003_v1_revT_14Dec2018
 guppy:
    config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
    gpu:
@@ -66,49 +64,31 @@ guppy:
        gpu_device: "cuda:0"
        threads: 20

-# Preprocessing: SR
 # https://github.com/OpenGene/fastp
 fastp:
    threads: 10
    min_length: 40

-# FastQ QC
 # https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
 fastqc:
    threads: 10
-    params: "-q -f fastq"

 ##############################
 # Assembly

 # List of assemblers for different read types: assembler names MUST be UNIQUE
-# Allowed values:
-#   SR: megahit, metaspades
-#   Hy: metaspadeshybrid (metaspades w/ LR), operams
-#   LR: flye, wtdbg2, canu
-#   HyHy: imp3 (IMP3 assembly using LR and SR from metaT and metaG), assembly is not part of the pipeline
-#   Polishing w/ SR: suffix "_sr" for hybrid and LR assemblers
 assemblers:
    sr: ["megahit", "metaspades"]
-    lr: ["flye", "wtdbg2", "flye_sr", "wtdbg2_sr"]
-    hy: ["metaspadeshybrid", "operams", "metaspadeshybrid_sr", "operams_sr"]
-    hyhy: []
+    lr: ["flye", "canu"]
+    hy: ["metaspadeshybrid", "operamsmegahit", "operamsmetaspades"]

 # https://github.com/fenderglass/Flye
 flye:
    threads: 10
-    genome_size: "1g"
-
-# https://github.com/ruanjue/wtdbg2
-wtdbg2:
-    threads: 10
-    bin: "/scratch/users/sbusi/tools/wtdbg2/"
-    genome_size: "1g"

 # https://canu.readthedocs.io/en/latest/
 canu:
    threads: 24
-    # mem: "64g"
    genome_size: "1g"

 # https://github.com/ablab/spades
@@ -119,14 +99,12 @@ metaspades:
 megahit:
    threads: 10

-# TODO: installation
 # https://github.com/CSB5/OPERA-MS
 operams:
    threads: 10
-    bin: "/home/users/sbusi/apps/miniconda3/envs/operams/OPERA-MS/OPERA-MS.pl"

 ##############################
-# Long-read assembly polishing
+# Assembly polishing

 # https://nanoporetech.github.io/medaka/index.html
 medaka:
@@ -140,14 +118,10 @@ racon:
 ##############################
 # Mapping

-# Mapper
 # http://bio-bwa.sourceforge.net/
 bwa:
    threads: 10
-    long_reads_index:
-        opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"

-# SAM utils
 # http://www.htslib.org/doc/samtools.html
 samtools:
    sort:
@@ -157,35 +131,28 @@ samtools:
 ##############################
 # Annotation

-# Sequence search
 # https://github.com/bbuchfink/diamond
 diamond:
    threads: 20
-    db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd" # TODO: data download
+    db: "nr_uniprot_trembl.dmnd" # file name within db_dir

-# CRISPR
 # https://github.com/dnasko/CASC
 casc:
    threads: 10

-# CRISPR
 # https://github.com/ctSkennerton/minced
 # minced:

-# Plasmid prediction
 # https://github.com/smaegol/PlasFlow
-plasflow:
-    threshold: 0.7 # class. prob. threshold
-    minlen: 1000 # rm contigs with length below this threshold
+# plasflow:
+#     threshold: 0.7 # class. prob. threshold
+#     minlen: 1000 # rm contigs with length below this threshold

-# AMR prediction
 # https://github.com/arpcard/rgi
 rgi:
    threads: 5
    db_url: "https://card.mcmaster.ca/latest/data"
-    alignment_tool: "DIAMOND"

-# rRNA genes prediction
 # https://github.com/tseemann/barrnap
 barrnap:
    threads: 5
@@ -202,42 +169,39 @@ cdhit:
 # https://github.com/BioInfoTools/BBMap/
 bbmap:
    threads: 10
-    # References to be used (w/ md5sums)
-    rrna_refs: [
-        # c0cd2aa2e84e3e3977859c34feb63cd5  /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/rfam-5.8s-database-id98.fasta
-        # 703e4c270ab0a578deb4800c33b36367  /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/rfam-5s-database-id98.fasta
-        # 8b4e6c6f17f6f35444a60fdc915e052c  /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-arc-16s-id95.fasta
-        # ca4edcdddb98d7868f93e2308e297704  /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-arc-23s-id98.fasta
-        # db6e72022cf650c4b33bd888b92a0391  /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-bac-16s-id90.fasta
-        # f347d2f8f8ffbfa28c785e3a9fe3db79  /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-bac-23s-id98.fasta
-        # 878a413765d09c3ec75409fb1d1573f1  /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-euk-18s-id95.fasta
-        # cbb973e63f52981bd591de0404df5839  /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-euk-28s-id98.fast
-        "/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/rfam-5.8s-database-id98.fasta",
-        "/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/rfam-5s-database-id98.fasta",
-        "/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-arc-16s-id95.fasta",
-        "/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-arc-23s-id98.fasta",
-        "/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-bac-16s-id90.fasta",
-        "/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-bac-23s-id98.fasta",
-        "/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-euk-18s-id95.fasta",
-        "/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-euk-28s-id98.fasta"
+    rrna_refs: [ # file paths relative to db_dir
+        "sortmerna/rfam-5.8s-database-id98.fasta",
+        "sortmerna/rfam-5s-database-id98.fasta",
+        "sortmerna/silva-arc-16s-id95.fasta",
+        "sortmerna/silva-arc-23s-id98.fasta",
+        "sortmerna/silva-bac-16s-id90.fasta",
+        "sortmerna/silva-bac-23s-id98.fasta",
+        "sortmerna/silva-euk-18s-id95.fasta",
+        "sortmerna/silva-euk-28s-id98.fasta"
    ]
+    host_refs: null
+
+# HMMs
+hmm:
+    threads: 10
+    kegg: "KO_cdhitGe10000_160314.hmm"

 # Assembly quality
 # https://github.com/ablab/quast
 quast:
    threads: 10

-# Sequence search and clustering
-# https://github.com/soedinglab/MMseqs2
-# mmseqs2:
-    # threads: 30
-    # createdb: "--dbtype 2 --shuffle -v"
-    # easycluster: "--kmer-per-seq-scale 0.5 --cov-mode 0 -c 0.5 --min-seq-id 0.9"
-    # easylinclust: "--kmer-per-seq-scale 0.5 --cov-mode 0 -c 0.5 --min-seq-id 0.9"
-    # path: "/home/users/sbusi/apps/mmseqs/bin"
-    # createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
-    # rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
-    # convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
+# https://github.com/marbl/mash
+mash:
+    threads: 10
+
+# https://github.com/marbl/MashMap
+mashmap:
+    threads: 10
+
+# https://github.com/ParBLiSS/FastANI
+fastani:
+    threads: 10

 ##############################
 # Taxonomy
@@ -246,49 +210,19 @@ quast:
 # https://github.com/DerrickWood/kraken2
 kraken2:
    threads: 10
-    db:
-        maxikraken: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
-    class:
-        sr: "--gzip-compressed --paired"
-        lr: ""
-        contigs: ""
+    db: # value = directory name within db_dir
+        maxikraken: "maxikraken2_1903_140GB"

 # http://kaiju.binf.ku.dk/
 # http://kaiju.binf.ku.dk/server
 # https://github.com/bioinformatics-centre/kaiju
 kaiju:
    threads: 10
-    db: # key = basename of *.fmi
-        kaiju_db_nr_euk: "/mnt/isilon/projects/ecosystem_biology/databases/kaiju/kaiju_db_nr_euk_2020-05-25"
+    db: # value = directory name within db_dir
+        # key = basename of *.fmi
+        kaiju_db_nr_euk: "kaiju_db_nr_euk_2020-05-25"
    ranks: ["phylum", "class", "order", "family", "genus", "species"]

-# # XXX
-# GTDBTK:
-#     DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
-
-##############################
-# MISC
-
-# https://github.com/marbl/mash
-mash:
-    threads: 10
-
-##############################
-# Binning
-
-# DAS_Tool:
-#     path: "/home/users/sbusi/apps/DAS_Tool-master"
-#     bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
-#     db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
-#     Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
-# # Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
-# # dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
-
-##############################
-# ???
-# nonpareil:
-#     memory: 4096
-#     threads: 14
-
-# rebaler:
-#     threads: 28
+# https://github.com/Ecogenomics/GTDBTk
+GTDBTK: # DATA = directory name within db_dir
+    DATA: "gtdbtk_release89"
--- a/config/Zymo/sbatch.fast5.sh
+++ b/config/Zymo/sbatch.fast5.sh
@@ -15,7 +15,8 @@ SMK_ENV="/scratch/users/vgalata/miniconda3/ONT_pilot" # CHANGE as needed
 SMK_CONFIG="config/Zymo/config.fast5.yaml"
 SMK_SLURM="config/Zymo/slurm.fast5.yaml"
 # slurm cluster call
-SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} --ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"
+SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
+--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

 conda activate ${SMK_ENV} && \
 snakemake -s workflow/Snakefile -rp --jobs 7 --local-cores 1 \

--- a/config/Zymo/sbatch.sh
+++ b/config/Zymo/sbatch.sh
@@ -15,7 +15,8 @@ SMK_ENV="/scratch/users/vgalata/miniconda3/ONT_pilot" # CHANGE as needed
 SMK_CONFIG="config/Zymo/config.yaml"
 SMK_SLURM="config/Zymo/slurm.yaml"
 # slurm cluster call
-SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} --ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"
+SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
+--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

 conda activate ${SMK_ENV} && \
 snakemake -s workflow/Snakefile -rp --jobs 7 --local-cores 1 \