zymo configs: updated

14c69148 · Valentina Galata · 29822190 · 14c69148 · 14c69148 · 14c69148
Commit 14c69148 authored 4 years ago by Valentina Galata
--- a/config/Zymo/config.fast5.yaml
+++ b/config/Zymo/config.fast5.yaml
-work_dir: "/mnt/lscratch/users/vgalata/Zymo-GridION-EVEN-BB-SN"
+work_dir: "/scratch/users/vgalata/Zymo"

 single_fast5_dir: "data_single_fast5"
 multi_fast5_dir: "data_multi_fast5"

--- a/config/Zymo/config.yaml
+++ b/config/Zymo/config.yaml
 ############################################################
 # STEPS

-# Pipeline steps to be done: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
+# Steps to be done
+# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
 steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
+steps_annotation: ["diamond", "rgi", "plasflow", "minced", "barrnap"] # prodigal is run in any case
+steps_analysis: ["quast", "cdhit", "mash_dist"]
+steps_taxonomy: ["kraken2", "kaiju"]

 ############################################################
 # INPUT

-# working directory: will contain the results
-work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/mock"
+# working directory: will contain the results (should be writeable)
+work_dir: "/scratch/users/vgalata/Zymo"

 # Paths WITHIN the working directory
-# directory containing required DBs
+# directory containing required DBs (should be writeable)
 db_dir: "dbs"
 # results directory
 results_dir: "results"
@@ -21,11 +25,11 @@ data:
    # Meta-genomics
    metag:
        sr: 
-            r1: "data/raw/short_reads/ERR2984773/ERR2984773_1.fastq.gz"
-            r2: "data/raw/short_reads/ERR2984773/ERR2984773_2.fastq.gz"
+            r1: "/scratch/users/vgalata/Zymo/data_sr/ERR2984773_1.fastq.gz"
+            r2: "/scratch/users/vgalata/Zymo/data_sr/ERR2984773_2.fastq.gz"
        ont:
            # List of directories containing FAST5 files
-            dirs: ["/mnt/lscratch/users/vgalata/Zymo-GridION-EVEN-BB-SN/data_multi_fast5/"] # leave empty if no data, i.e. []
+            dirs: ["/scratch/users/vgalata/Zymo/data_multi_fast5/"] # leave empty if no data, i.e. []
            # List of FAST5 files
            files: [] # leave empty if no data, i.e. []
            # FastQ: if given, NO basecalling will be done !!!
@@ -36,7 +40,7 @@ data:
            r1: "" # leave empty if no data, i.e. ""
            r2: "" # leave empty if no data, i.e. ""
    # Meta-proteomics
-    metap:
+    # metap:
        # TODO

 ############################################################
@@ -45,7 +49,7 @@ data:
 ##############################
 # Preprocessing

-# TODO: installation
+# TODO: installation ???
 # Preprocessing: LR: Basecalling
 # XXX
 guppy:
@@ -63,11 +67,17 @@ guppy:
        threads: 20

 # Preprocessing: SR
-# XXX
+# https://github.com/OpenGene/fastp
 fastp:
    threads: 10
    min_length: 40

+# rRNA gene filtering
+sortmerna:
+    threads: 20
+    # References to be used (w/ md5sums)
+    refs: []
+
 # FastQ QC
 # https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
 fastqc:
@@ -77,22 +87,35 @@ fastqc:
 ##############################
 # Assembly

-# List of assemblers for different read types
+# List of assemblers for different read types: assembler names MUST be UNIQUE
 assemblers:
    sr: ["megahit", "metaspades"]
-    lr: ["flye"]
+    lr: ["flye", "wtdbg2"]
    hy: ["metaspadeshybrid", "operams"]
+    hyhy: []

-# XXX
+# https://github.com/fenderglass/Flye
 flye:
    threads: 10
    genome_size: "1g"

-# XXX
+# https://github.com/ruanjue/wtdbg2
+wtdbg2:
+    threads: 10
+    bin: "/scratch/users/sbusi/tools/wtdbg2/"
+    genome_size: "1g"
+
+# https://canu.readthedocs.io/en/latest/
+canu:
+    threads: 24
+    # mem: "64g"
+    genome_size: "1g"
+
+# https://github.com/ablab/spades
 metaspades:
    threads: 10

-# XXX
+# https://github.com/voutcn/megahit
 megahit:
    threads: 10

@@ -105,12 +128,12 @@ operams:
 ##############################
 # Long-read assembly polishing

-# XXX
+# https://nanoporetech.github.io/medaka/index.html
 medaka:
-    threads: 10 # do NOT set to large value (e.g. using 30 did not work)
+    threads: 10 # NOTE: avoid large values !!! e.g. 30 did not work
    model: r941_min_high # the MinION model, high accuracy

-# XXX
+# https://github.com/isovic/racon
 racon:
    threads: 30

@@ -128,33 +151,25 @@ bwa:
 # http://www.htslib.org/doc/samtools.html
 samtools:
    sort:
-        # threads: 10
        chunk_size: "4G"
-    view:
-        # threads: 10

 ##############################
 # Annotation

-# TODO: data download
 # Sequence search
-# XXX
+# https://github.com/bbuchfink/diamond
 diamond:
    threads: 20
-    #db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
-    db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
+    db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd" # TODO: data download

 # CRISPR
 # https://github.com/dnasko/CASC
 casc:
    threads: 10
-    # path: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
-    # perl5lib: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"

 # CRISPR
 # https://github.com/ctSkennerton/minced
-minced:
-    # path: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
+# minced:

 # Plasmid prediction
 # https://github.com/smaegol/PlasFlow
@@ -167,34 +182,42 @@ plasflow:
 rgi:
    threads: 5
    db_url: "https://card.mcmaster.ca/latest/data"
-    alignment_tool: "DIAMOND" # DIAMOND or BLAST
+    alignment_tool: "DIAMOND"
+
+# rRNA genes prediction
+# https://github.com/tseemann/barrnap
+barrnap:
+    threads: 5
+    kingdom: ["bac", "arc", "euk", "mito"]

 ##############################
 # Analysis

+# https://github.com/weizhongli/cdhit --> wiki
+cdhit:
+    threads: 10
+
 # XXX
 bbmap:
    threads: 10

 # Assembly quality
-# XXX
+# https://github.com/ablab/quast
 quast:
    threads: 10

 # Sequence search and clustering
 # https://github.com/soedinglab/MMseqs2
-mmseqs2:
-    threads: 30
+# mmseqs2:
+    # threads: 30
+    # createdb: "--dbtype 2 --shuffle -v"
+    # easycluster: "--kmer-per-seq-scale 0.5 --cov-mode 0 -c 0.5 --min-seq-id 0.9"
+    # easylinclust: "--kmer-per-seq-scale 0.5 --cov-mode 0 -c 0.5 --min-seq-id 0.9"
    # path: "/home/users/sbusi/apps/mmseqs/bin"
    # createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
    # rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
    # convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"

-# Seq. alignment
-# https://mummer4.github.io/
-mummer:
-    archive: "https://github.com/mummer4/mummer/releases/download/v3.9.4alpha/mummer-3.9.4alpha.tar.gz"
-
 ##############################
 # Taxonomy

@@ -206,8 +229,17 @@ kraken2:
        maxikraken: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
    class:
        sr: "--gzip-compressed --paired"
-        lr: "" # TODO
-        contigs: "" # TODO
+        lr: ""
+        contigs: ""
+
+# http://kaiju.binf.ku.dk/
+# http://kaiju.binf.ku.dk/server
+# https://github.com/bioinformatics-centre/kaiju
+kaiju:
+    threads: 10
+    db: # key = basename of *.fmi
+        kaiju_db_nr_euk: "/mnt/isilon/projects/ecosystem_biology/databases/kaiju/kaiju_db_nr_euk_2020-05-25"
+    ranks: ["phylum", "class", "order", "family", "genus", "species"]

 # # XXX
 # GTDBTK:

--- a/config/Zymo/sbatch.sh
+++ b/config/Zymo/sbatch.sh
 #!/bin/bash -l

-##############################
-# SLURM
-# NOTE: used for this script only, NOT for the snakemake call below
-
+# slurm settings if called using sbatch
 #SBATCH -J ONT_SMK
 #SBATCH -N 1
 #SBATCH -n 1
 #SBATCH -c 1
-#SBATCH --time=0-10:00:00
+#SBATCH --time=3-00:00:00
 #SBATCH -p batch
 #SBATCH --qos=qos-batch

-##############################
-# SNAKEMAKE
-
-# conda env name
+# conda env name or path
 ONTP_ENV="ONT_pilot"
-# number of cores for snakemake
-ONTP_CORES=60
-# snakemake file
-ONTP_SMK="workflow/Snakefile"
-# config file
-ONTP_CONFIG="config/Zymo-GridION-EVEN-BB-SN/config.yaml" # USER INPUT REQUIRED
-# slurm config file
-ONTP_SLURM="config/Zymo-GridION-EVEN-BB-SN/slurm.yaml"
+# config files
+ONTP_CONFIG="config/Zymo/config.yaml"
+ONTP_SLURM="config/Zymo/slurm.yaml"
 # slurm cluster call
-ONTP_CLUSTER="-p {cluster.partition} -q {cluster.qos} {cluster.explicit} -N {cluster.nodes} -n {cluster.n} -c {threads} -t {cluster.time} --job-name={cluster.job-name}"
-
-##############################
-# IMP
+ONTP_CLUSTER="sbatch -p {cluster.partition} -q {cluster.qos} {cluster.explicit} -N {cluster.nodes} -n {cluster.n} -c {threads} -t {cluster.time} --job-name={cluster.job-name}"

-# activate the env
 conda activate ${ONTP_ENV}

-# run the pipeline
-snakemake -s ${ONTP_SMK} -rp --cores ${ONTP_CORES} --configfile ${ONTP_CONFIG} \
--use-conda --conda-prefix ${CONDA_PREFIX}/pipeline \
--cluster-config ${ONTP_SLURM} --cluster "sbatch ${ONTP_CLUSTER}"
+snakemake -s workflow/Snakefile -rp --jobs 10 --local-cores 1 \
+--configfile ${ONTP_CONFIG} --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline \
+--cluster-config ${ONTP_SLURM} --cluster "${ONTP_CLUSTER}"
--- a/config/Zymo/slurm.yaml
+++ b/config/Zymo/slurm.yaml
@@ -27,6 +27,14 @@ fastp_sr:
    n: 1
    explicit: ""

+sortmerna_filt:
+    time: "01-12:00:00"
+    partition: "batch"
+    qos: "qos-batch"
+    nodes: 1
+    n: 1
+    explicit: ""
+
 # Assembly
 assembly_lr_flye:
    time: "00-10:00:00"
@@ -36,6 +44,22 @@ assembly_lr_flye:
    n: 1
    explicit: ""

+assembly_lr_wtdbg2:
+    time: "00-10:00:00"
+    partition: "bigmem"
+    qos: "qos-bigmem"
+    nodes: 1
+    n: 1
+    explicit: ""
+
+assembly_lr_canu:
+    time: "05-00:00:00"
+    partition: "bigmem"
+    qos: "qos-bigmem"
+    nodes: 1
+    n: 1
+    explicit: ""
+
 assembly_sr_megahit:
    time: "01-4:00:00"
    partition: "bigmem"
@@ -77,7 +101,15 @@ mapping_bwa_idx_assembly:
    n: 1
    explicit: ""

-mapping_bwa_mem_assembly_sr:
+mapping_bwa_mem_assembly_sr_metag:
+    time: "00-10:00:00"
+    partition: "batch"
+    qos: "qos-batch"
+    nodes: 1
+    n: 1
+    explicit: ""
+
+mapping_bwa_mem_assembly_sr_metat:
    time: "00-10:00:00"
    partition: "batch"
    qos: "qos-batch"
@@ -101,6 +133,14 @@ mapping_bwa_mem_assembly_hy:
    n: 1
    explicit: ""

+mapping_bwa_mem_assembly_hyhy:
+    time: "00-10:00:00"
+    partition: "batch"
+    qos: "qos-batch"
+    nodes: 1
+    n: 1
+    explicit: ""
+
 # Assembly polishing
 mapping_bwa_idx_polishing:
    time: "00-10:00:00"
@@ -134,6 +174,22 @@ polishing_lr_medaka:
    n: 1
    explicit: ""

+mapping_bwa_mem_polishing_metat:
+    time: "00-10:00:00"
+    partition: "batch"
+    qos: "qos-batch"
+    nodes: 1
+    n: 1
+    explicit: ""
+
+polishing_metat_racon:
+    time: "00-10:00:00"
+    partition: "bigmem"
+    qos: "qos-bigmem"
+    nodes: 1
+    n: 1
+    explicit: ""
+
 # Annotation
 annotation_prodigal:
    time: "01-4:00:00"
@@ -159,7 +215,8 @@ annotation_plasflow:
    n: 1
    explicit: ""

-kraken2_contigs:
+# Taxonomy
+tax_kraken2_contigs:
    time: "00-02:00:00"
    partition: "bigmem"
    qos: "qos-bigmem"
@@ -167,7 +224,7 @@ kraken2_contigs:
    n: 1
    explicit: ""

-kraken2_sr:
+tax_kraken2_sr:
    time: "00-02:00:00"
    partition: "bigmem"
    qos: "qos-bigmem"
@@ -175,10 +232,26 @@ kraken2_sr:
    n: 1
    explicit: ""

-kraken2_lr:
+tax_kraken2_lr:
    time: "00-02:00:00"
    partition: "bigmem"
    qos: "qos-bigmem"
    nodes: 1
    n: 1
    explicit: ""
+
+tax_kaiju:
+    time: "00-01:00:00"
+    partition: "bigmem"
+    qos: "qos-bigmem"
+    nodes: 1
+    n: 1
+    explicit: ""
+
+tax_kaiju_summary:
+    time: "00-00:10:00"
+    partition: "batch"
+    qos: "qos-batch"
+    nodes: 1
+    n: 1
+    explicit: ""
\ No newline at end of file