init setup for preprocessing and assembly

6cbf5ed6 · Valentina Galata · ab685564 · 6cbf5ed6 · 6cbf5ed6 · 6cbf5ed6
Commit 6cbf5ed6 authored 4 years ago by Valentina Galata
--- a/Snakefile
+++ b/Snakefile
@@ -39,7 +39,7 @@ INPUT_FAST5 = []
 INPUT_FAST5 += [os.path.abspath(f) for f in config["data"]["ont"]["files"]]
 for ont_d in config["data"]["ont"]["dirs"]:
    INPUT_FAST5 += [os.path.abspath(f) for f in Path(ont_d).rglob('*.fast5')]
-INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
+# INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
 # SR files
 INPUT_SR = config["data"]["sr"]
 # MetaT files
@@ -49,19 +49,14 @@ INPUT_METAT = config["data"]["metat"]
 DATA_FAST5 = [os.path.join(RESULTS_DIR, "input_ont", os.path.basename(f)) for f in INPUT_FAST5]
 assert len(DATA_FAST5) == len(set(DATA_FAST5)), "Created link names for FAST5 files are NOT unique: {}".format(DATA_FAST5)
 DATA_SR = {
-    "r1": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r1"])),
+    "r1": os.path.join(RESULTS_DIR, "input_sr/R1.fq.gz"),
-    "r2": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r2"]))
+    "r2": os.path.join(RESULTS_DIR, "input_sr/R2.fq.gz")
 }
 DATA_METAT = {
-    "r1": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r1"])),
+    "r1": os.path.join(RESULTS_DIR, "input_metat/R1.fq.gz"),
-    "r2": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r2"]))
+    "r2": os.path.join(RESULTS_DIR, "input_metat/R2.fq.gz")
 }
-# # Tools
-# ASSEMBLERS = config["assemblers"]
-# HYBRID_ASSEMBLER = config["hybrid_assembler"]
-# MAPPERS = ["bwa", "mmi"]
 ##############################
 # TARGETS & RULES
 # List of targets to be created
@@ -74,11 +69,22 @@ include:
    "workflow/steps/prepare_input.smk"
 # TARGETS.append("status/prepare_input.done")
-# Basecalling
+# Preprocessing: request the long- and short-read status flags
 if "preprocessing" in STEPS:
    include:
        "workflow/steps/preprocessing.smk"
-    TARGETS.append("status/preprocessing_lr.done")
+    TARGETS.extend((
+        "status/preprocessing_lr.done",
+        "status/preprocessing_sr.done",
+    ))
+# Assembly: request the assembly status flag
+if "assembly" in STEPS:
+    include:
+        "workflow/steps/assembly.smk"
+    TARGETS.append("status/assembly.done")
 # # Assembly annotation
 # if 'assembly_annotation' in STEPS:

--- a/config/config.yaml
+++ b/config/config.yaml
@@ -3,7 +3,7 @@
 # Pipeline steps
 # steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"]
-steps: ["preprocessing"]
+steps: ["preprocessing", "assembly"]
 # Analysis sub-steps
 analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
@@ -15,7 +15,7 @@ analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
 # work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
 work_dir: "/scratch/users/vgalata/ont_pilot"
-# paths within the working directory
+# Paths WITHIN the working directory
 # directory containing required DBs
 db_dir: "dbs"
 # results directory
@@ -38,16 +38,14 @@ data:
        # List of FAST5 files
        files: []
-binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
+# binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
 ############################################################
 # TOOLS
-# Basecalling
+# Preprocessing: LR: Basecalling
 guppy:
-    config:
+    config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
-        methylation_aware: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
-        # not_methylation_aware: "dna_r9.4.1_450bps_hac.cfg"
    # cpu:
    #     path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
    #     bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
@@ -63,8 +61,7 @@ guppy:
        num_callers: 4
        runners_per_device: 2
        gpu_device: "cuda:0"
-        # threads: 28
+        threads: 20
-        threads: 5
    # barcoder:
    #     path: "/home/users/sbusi/apps/ont-guppy/bin"
    #     bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
@@ -72,132 +69,147 @@ guppy:
    #     records_per_fastq: 8000
    #     threads: 8
-# assemblers: ["flye"]
+# Preprocessing: SR
-assemblers:
-    sr: ["megahit", "metaspades"]
-    lr: ["flye"]
-    hy: ["metaspadeshybrid"]
-p7zip:
-    bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
-    threads: 4
-ont_fast5_api:
-    single_to_multi_fast5:
-        bin: "single_to_multi_fast5"
-        batch: 8000
-        threads: 8
-nanostats:
 fastp:
+    threads: 10
    min_length: 40
-minimap2:
+# QC
-    threads: 16
+fastqc:
+    threads: 10
-igc:
+    params: "-q -f fastq"
-    uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
-hg38:
-    uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
-genomecov:
-    bin: "bedtools genomecov"
-compute_avg_coverage:
-    bin: "scripts/coverage.awk"
-bwa:
-    threads: 24
-    long_reads_index:
-        opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
-samtools:
+# Assembly
-    sort:
+assemblers:
-        threads: 4
+    sr: ["megahit", "metaspades"]
-        chunk_size: "4G"
+    lr: ["flye"]
-    view:
+    hy: ["metaspadeshybrid"]
-        threads: 4
 flye:
-    bin: "flye"
+    threads: 10
-    threads: 27
    genome_size: "1g"
-operams:
+metaspades:
-    bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
+    threads: 10
-    threads: 28
 megahit:
-    threads: 28
+    threads: 10
-nonpareil:
+# p7zip:
-    memory: 4096
+#     bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
-    threads: 14
+#     threads: 4
+# ont_fast5_api:
-medaka:
+#     single_to_multi_fast5:
-    threads: 28
+#         bin: "single_to_multi_fast5"
+#         batch: 8000
-racon:
+#         threads: 8
-    threads: 28
+# minimap2:
-rebaler:
+#     threads: 16
-    threads: 28
+# igc:
-diamond:
+#     uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
-    threads: 28
-    #db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
+# hg38:
-    db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
+#     uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
-metaspades:
+# genomecov:
-    threads: 28
+#     bin: "bedtools genomecov"
-mmseq2:
+# compute_avg_coverage:
-    threads: 24
+#     bin: "scripts/coverage.awk"
-# Hybrid assembler
+# bwa:
-hybrid_assembler: "metaspades_hybrid"
+#     threads: 24
+#     long_reads_index:
-# Number of cpus or threads to use
+#         opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
-threads: 28
+# samtools:
-# kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
+#     sort:
-kraken2:
+#         threads: 4
-    db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
+#         chunk_size: "4G"
+#     view:
-# Binning
+#         threads: 4
-DAS_Tool:
-    path: "/home/users/sbusi/apps/DAS_Tool-master"
+# flye:
-    bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
+#     bin: "flye"
-    db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
+#     threads: 27
-    Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
+#     genome_size: "1g"
-# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
-# dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
+# operams:
+#     bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
-# XXX
+#     threads: 28
-GTDBTK:
-    DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
+# megahit:
+#     threads: 28
-# XXX
-mmseqs:
+# nonpareil:
-    path: "/home/users/sbusi/apps/mmseqs/bin"
+#     memory: 4096
-    createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
+#     threads: 14
-    rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
-    convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
+# medaka:
+#     threads: 28
-# CRISPR
-CASC:
+# racon:
-    PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
+#     threads: 28
-    PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
-minced:
+# rebaler:
-    PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
+#     threads: 28
-# Plasmid prediction
+# diamond:
-plasflow:
+#     threads: 28
-    threshold: 0.7 # class. prob. threshold
+#     #db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
-    minlen: 1000 # rm contigs with length below this threshold
+#     db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
-# AMR prediction
+# metaspades:
-rgi:
+#     threads: 28
-    db_url: "https://card.mcmaster.ca/latest/data"
-    alignment_tool: "DIAMOND" # DIAMOND or BLAST
+# mmseq2:
+#     threads: 24
+# # Hybrid assembler
+# hybrid_assembler: "metaspades_hybrid"
+# # Number of cpus or threads to use
+# threads: 28
+# # kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
+# kraken2:
+#     db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
+# # Binning
+# DAS_Tool:
+#     path: "/home/users/sbusi/apps/DAS_Tool-master"
+#     bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
+#     db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
+#     Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
+# # Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
+# # dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
+# # XXX
+# GTDBTK:
+#     DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
+# # XXX
+# mmseqs:
+#     path: "/home/users/sbusi/apps/mmseqs/bin"
+#     createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
+#     rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
+#     convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
+# # CRISPR
+# CASC:
+#     PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
+#     PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
+# minced:
+#     PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
+# # Plasmid prediction
+# plasflow:
+#     threshold: 0.7 # class. prob. threshold
+#     minlen: 1000 # rm contigs with length below this threshold
+# # AMR prediction
+# rgi:
+#     db_url: "https://card.mcmaster.ca/latest/data"
+#     alignment_tool: "DIAMOND" # DIAMOND or BLAST
--- a/config/slurm.yaml
+++ b/config/slurm.yaml
 __default__:
-    time: "2-00:00:00"
+    time: "0-02:00:00"
    partition: "batch"
    qos: "qos-batch"
    nodes: 1
    n: 1
-    # ncpus: 1
+    explicit: ""
-    job-name: "ONT_pilot.{rule}"
+    # job-name: "ONT_pilot.{rule}"
    # output: "slurm-%j.%N-%x.out"
    # error: "slurm-%j.%N-%x.err"
-    explicit: ""
    # mail-type: "end"
+# Preprocessing
 guppy_gpu_basecalling:
    time: "01-00:00:00"
    partition: "gpu"
    qos: "qos-gpu"
    nodes: 1
    n: 1
-    # ncpus: 28
    explicit: "--gres=gpu:1"
-#     "run_fastp_on_short_reads":
+fastp_sr:
-#     {
+    time: "00-04:00:00"
-#         "time": "00-04:00:00",
+    partition: "batch"
-#         "n": 1,
+    qos: "qos-batch"
-#         "ncpus": 3,
+    nodes: 1
-#         "partition": "batch",
+    n: 1
-#         "qos": "qos-batch",
+    explicit: ""
-#         "mail-type": "ALL"
-#     },
+# Assembly
+assembly_lr_flye:
+    time: "01-00:00:00"
+    partition: "bigmem"
+    qos: "qos-bigmem"
+    nodes: 1
+    n: 1
+    explicit: ""
+assembly_sr_megahit:
+    time: "01-00:00:00"
+    partition: "bigmem"
+    qos: "qos-bigmem"
+    nodes: 1
+    n: 1
+    explicit: ""
+assembly_sr_metaspades:
+    time: "01-00:00:00"
+    partition: "bigmem"
+    qos: "qos-bigmem"
+    nodes: 1
+    n: 1
+    explicit: ""
+assemble_hy_metaspades:
+    time: "01-00:00:00"
+    partition: "bigmem"
+    qos: "qos-bigmem"
+    nodes: 1
+    n: 1
+    explicit: ""
 #     "mmseq2_compare":
 #     {
 #         "n": 1,

--- a/sbatch.sh
+++ b/sbatch.sh
@@ -18,7 +18,7 @@
 # conda env name
 ONTP_ENV="ONT_pilot"
 # number of cores for snakemake
-ONTP_CORES=30
+ONTP_CORES=10
 # IMP config file
 ONTP_CONFIG="config/config.yaml" # USER INPUT REQUIRED
 # slurm config file

--- a/workflow/rules/assembly.smk
+++ b/workflow/rules/assembly.smk
+# Assembly
+# Long reads
+rule assembly_lr_flye:
+    # Long-read assembly of the merged basecalled reads with Flye (run with --meta).
+    message:
+        "Assembly: long reads: Flye"
+    conda:
+        "../envs/flye_v2_7.yaml"
+    threads:
+        config["flye"]["threads"]
+    input:
+        os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
+    output:
+        os.path.join(RESULTS_DIR, "assembly/lr/flye/assembly.fna")
+    log:
+        out="logs/assembly_lr.flye.out.log",
+        err="logs/assembly_lr.flye.err.log"
+    # Flye is pointed at the directory of {output}; its "assembly.fasta" is then
+    # symlinked to the "assembly.fna" name declared as the rule output.
+    shell:
+        "(date && flye --nano-raw {input} --meta --out-dir $(dirname {output}) --genome-size {config[flye][genome_size]} --threads {threads} && date) 2> {log.err} > {log.out} && "
+        "cd $(dirname {output}) && ln -sf assembly.fasta $(basename {output})"
+# Short reads
+rule assembly_sr_megahit:
+    # Short-read assembly of the fastp-processed read pair with MEGAHIT.
+    input:
+        r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
+        r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
+    output:
+        os.path.join(RESULTS_DIR, "assembly/sr/megahit/assembly.fna")
+    log:
+        out="logs/assembly_sr.megahit.out.log",
+        err="logs/assembly_sr.megahit.err.log"
+    threads:
+        config["megahit"]["threads"]
+    conda:
+        os.path.join(ENV_DIR, "megahit.yaml")
+    message:
+        "Assembly: short reads: MEGAHIT"
+    # NOTE: Snakemake creates the parent directory of {output} before the job starts,
+    #       whereas MEGAHIT aborts if the directory given to -o already exists.
+    #       The (dedicated) output directory is therefore removed right before the call.
+    #       MEGAHIT's "final.contigs.fa" is then symlinked to the declared "assembly.fna".
+    shell:
+        "(date && rm -rf $(dirname {output}) && megahit -1 {input.r1} -2 {input.r2} -t {threads} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
+        "cd $(dirname {output}) && ln -sf final.contigs.fa $(basename {output})"
+rule assembly_sr_metaspades:
+    # Short-read assembly of the fastp-processed read pair with metaSPAdes.
+    message:
+        "Assembly: short reads: MetaSPAdes"
+    conda:
+        os.path.join(ENV_DIR, "spades.yaml")
+    threads:
+        config["metaspades"]["threads"]
+    input:
+        r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
+        r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
+    output:
+        os.path.join(RESULTS_DIR, "assembly/sr/metaspades/assembly.fna")
+    log:
+        out="logs/assembly_sr.metaspades.out.log",
+        err="logs/assembly_sr.metaspades.err.log"
+    # The assembler writes into the directory of {output}; "contigs.fasta" is then
+    # symlinked to the "assembly.fna" name declared as the rule output.
+    shell:
+        "(date && metaspades.py -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
+        "cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
+# Hybrid
+rule assemble_hy_metaspades:
+    # Hybrid assembly: SPAdes in --meta mode on the short-read pair plus the long reads (--nanopore).
+    message:
+        "Assembly: hybrid: MetaSPAdes"
+    conda:
+        os.path.join(ENV_DIR, "spades.yaml")
+    threads:
+        config["metaspades"]["threads"]
+    input:
+        lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"),
+        r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
+        r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
+    output:
+        os.path.join(RESULTS_DIR, "assembly/hy/metaspadeshybrid/assembly.fna")
+    log:
+        out="logs/assembly_hy.metaspades.out.log",
+        err="logs/assembly_hy.metaspades.err.log"
+    # The assembler writes into the directory of {output}; "contigs.fasta" is then
+    # symlinked to the "assembly.fna" name declared as the rule output.
+    shell:
+        "(date && spades.py --meta -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} --nanopore {input.lr} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
+        "cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
--- a/workflow/rules/preprocessing_lr.smk
+++ b/workflow/rules/preprocessing_lr.smk
+# Preprocessing of long reads
 # Basecalling
-# Guppy
 checkpoint guppy_gpu_basecalling:
+    # Basecalls the linked FAST5 files with the Guppy binary and model config set in config["guppy"].
    input:
        DATA_FAST5
    output:
-        directory(os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints"))
+        directory(os.path.join(RESULTS_DIR, "preproc/lr/checkpoints"))
    log:
-        out="logs/basecalling.{guppy_config}.out.log",
+        out="logs/preproc_lr.guppy.out.log",
-        err="logs/basecalling.{guppy_config}.err.log"
+        err="logs/preproc_lr.guppy.err.log"
-    wildcard_constraints:
-        guppy_config="|".join(config["guppy"]["config"].keys())
    threads:
        config["guppy"]["gpu"]["threads"]
-    params:
+    message:
-        config=lambda wildcards: config["guppy"]["config"][wildcards.guppy_config]
+        "Preprocessing long reads: Basecalling w/ Guppy"
+    # NOTE: every config value used in the command must be wrapped in braces;
+    #       without them the literal text is passed to Guppy instead of the configured value.
    shell:
        """
        (date && \
        {config[guppy][gpu][bin]} --input_path $(dirname {input[0]}) --save_path $(dirname {output}) \
-        --config {params.config} \
+        --config {config[guppy][config]} \
        --disable_pings --compress_fastq \
        --cpu_threads_per_caller {threads} \
        -x {config[guppy][gpu][gpu_device]} \
@@ -33,8 +32,7 @@ checkpoint guppy_gpu_basecalling:
 def aggregate_guppy_basecalling(wildcards):
+    # Input function: looks up the output directory of the guppy_gpu_basecalling checkpoint
+    # and returns one path per "fastq_runid_*.fastq.gz" file found in that directory.
    checkpoint_output = checkpoints.guppy_gpu_basecalling.get(**wildcards).output[0]
    return expand(
-        os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"),
+        os.path.join(RESULTS_DIR, "preproc/lr/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"),
-        guppy_config=wildcards.guppy_config,
        runid_i_j=glob_wildcards(os.path.join(checkpoint_output, "fastq_runid_{runid_i_j}.fastq.gz")).runid_i_j,
    )
@@ -42,22 +40,24 @@ rule merge_guppy_basecalling:
    input:
        aggregate_guppy_basecalling
    output:
-        os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz")
+        os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
-    wildcard_constraints:
+    message:
-        guppy_config="|".join(config["guppy"]["config"].keys())
+        "Preprocessing long reads: Cat FASTQ"
    shell:
        "cat $(echo \"{input}\" | sort) > {output}"
 # QC
-rule nanostat:
+rule nanostat_guppy_basecalling:
+    # Runs NanoStat on the merged long-read FASTQ; the report directory and file name
+    # are taken from the dirname and basename of {output}.
    input:
-        os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz")
+        os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
    output:
-        os.path.join(RESULTS_DIR, "qc/lr/{guppy_config}/NanoStats.txt")
+        os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt")
    log:
-        out="logs/nanostats.{guppy_config}.out.log",
+        out="logs/preproc_lr.nanostats.out.log",
-        err="logs/nanostats.{guppy_config}.err.log"
+        err="logs/preproc_lr.nanostats.err.log"
    conda:
        os.path.join(ENV_DIR, "nanostat.yaml")
+    message:
+        "Preprocessing long reads: NanoStats"
    shell:
        "(date && NanoStat --fastq {input} --outdir $(dirname {output}) -n $(basename {output}) && date) 2> {log.err} > {log.out}"
--- a/workflow/rules/preprocessing_sr.smk
+++ b/workflow/rules/preprocessing_sr.smk
+# Preprocessing of short reads
+# Preprocess the short reads using fastp
+rule fastp_sr:
+    # Filters the paired short reads with fastp (minimum read length taken from the config)
+    # and writes the processed pair together with fastp's HTML and JSON reports.
+    message:
+        "Preprocessing short reads: FastP"
+    conda:
+        os.path.join(ENV_DIR, "fastp.yaml")
+    threads:
+        config["fastp"]["threads"]
+    input:
+        r1=DATA_SR["r1"],
+        r2=DATA_SR["r2"]
+    output:
+        r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
+        r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz"),
+        html=os.path.join(RESULTS_DIR, "preproc/sr/fastp.html"),
+        json=os.path.join(RESULTS_DIR, "preproc/sr/fastp.json")
+    log:
+        out="logs/preproc_sr.fastp.out.log",
+        err="logs/preproc_sr.fastp.err.log"
+    shell:
+        "(date && fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} -w {threads} && date) 2> {log.err} > {log.out}"
+rule fastqc_fastp_sr:
+    # Runs FastQC on one fastp-processed mate ({rid} is R1 or R2); the reports are
+    # written to the directory of {output.html}.
+    message:
+        "Preprocessing short reads: FastQC"
+    conda:
+        os.path.join(ENV_DIR, "fastqc.yaml")
+    threads:
+        config["fastqc"]["threads"]
+    wildcard_constraints:
+        rid="R1|R2"
+    input:
+        os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz")
+    output:
+        html=os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_fastqc.html"),
+        zip=os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_fastqc.zip")
+    log:
+        out="logs/preproc_sr.fastqc.{rid}.out.log",
+        err="logs/preproc_sr.fastqc.{rid}.err.log"
+    shell:
+        "(date && fastqc {config[fastqc][params]} -t {threads} -o $(dirname {output.html}) {input} && date) 2> {log.err} > {log.out}"
--- a/workflow/steps/assembly.smk
+++ b/workflow/steps/assembly.smk
+# Assembly
+include:
+    '../rules/assembly.smk'
+# NOTE: Using "shell: touch ..." to prevent the rule from being autodetected as a `localrule`.
+#       This is needed so that an email can be sent upon event changes for this rule.
+rule ASSEMBLY:
+    # Aggregation rule: requests one assembly per configured long-read, short-read and
+    # hybrid assembler, then creates the status flag file.
+    output:
+        "status/assembly.done"
+    input:
+        lr=expand(os.path.join(RESULTS_DIR, "assembly/lr/{tool}/assembly.fna"), tool=config["assemblers"]["lr"]),
+        sr=expand(os.path.join(RESULTS_DIR, "assembly/sr/{tool}/assembly.fna"), tool=config["assemblers"]["sr"]),
+        hy=expand(os.path.join(RESULTS_DIR, "assembly/hy/{tool}/assembly.fna"), tool=config["assemblers"]["hy"])
+    shell:
+        "touch {output}"
\ No newline at end of file
--- a/workflow/steps/preprocessing.smk
+++ b/workflow/steps/preprocessing.smk
 # Preprocessing reads before de novo assembly
 include:
-    '../rules/preprocessing_lr.smk',
+    '../rules/preprocessing_lr.smk'
+include:
    '../rules/preprocessing_sr.smk'
 # NOTE: Using "shell: touch ..." to avoid the rule from being autodetected as `localrule`.
@@ -9,9 +10,18 @@ include:
 rule PREPROCESSING_LR:
+    # Aggregation rule: requests the merged long-read FASTQ and its NanoStat report,
+    # then creates the status flag file.
    input:
-        basecalling=expand(os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz"), guppy_config=config["guppy"]["config"].keys()),
+        basecalling=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"),
-        nanostats=expand(os.path.join(RESULTS_DIR, "qc/lr/{guppy_config}/NanoStats.txt"), guppy_config=config["guppy"]["config"].keys())
+        nanostats=os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt")
    output:
        "status/preprocessing_lr.done"
    shell:
        "touch {output}"
+rule PREPROCESSING_SR:
+    # Aggregation rule: requests the fastp-processed R1/R2 files and their FastQC
+    # HTML/ZIP reports, then creates the status flag file.
+    output:
+        "status/preprocessing_sr.done"
+    input:
+        fastp=expand(os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz"), rid=["R1", "R2"]),
+        fastqc=expand(os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_{ext}"), rid=["R1", "R2"], ext=["fastqc.html", "fastqc.zip"])
+    shell:
+        "touch {output}"