diff --git a/Snakefile b/Snakefile index 80707454d7dec82692bc47cdee257d2b38680b00..d40c0e7402b49421f5315433701fb83ffc3bc717 100755 --- a/Snakefile +++ b/Snakefile @@ -39,7 +39,7 @@ INPUT_FAST5 = [] INPUT_FAST5 += [os.path.abspath(f) for f in config["data"]["ont"]["files"]] for ont_d in config["data"]["ont"]["dirs"]: INPUT_FAST5 += [os.path.abspath(f) for f in Path(ont_d).rglob('*.fast5')] -INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST +# INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST # SR files INPUT_SR = config["data"]["sr"] # MetaT files @@ -49,19 +49,14 @@ INPUT_METAT = config["data"]["metat"] DATA_FAST5 = [os.path.join(RESULTS_DIR, "input_ont", os.path.basename(f)) for f in INPUT_FAST5] assert len(DATA_FAST5) == len(set(DATA_FAST5)), "Created link names for FAST5 files are NOT unique: {}".format(DATA_FAST5) DATA_SR = { - "r1": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r1"])), - "r2": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r2"])) + "r1": os.path.join(RESULTS_DIR, "input_sr/R1.fq.gz"), + "r2": os.path.join(RESULTS_DIR, "input_sr/R2.fq.gz") } DATA_METAT = { - "r1": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r1"])), - "r2": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r2"])) + "r1": os.path.join(RESULTS_DIR, "input_metat/R1.fq.gz"), + "r2": os.path.join(RESULTS_DIR, "input_metat/R2.fq.gz") } -# # Tools -# ASSEMBLERS = config["assemblers"] -# HYBRID_ASSEMBLER = config["hybrid_assembler"] -# MAPPERS = ["bwa", "mmi"] - ############################## # TARGETS & RULES # List of targets to be created @@ -74,11 +69,22 @@ include: "workflow/steps/prepare_input.smk" # TARGETS.append("status/prepare_input.done") -# Basecalling +# Preprocessing if "preprocessing" in STEPS: include: "workflow/steps/preprocessing.smk" - TARGETS.append("status/preprocessing_lr.done") + TARGETS += [ + "status/preprocessing_lr.done", + "status/preprocessing_sr.done" + ] + +# Assembly +if "assembly" 
in STEPS: + include: + "workflow/steps/assembly.smk" + TARGETS += [ + "status/assembly.done" + ] # # Assembly annotation # if 'assembly_annotation' in STEPS: diff --git a/config/config.yaml b/config/config.yaml index fcd77728ecbdd369759213ceb27983f0bc90cddb..c665b542155651b1dfce46ecc22f822195c2566b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,7 +3,7 @@ # Pipeline steps # steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"] -steps: ["preprocessing"] +steps: ["preprocessing", "assembly"] # Analysis sub-steps analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"] @@ -15,7 +15,7 @@ analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"] # work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB" work_dir: "/scratch/users/vgalata/ont_pilot" -# paths within the working directory +# Paths WITHIN the working directory # directory containing required DBs db_dir: "dbs" # results directory @@ -38,16 +38,14 @@ data: # List of FAST5 files files: [] -binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"] +# binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"] ############################################################ # TOOLS -# Basecalling +# Preprocessing: LR: Basecalling guppy: - config: - methylation_aware: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg" - # not_methylation_aware: "dna_r9.4.1_450bps_hac.cfg" + config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg" # cpu: # path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin" # bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller" @@ -63,8 
+61,7 @@ guppy: num_callers: 4 runners_per_device: 2 gpu_device: "cuda:0" - # threads: 28 - threads: 5 + threads: 20 # barcoder: # path: "/home/users/sbusi/apps/ont-guppy/bin" # bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder" @@ -72,132 +69,147 @@ guppy: # records_per_fastq: 8000 # threads: 8 -# assemblers: ["flye"] -assemblers: - sr: ["megahit", "metaspades"] - lr: ["flye"] - hy: ["metaspadeshybrid"] - -p7zip: - bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za" - threads: 4 -ont_fast5_api: - single_to_multi_fast5: - bin: "single_to_multi_fast5" - batch: 8000 - threads: 8 - -nanostats: - +# Preprocessing: SR fastp: + threads: 10 min_length: 40 -minimap2: - threads: 16 - -igc: - uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz" - -hg38: - uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz" - -genomecov: - bin: "bedtools genomecov" - -compute_avg_coverage: - bin: "scripts/coverage.awk" - -bwa: - threads: 24 - long_reads_index: - opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5" +# QC +fastqc: + threads: 10 + params: "-q -f fastq" -samtools: - sort: - threads: 4 - chunk_size: "4G" - view: - threads: 4 +# Assembly +assemblers: + sr: ["megahit", "metaspades"] + lr: ["flye"] + hy: ["metaspadeshybrid"] flye: - bin: "flye" - threads: 27 + threads: 10 genome_size: "1g" -operams: - bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl" - threads: 28 +metaspades: + threads: 10 megahit: - threads: 28 - -nonpareil: - memory: 4096 - threads: 14 - -medaka: - threads: 28 - -racon: - threads: 28 - -rebaler: - threads: 28 - -diamond: - threads: 28 - #db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd" - db: 
"/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd" - -metaspades: - threads: 28 - -mmseq2: - threads: 24 - -# Hybrid assembler -hybrid_assembler: "metaspades_hybrid" - -# Number of cpus or threads to use -threads: 28 - -# kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/" -kraken2: - db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/" - -# Binning -DAS_Tool: - path: "/home/users/sbusi/apps/DAS_Tool-master" - bin: "/home/users/sbusi/apps/DAS_Tool-master/src/" - db: "/home/users/sbusi/apps/DAS_Tool-master/db/" - Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/" -# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/" -# dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/" - -# XXX -GTDBTK: - DATA: "/home/users/sbusi/apps/db/gtdbtk/release89" - -# XXX -mmseqs: - path: "/home/users/sbusi/apps/mmseqs/bin" - createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb" - rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh" - convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis" - -# CRISPR -CASC: - PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin" - PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl" -minced: - PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/" - -# Plasmid prediction -plasflow: - threshold: 0.7 # class. prob. 
threshold - minlen: 1000 # rm contigs with length below this threshold - -# AMR prediction -rgi: - db_url: "https://card.mcmaster.ca/latest/data" - alignment_tool: "DIAMOND" # DIAMOND or BLAST + threads: 10 + +# p7zip: +# bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za" +# threads: 4 +# ont_fast5_api: +# single_to_multi_fast5: +# bin: "single_to_multi_fast5" +# batch: 8000 +# threads: 8 + +# minimap2: +# threads: 16 + +# igc: +# uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz" + +# hg38: +# uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz" + +# genomecov: +# bin: "bedtools genomecov" + +# compute_avg_coverage: +# bin: "scripts/coverage.awk" + +# bwa: +# threads: 24 +# long_reads_index: +# opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5" + +# samtools: +# sort: +# threads: 4 +# chunk_size: "4G" +# view: +# threads: 4 + +# flye: +# bin: "flye" +# threads: 27 +# genome_size: "1g" + +# operams: +# bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl" +# threads: 28 + +# megahit: +# threads: 28 + +# nonpareil: +# memory: 4096 +# threads: 14 + +# medaka: +# threads: 28 + +# racon: +# threads: 28 + +# rebaler: +# threads: 28 + +# diamond: +# threads: 28 +# #db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd" +# db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd" + +# metaspades: +# threads: 28 + +# mmseq2: +# threads: 24 + +# # Hybrid assembler +# hybrid_assembler: "metaspades_hybrid" + +# # Number of cpus or threads to use +# threads: 28 + +# # kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/" +# kraken2: +# db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/" + +# # Binning +# DAS_Tool: +# path: "/home/users/sbusi/apps/DAS_Tool-master" +# bin: 
"/home/users/sbusi/apps/DAS_Tool-master/src/" +# db: "/home/users/sbusi/apps/DAS_Tool-master/db/" +# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/" +# # Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/" +# # dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/" + +# # XXX +# GTDBTK: +# DATA: "/home/users/sbusi/apps/db/gtdbtk/release89" + +# # XXX +# mmseqs: +# path: "/home/users/sbusi/apps/mmseqs/bin" +# createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb" +# rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh" +# convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis" + +# # CRISPR +# CASC: +# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin" +# PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl" +# minced: +# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/" + +# # Plasmid prediction +# plasflow: +# threshold: 0.7 # class. prob. 
threshold +# minlen: 1000 # rm contigs with length below this threshold + +# # AMR prediction +# rgi: +# db_url: "https://card.mcmaster.ca/latest/data" +# alignment_tool: "DIAMOND" # DIAMOND or BLAST diff --git a/config/slurm.yaml b/config/slurm.yaml index 58e8afdb6096b74ce85308ea56a109413d9745bf..187dd61b789b6c3ea7b4fd9b6e0aa1455655400f 100644 --- a/config/slurm.yaml +++ b/config/slurm.yaml @@ -1,34 +1,65 @@ __default__: - time: "2-00:00:00" + time: "0-02:00:00" partition: "batch" qos: "qos-batch" nodes: 1 n: 1 - # ncpus: 1 - job-name: "ONT_pilot.{rule}" + explicit: "" + # job-name: "ONT_pilot.{rule}" # output: "slurm-%j.%N-%x.out" # error: "slurm-%j.%N-%x.err" - explicit: "" # mail-type: "end" +# Preprocessing guppy_gpu_basecalling: time: "01-00:00:00" partition: "gpu" qos: "qos-gpu" nodes: 1 n: 1 - # ncpus: 28 explicit: "--gres=gpu:1" -# "run_fastp_on_short_reads": -# { -# "time": "00-04:00:00", -# "n": 1, -# "ncpus": 3, -# "partition": "batch", -# "qos": "qos-batch", -# "mail-type": "ALL" -# }, +fastp_sr: + time: "00-04:00:00" + partition: "batch" + qos: "qos-batch" + nodes: 1 + n: 1 + explicit: "" + +# Assembly +assembly_lr_flye: + time: "01-00:00:00" + partition: "bigmem" + qos: "qos-bigmem" + nodes: 1 + n: 1 + explicit: "" + +assembly_sr_megahit: + time: "01-00:00:00" + partition: "bigmem" + qos: "qos-bigmem" + nodes: 1 + n: 1 + explicit: "" + +assembly_sr_metaspades: + time: "01-00:00:00" + partition: "bigmem" + qos: "qos-bigmem" + nodes: 1 + n: 1 + explicit: "" + +assemble_hy_metaspades: + time: "01-00:00:00" + partition: "bigmem" + qos: "qos-bigmem" + nodes: 1 + n: 1 + explicit: "" + # "mmseq2_compare": # { # "n": 1, diff --git a/sbatch.sh b/sbatch.sh index 216dc1ac3a028b2b5727af3f7ca7a86eba4328a1..1da03bab7be04e698516afcff953de51b3df24b7 100755 --- a/sbatch.sh +++ b/sbatch.sh @@ -18,7 +18,7 @@ # conda env name ONTP_ENV="ONT_pilot" # number of cores for snakemake -ONTP_CORES=30 +ONTP_CORES=10 # IMP config file ONTP_CONFIG="config/config.yaml" # USER INPUT 
REQUIRED # slurm config file diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk new file mode 100644 index 0000000000000000000000000000000000000000..df2ef57568fc7a955667d40d580f68dcc4fb5c5b --- /dev/null +++ b/workflow/rules/assembly.smk @@ -0,0 +1,80 @@ +# Assembly + +# Long reads +rule assembly_lr_flye: + input: + os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz") + output: + os.path.join(RESULTS_DIR, "assembly/lr/flye/assembly.fna") + threads: + config["flye"]["threads"] + log: + out="logs/assembly_lr.flye.out.log", + err="logs/assembly_lr.flye.err.log" + conda: + "../envs/flye_v2_7.yaml" + message: + "Assembly: long reads: Flye" + shell: + "(date && flye --nano-raw {input} --meta --out-dir $(dirname {output}) --genome-size {config[flye][genome_size]} --threads {threads} && date) 2> {log.err} > {log.out} && " + "cd $(dirname {output}) && ln -sf assembly.fasta $(basename {output})" + +# Short reads +rule assembly_sr_megahit: + input: + r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz") + output: + os.path.join(RESULTS_DIR, "assembly/sr/megahit/assembly.fna") + log: + out="logs/assembly_sr.megahit.out.log", + err="logs/assembly_sr.megahit.err.log" + threads: + config["megahit"]["threads"] + conda: + os.path.join(ENV_DIR, "megahit.yaml") + message: + "Assembly: short reads: MEGAHIT" + shell: + "(date && megahit -1 {input.r1} -2 {input.r2} -t {threads} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && " + "cd $(dirname {output}) && ln -sf final.contigs.fa $(basename {output})" + +rule assembly_sr_metaspades: + input: + r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz") + output: + os.path.join(RESULTS_DIR, "assembly/sr/metaspades/assembly.fna") + log: + out="logs/assembly_sr.metaspades.out.log", + err="logs/assembly_sr.metaspades.err.log" + threads: + config["metaspades"]["threads"] + 
conda: + os.path.join(ENV_DIR, "spades.yaml") + message: + "Assembly: short reads: MetaSPAdes" + shell: + "(date && metaspades.py -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && " + "cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})" + +# Hybrid +rule assemble_hy_metaspades: + input: + lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"), + r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz") + output: + os.path.join(RESULTS_DIR, "assembly/hy/metaspadeshybrid/assembly.fna") + log: + out="logs/assembly_hy.metaspades.out.log", + err="logs/assembly_hy.metaspades.err.log" + threads: + config["metaspades"]["threads"] + conda: + os.path.join(ENV_DIR, "spades.yaml") + message: + "Assembly: hybrid: MetaSPAdes" + shell: + "(date && spades.py --meta -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} --nanopore {input.lr} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && " + "cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})" diff --git a/workflow/rules/preprocessing_lr.smk b/workflow/rules/preprocessing_lr.smk index 04cebe6af50269a5a45ef24ee278ec8c8781257f..7f0af114b60780ee7faa4bfa7587effca350cd07 100644 --- a/workflow/rules/preprocessing_lr.smk +++ b/workflow/rules/preprocessing_lr.smk @@ -1,24 +1,23 @@ +# Preprocessing of long reads + # Basecalling -# Guppy checkpoint guppy_gpu_basecalling: input: DATA_FAST5 output: - directory(os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints")) + directory(os.path.join(RESULTS_DIR, "preproc/lr/checkpoints")) log: - out="logs/basecalling.{guppy_config}.out.log", - err="logs/basecalling.{guppy_config}.err.log" - wildcard_constraints: - guppy_config="|".join(config["guppy"]["config"].keys()) + out="logs/preproc_lr.guppy.out.log", + err="logs/preproc_lr.guppy.err.log" threads: config["guppy"]["gpu"]["threads"] - params: - config=lambda 
wildcards: config["guppy"]["config"][wildcards.guppy_config] + message: + "Preprocessing long reads: Basecalling w/ Guppy" shell: """ (date && \ {config[guppy][gpu][bin]} --input_path $(dirname {input[0]}) --save_path $(dirname {output}) \ - --config {params.config} \ + --config {config[guppy][config]} \ --disable_pings --compress_fastq \ --cpu_threads_per_caller {threads} \ -x {config[guppy][gpu][gpu_device]} \ @@ -33,8 +32,7 @@ checkpoint guppy_gpu_basecalling: def aggregate_guppy_basecalling(wildcards): checkpoint_output = checkpoints.guppy_gpu_basecalling.get(**wildcards).output[0] return expand( - os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"), - guppy_config=wildcards.guppy_config, + os.path.join(RESULTS_DIR, "preproc/lr/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"), runid_i_j=glob_wildcards(os.path.join(checkpoint_output, "fastq_runid_{runid_i_j}.fastq.gz")).runid_i_j, ) @@ -42,22 +40,24 @@ rule merge_guppy_basecalling: input: aggregate_guppy_basecalling output: - os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz") - wildcard_constraints: - guppy_config="|".join(config["guppy"]["config"].keys()) + os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz") + message: + "Preprocessing long reads: Cat FASTQ" shell: "cat $(echo \"{input}\" | sort) > {output}" # QC -rule nanostat: +rule nanostat_guppy_basecalling: input: - os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz") + os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz") output: - os.path.join(RESULTS_DIR, "qc/lr/{guppy_config}/NanoStats.txt") + os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt") log: - out="logs/nanostats.{guppy_config}.out.log", - err="logs/nanostats.{guppy_config}.err.log" + out="logs/preproc_lr.nanostats.out.log", + err="logs/preproc_lr.nanostats.err.log" conda: os.path.join(ENV_DIR, "nanostat.yaml") + message: + "Preprocessing long reads: NanoStats" shell: "(date && NanoStat --fastq {input} --outdir $(dirname 
{output}) -n $(basename {output}) && date) 2> {log.err} > {log.out}" diff --git a/workflow/rules/preprocessing_sr.smk b/workflow/rules/preprocessing_sr.smk new file mode 100644 index 0000000000000000000000000000000000000000..50ed0b9ee105056b32b0da44ca107e7a7273ee99 --- /dev/null +++ b/workflow/rules/preprocessing_sr.smk @@ -0,0 +1,43 @@ +# Preprocessing of short reads + +# Preprocess the short reads using fastp +rule fastp_sr: + input: + r1=DATA_SR["r1"], + r2=DATA_SR["r2"] + output: + r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz"), + html=os.path.join(RESULTS_DIR, "preproc/sr/fastp.html"), + json=os.path.join(RESULTS_DIR, "preproc/sr/fastp.json") + log: + out="logs/preproc_sr.fastp.out.log", + err="logs/preproc_sr.fastp.err.log" + threads: + config["fastp"]["threads"] + conda: + os.path.join(ENV_DIR, "fastp.yaml") + message: + "Preprocessing short reads: FastP" + shell: + "(date && fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} -w {threads} && date) 2> {log.err} > {log.out}" + +rule fastqc_fastp_sr: + input: + os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz") + output: + html=os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_fastqc.html"), + zip=os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_fastqc.zip") + log: + out="logs/preproc_sr.fastqc.{rid}.out.log", + err="logs/preproc_sr.fastqc.{rid}.err.log" + wildcard_constraints: + rid="|".join(["R1", "R2"]) + threads: + config["fastqc"]["threads"] + conda: + os.path.join(ENV_DIR, "fastqc.yaml") + message: + "Preprocessing short reads: FastQC" + shell: + "(date && fastqc {config[fastqc][params]} -t {threads} -o $(dirname {output.html}) {input} && date) 2> {log.err} > {log.out}" diff --git a/workflow/steps/assembly.smk b/workflow/steps/assembly.smk new file mode 100644 index 0000000000000000000000000000000000000000..1e7ead6b690f82e352a125e623d9236dca2dbd05 --- 
/dev/null +++ b/workflow/steps/assembly.smk @@ -0,0 +1,17 @@ +# Assembly + +include: + '../rules/assembly.smk' + +# NOTE: Using "shell: touch ..." to avoid the rule from being autodetected as `localrule`. +# This is needed so that an email can be sent upon event changes for this rule. + +rule ASSEMBLY: + input: + lr=expand(os.path.join(RESULTS_DIR, "assembly/lr/{tool}/assembly.fna"), tool=config["assemblers"]["lr"]), + sr=expand(os.path.join(RESULTS_DIR, "assembly/sr/{tool}/assembly.fna"), tool=config["assemblers"]["sr"]), + hy=expand(os.path.join(RESULTS_DIR, "assembly/hy/{tool}/assembly.fna"), tool=config["assemblers"]["hy"]) + output: + "status/assembly.done" + shell: + "touch {output}" \ No newline at end of file diff --git a/workflow/steps/preprocessing.smk b/workflow/steps/preprocessing.smk index f4d1ba9ee2373dc739f489a11d0abd7c13664a72..5515086b1607cee165e6183f319e3141943bede2 100644 --- a/workflow/steps/preprocessing.smk +++ b/workflow/steps/preprocessing.smk @@ -1,7 +1,8 @@ # Preprocessing reads before de novo assembly include: - '../rules/preprocessing_lr.smk', + '../rules/preprocessing_lr.smk' +include: '../rules/preprocessing_sr.smk' # NOTE: Using "shell: touch ..." to avoid the rule from being autodetected as `localrule`. 
@@ -9,9 +10,18 @@ include: rule PREPROCESSING_LR: input: - basecalling=expand(os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz"), guppy_config=config["guppy"]["config"].keys()), - nanostats=expand(os.path.join(RESULTS_DIR, "qc/lr/{guppy_config}/NanoStats.txt"), guppy_config=config["guppy"]["config"].keys()) + basecalling=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"), + nanostats=os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt") output: "status/preprocessing_lr.done" shell: "touch {output}" + +rule PREPROCESSING_SR: + input: + fastp=expand(os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz"), rid=["R1", "R2"]), + fastqc=expand(os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_{ext}"), rid=["R1", "R2"], ext=["fastqc.html", "fastqc.zip"]) + output: + "status/preprocessing_sr.done" + shell: + "touch {output}"