Skip to content
Snippets Groups Projects
Commit 6cbf5ed6 authored by Valentina Galata's avatar Valentina Galata
Browse files

init setup for preprocessing and assembly

parent ab685564
No related branches found
No related tags found
1 merge request!76Merge "cleanup" branch with "master" branch
......@@ -39,7 +39,7 @@ INPUT_FAST5 = []
INPUT_FAST5 += [os.path.abspath(f) for f in config["data"]["ont"]["files"]]
for ont_d in config["data"]["ont"]["dirs"]:
INPUT_FAST5 += [os.path.abspath(f) for f in Path(ont_d).rglob('*.fast5')]
INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
# INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
# SR files
INPUT_SR = config["data"]["sr"]
# MetaT files
......@@ -49,19 +49,14 @@ INPUT_METAT = config["data"]["metat"]
DATA_FAST5 = [os.path.join(RESULTS_DIR, "input_ont", os.path.basename(f)) for f in INPUT_FAST5]
assert len(DATA_FAST5) == len(set(DATA_FAST5)), "Created link names for FAST5 files are NOT unique: {}".format(DATA_FAST5)
DATA_SR = {
"r1": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r1"])),
"r2": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r2"]))
"r1": os.path.join(RESULTS_DIR, "input_sr/R1.fq.gz"),
"r2": os.path.join(RESULTS_DIR, "input_sr/R2.fq.gz")
}
DATA_METAT = {
"r1": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r1"])),
"r2": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r2"]))
"r1": os.path.join(RESULTS_DIR, "input_metat/R1.fq.gz"),
"r2": os.path.join(RESULTS_DIR, "input_metat/R2.fq.gz")
}
# # Tools
# ASSEMBLERS = config["assemblers"]
# HYBRID_ASSEMBLER = config["hybrid_assembler"]
# MAPPERS = ["bwa", "mmi"]
##############################
# TARGETS & RULES
# List of targets to be created
......@@ -74,11 +69,22 @@ include:
"workflow/steps/prepare_input.smk"
# TARGETS.append("status/prepare_input.done")
# Basecalling
# Preprocessing
if "preprocessing" in STEPS:
include:
"workflow/steps/preprocessing.smk"
TARGETS.append("status/preprocessing_lr.done")
TARGETS += [
"status/preprocessing_lr.done",
"status/preprocessing_sr.done"
]
# Assembly
if "assembly" in STEPS:
include:
"workflow/steps/assembly.smk"
TARGETS += [
"status/assembly.done"
]
# # Assembly annotation
# if 'assembly_annotation' in STEPS:
......
......@@ -3,7 +3,7 @@
# Pipeline steps
# steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"]
steps: ["preprocessing"]
steps: ["preprocessing", "assembly"]
# Analysis sub-steps
analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
......@@ -15,7 +15,7 @@ analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
# work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
work_dir: "/scratch/users/vgalata/ont_pilot"
# paths within the working directory
# Paths WITHIN the working directory
# directory containing required DBs
db_dir: "dbs"
# results directory
......@@ -38,16 +38,14 @@ data:
# List of FAST5 files
files: []
binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
# binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
############################################################
# TOOLS
# Basecalling
# Preprocessing: LR: Basecalling
guppy:
config:
methylation_aware: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
# not_methylation_aware: "dna_r9.4.1_450bps_hac.cfg"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
# cpu:
# path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
# bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
......@@ -63,8 +61,7 @@ guppy:
num_callers: 4
runners_per_device: 2
gpu_device: "cuda:0"
# threads: 28
threads: 5
threads: 20
# barcoder:
# path: "/home/users/sbusi/apps/ont-guppy/bin"
# bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
......@@ -72,132 +69,147 @@ guppy:
# records_per_fastq: 8000
# threads: 8
# assemblers: ["flye"]
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye"]
hy: ["metaspadeshybrid"]
p7zip:
bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
threads: 4
ont_fast5_api:
single_to_multi_fast5:
bin: "single_to_multi_fast5"
batch: 8000
threads: 8
nanostats:
# Preprocessing: SR
fastp:
threads: 10
min_length: 40
minimap2:
threads: 16
igc:
uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
hg38:
uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
genomecov:
bin: "bedtools genomecov"
compute_avg_coverage:
bin: "scripts/coverage.awk"
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
# QC
fastqc:
threads: 10
params: "-q -f fastq"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
# Assembly
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye"]
hy: ["metaspadeshybrid"]
flye:
bin: "flye"
threads: 27
threads: 10
genome_size: "1g"
operams:
bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
threads: 28
metaspades:
threads: 10
megahit:
threads: 28
nonpareil:
memory: 4096
threads: 14
medaka:
threads: 28
racon:
threads: 28
rebaler:
threads: 28
diamond:
threads: 28
#db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
metaspades:
threads: 28
mmseq2:
threads: 24
# Hybrid assembler
hybrid_assembler: "metaspades_hybrid"
# Number of cpus or threads to use
threads: 28
# kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
kraken2:
db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# Binning
DAS_Tool:
path: "/home/users/sbusi/apps/DAS_Tool-master"
bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# XXX
GTDBTK:
DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# XXX
mmseqs:
path: "/home/users/sbusi/apps/mmseqs/bin"
createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
# CRISPR
CASC:
PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
minced:
PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
# Plasmid prediction
plasflow:
threshold: 0.7 # class. prob. threshold
minlen: 1000 # rm contigs with length below this threshold
# AMR prediction
rgi:
db_url: "https://card.mcmaster.ca/latest/data"
alignment_tool: "DIAMOND" # DIAMOND or BLAST
threads: 10
# p7zip:
# bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
# threads: 4
# ont_fast5_api:
# single_to_multi_fast5:
# bin: "single_to_multi_fast5"
# batch: 8000
# threads: 8
# minimap2:
# threads: 16
# igc:
# uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
# hg38:
# uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
# genomecov:
# bin: "bedtools genomecov"
# compute_avg_coverage:
# bin: "scripts/coverage.awk"
# bwa:
# threads: 24
# long_reads_index:
# opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
# samtools:
# sort:
# threads: 4
# chunk_size: "4G"
# view:
# threads: 4
# flye:
# bin: "flye"
# threads: 27
# genome_size: "1g"
# operams:
# bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
# threads: 28
# megahit:
# threads: 28
# nonpareil:
# memory: 4096
# threads: 14
# medaka:
# threads: 28
# racon:
# threads: 28
# rebaler:
# threads: 28
# diamond:
# threads: 28
# #db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
# db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
# metaspades:
# threads: 28
# mmseq2:
# threads: 24
# # Hybrid assembler
# hybrid_assembler: "metaspades_hybrid"
# # Number of cpus or threads to use
# threads: 28
# # kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# kraken2:
# db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# # Binning
# DAS_Tool:
# path: "/home/users/sbusi/apps/DAS_Tool-master"
# bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
# db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# # XXX
# GTDBTK:
# DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# # XXX
# mmseqs:
# path: "/home/users/sbusi/apps/mmseqs/bin"
# createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
# rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
# convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
# # CRISPR
# CASC:
# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
# PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
# minced:
# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
# # Plasmid prediction
# plasflow:
# threshold: 0.7 # class. prob. threshold
# minlen: 1000 # rm contigs with length below this threshold
# # AMR prediction
# rgi:
# db_url: "https://card.mcmaster.ca/latest/data"
# alignment_tool: "DIAMOND" # DIAMOND or BLAST
__default__:
time: "2-00:00:00"
time: "0-02:00:00"
partition: "batch"
qos: "qos-batch"
nodes: 1
n: 1
# ncpus: 1
job-name: "ONT_pilot.{rule}"
explicit: ""
# job-name: "ONT_pilot.{rule}"
# output: "slurm-%j.%N-%x.out"
# error: "slurm-%j.%N-%x.err"
explicit: ""
# mail-type: "end"
# Preprocessing
guppy_gpu_basecalling:
time: "01-00:00:00"
partition: "gpu"
qos: "qos-gpu"
nodes: 1
n: 1
# ncpus: 28
explicit: "--gres=gpu:1"
# "run_fastp_on_short_reads":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 3,
# "partition": "batch",
# "qos": "qos-batch",
# "mail-type": "ALL"
# },
fastp_sr:
time: "00-04:00:00"
partition: "batch"
qos: "qos-batch"
nodes: 1
n: 1
explicit: ""
# Assembly
assembly_lr_flye:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
assembly_sr_megahit:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
assembly_sr_metaspades:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
assemble_hy_metaspades:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
# "mmseq2_compare":
# {
# "n": 1,
......
......@@ -18,7 +18,7 @@
# conda env name
ONTP_ENV="ONT_pilot"
# number of cores for snakemake
ONTP_CORES=30
ONTP_CORES=10
# IMP config file
ONTP_CONFIG="config/config.yaml" # USER INPUT REQUIRED
# slurm config file
......
# Assembly
# Long reads
rule assembly_lr_flye:
input:
os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/lr/flye/assembly.fna")
threads:
config["flye"]["threads"]
log:
out="logs/assembly_lr.flye.out.log",
err="logs/assembly_lr.flye.err.log"
conda:
"../envs/flye_v2_7.yaml"
message:
"Assembly: long reads: Flye"
shell:
"(date && flye --nano-raw {input} --meta --out-dir $(dirname {output}) --genome-size {config[flye][genome_size]} --threads {threads} && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf assembly.fasta $(basename {output})"
# Short reads
rule assembly_sr_megahit:
input:
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/sr/megahit/assembly.fna")
log:
out="logs/assembly_sr.megahit.out.log",
err="logs/assembly_sr.megahit.err.log"
threads:
config["megahit"]["threads"]
conda:
os.path.join(ENV_DIR, "megahit.yaml")
message:
"Assembly: short reads: MEGAHIT"
shell:
"(date && megahit -1 {input.r1} -2 {input.r2} -t {threads} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf final.contigs.fa $(basename {output})"
rule assembly_sr_metaspades:
input:
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/sr/metaspades/assembly.fna")
log:
out="logs/assembly_sr.metaspades.out.log",
err="logs/assembly_sr.metaspades.err.log"
threads:
config["metaspades"]["threads"]
conda:
os.path.join(ENV_DIR, "spades.yaml")
message:
"Assembly: short reads: MetaSPAdes"
shell:
"(date && metaspades.py -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
# Hybrid
rule assemble_hy_metaspades:
input:
lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"),
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/hy/metaspadeshybrid/assembly.fna")
log:
out="logs/assembly_hy.metaspades.out.log",
err="logs/assembly_hy.metaspades.err.log"
threads:
config["metaspades"]["threads"]
conda:
os.path.join(ENV_DIR, "spades.yaml")
message:
"Assembly: hybrid: MetaSPAdes"
shell:
"(date && spades.py --meta -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} --nanopore {input.lr} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
# Preprocessing of long reads
# Basecalling
# Guppy
checkpoint guppy_gpu_basecalling:
input:
DATA_FAST5
output:
directory(os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints"))
directory(os.path.join(RESULTS_DIR, "preproc/lr/checkpoints"))
log:
out="logs/basecalling.{guppy_config}.out.log",
err="logs/basecalling.{guppy_config}.err.log"
wildcard_constraints:
guppy_config="|".join(config["guppy"]["config"].keys())
out="logs/preproc_lr.guppy.out.log",
err="logs/preproc_lr.guppy.err.log"
threads:
config["guppy"]["gpu"]["threads"]
params:
config=lambda wildcards: config["guppy"]["config"][wildcards.guppy_config]
message:
"Preprocessing long reads: Basecalling w/ Guppy"
shell:
"""
(date && \
{config[guppy][gpu][bin]} --input_path $(dirname {input[0]}) --save_path $(dirname {output}) \
--config {params.config} \
--config config[guppy][config] \
--disable_pings --compress_fastq \
--cpu_threads_per_caller {threads} \
-x {config[guppy][gpu][gpu_device]} \
......@@ -33,8 +32,7 @@ checkpoint guppy_gpu_basecalling:
def aggregate_guppy_basecalling(wildcards):
checkpoint_output = checkpoints.guppy_gpu_basecalling.get(**wildcards).output[0]
return expand(
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"),
guppy_config=wildcards.guppy_config,
os.path.join(RESULTS_DIR, "preproc/lr/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"),
runid_i_j=glob_wildcards(os.path.join(checkpoint_output, "fastq_runid_{runid_i_j}.fastq.gz")).runid_i_j,
)
......@@ -42,22 +40,24 @@ rule merge_guppy_basecalling:
input:
aggregate_guppy_basecalling
output:
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz")
wildcard_constraints:
guppy_config="|".join(config["guppy"]["config"].keys())
os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
message:
"Preprocessing long reads: Cat FASTQ"
shell:
"cat $(echo \"{input}\" | sort) > {output}"
# QC
rule nanostat:
rule nanostat_guppy_basecalling:
input:
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz")
os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
output:
os.path.join(RESULTS_DIR, "qc/lr/{guppy_config}/NanoStats.txt")
os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt")
log:
out="logs/nanostats.{guppy_config}.out.log",
err="logs/nanostats.{guppy_config}.err.log"
out="logs/preproc_lr.nanostats.out.log",
err="logs/preproc_lr.nanostats.err.log"
conda:
os.path.join(ENV_DIR, "nanostat.yaml")
message:
"Preprocessing long reads: NanoStats"
shell:
"(date && NanoStat --fastq {input} --outdir $(dirname {output}) -n $(basename {output}) && date) 2> {log.err} > {log.out}"
# Preprocessing of short reads
# Preprocess the short reads using fastp
rule fastp_sr:
input:
r1=DATA_SR["r1"],
r2=DATA_SR["r2"]
output:
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz"),
html=os.path.join(RESULTS_DIR, "preproc/sr/fastp.html"),
json=os.path.join(RESULTS_DIR, "preproc/sr/fastp.json")
log:
out="logs/preproc_sr.fastp.out.log",
err="logs/preproc_sr.fastp.err.log"
threads:
config["fastp"]["threads"]
conda:
os.path.join(ENV_DIR, "fastp.yaml")
message:
"Preprocessing short reads: FastP"
shell:
"(date && fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} -w {threads} && date) 2> {log.err} > {log.out}"
rule fastqc_fastp_sr:
input:
os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz")
output:
html=os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_fastqc.html"),
zip=os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_fastqc.zip")
log:
out="logs/preproc_sr.fastqc.{rid}.out.log",
err="logs/preproc_sr.fastqc.{rid}.err.log"
wildcard_constraints:
rid="|".join(["R1", "R2"])
threads:
config["fastqc"]["threads"]
conda:
os.path.join(ENV_DIR, "fastqc.yaml")
message:
"Preprocessing short reads: FastQC"
shell:
"(date && fastqc {config[fastqc][params]} -t {threads} -o $(dirname {output.html}) {input} && date) 2> {log.err} > {log.out}"
# Assembly
include:
'../rules/assembly.smk'
# NOTE: Using "shell: touch ..." to avoid the rule from being autodetected as `localrule`.
# This is needed so that an email can be sent upon event changes for this rule.
rule ASSEMBLY:
input:
lr=expand(os.path.join(RESULTS_DIR, "assembly/lr/{tool}/assembly.fna"), tool=config["assemblers"]["lr"]),
sr=expand(os.path.join(RESULTS_DIR, "assembly/sr/{tool}/assembly.fna"), tool=config["assemblers"]["sr"]),
hy=expand(os.path.join(RESULTS_DIR, "assembly/hy/{tool}/assembly.fna"), tool=config["assemblers"]["hy"])
output:
"status/assembly.done"
shell:
"touch {output}"
\ No newline at end of file
# Preprocessing reads before de novo assembly
include:
'../rules/preprocessing_lr.smk',
'../rules/preprocessing_lr.smk'
include:
'../rules/preprocessing_sr.smk'
# NOTE: Using "shell: touch ..." to avoid the rule from being autodetected as `localrule`.
......@@ -9,9 +10,18 @@ include:
rule PREPROCESSING_LR:
input:
basecalling=expand(os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz"), guppy_config=config["guppy"]["config"].keys()),
nanostats=expand(os.path.join(RESULTS_DIR, "qc/lr/{guppy_config}/NanoStats.txt"), guppy_config=config["guppy"]["config"].keys())
basecalling=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"),
nanostats=os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt")
output:
"status/preprocessing_lr.done"
shell:
"touch {output}"
rule PREPROCESSING_SR:
input:
fastp=expand(os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz"), rid=["R1", "R2"]),
fastqc=expand(os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_{ext}"), rid=["R1", "R2"], ext=["fastqc.html", "fastqc.zip"])
output:
"status/preprocessing_sr.done"
shell:
"touch {output}"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment