Skip to content
Snippets Groups Projects
Commit 6cbf5ed6 authored by Valentina Galata
Browse files

init setup for preprocessing and assembly

parent ab685564
No related branches found
No related tags found
1 merge request: !76 Merge "cleanup" branch with "master" branch
...@@ -39,7 +39,7 @@ INPUT_FAST5 = [] ...@@ -39,7 +39,7 @@ INPUT_FAST5 = []
INPUT_FAST5 += [os.path.abspath(f) for f in config["data"]["ont"]["files"]] INPUT_FAST5 += [os.path.abspath(f) for f in config["data"]["ont"]["files"]]
for ont_d in config["data"]["ont"]["dirs"]: for ont_d in config["data"]["ont"]["dirs"]:
INPUT_FAST5 += [os.path.abspath(f) for f in Path(ont_d).rglob('*.fast5')] INPUT_FAST5 += [os.path.abspath(f) for f in Path(ont_d).rglob('*.fast5')]
INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST # INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
# SR files # SR files
INPUT_SR = config["data"]["sr"] INPUT_SR = config["data"]["sr"]
# MetaT files # MetaT files
...@@ -49,19 +49,14 @@ INPUT_METAT = config["data"]["metat"] ...@@ -49,19 +49,14 @@ INPUT_METAT = config["data"]["metat"]
DATA_FAST5 = [os.path.join(RESULTS_DIR, "input_ont", os.path.basename(f)) for f in INPUT_FAST5] DATA_FAST5 = [os.path.join(RESULTS_DIR, "input_ont", os.path.basename(f)) for f in INPUT_FAST5]
assert len(DATA_FAST5) == len(set(DATA_FAST5)), "Created link names for FAST5 files are NOT unique: {}".format(DATA_FAST5) assert len(DATA_FAST5) == len(set(DATA_FAST5)), "Created link names for FAST5 files are NOT unique: {}".format(DATA_FAST5)
DATA_SR = { DATA_SR = {
"r1": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r1"])), "r1": os.path.join(RESULTS_DIR, "input_sr/R1.fq.gz"),
"r2": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r2"])) "r2": os.path.join(RESULTS_DIR, "input_sr/R2.fq.gz")
} }
DATA_METAT = { DATA_METAT = {
"r1": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r1"])), "r1": os.path.join(RESULTS_DIR, "input_metat/R1.fq.gz"),
"r2": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r2"])) "r2": os.path.join(RESULTS_DIR, "input_metat/R2.fq.gz")
} }
# # Tools
# ASSEMBLERS = config["assemblers"]
# HYBRID_ASSEMBLER = config["hybrid_assembler"]
# MAPPERS = ["bwa", "mmi"]
############################## ##############################
# TARGETS & RULES # TARGETS & RULES
# List of targets to be created # List of targets to be created
...@@ -74,11 +69,22 @@ include: ...@@ -74,11 +69,22 @@ include:
"workflow/steps/prepare_input.smk" "workflow/steps/prepare_input.smk"
# TARGETS.append("status/prepare_input.done") # TARGETS.append("status/prepare_input.done")
# Basecalling # Preprocessing
if "preprocessing" in STEPS: if "preprocessing" in STEPS:
include: include:
"workflow/steps/preprocessing.smk" "workflow/steps/preprocessing.smk"
TARGETS.append("status/preprocessing_lr.done") TARGETS += [
"status/preprocessing_lr.done",
"status/preprocessing_sr.done"
]
# Assembly
if "assembly" in STEPS:
include:
"workflow/steps/assembly.smk"
TARGETS += [
"status/assembly.done"
]
# # Assembly annotation # # Assembly annotation
# if 'assembly_annotation' in STEPS: # if 'assembly_annotation' in STEPS:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Pipeline steps # Pipeline steps
# steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"] # steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"]
steps: ["preprocessing"] steps: ["preprocessing", "assembly"]
# Analysis sub-steps # Analysis sub-steps
analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"] analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
...@@ -15,7 +15,7 @@ analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"] ...@@ -15,7 +15,7 @@ analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
# work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB" # work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
work_dir: "/scratch/users/vgalata/ont_pilot" work_dir: "/scratch/users/vgalata/ont_pilot"
# paths within the working directory # Paths WITHIN the working directory
# directory containing required DBs # directory containing required DBs
db_dir: "dbs" db_dir: "dbs"
# results directory # results directory
...@@ -38,16 +38,14 @@ data: ...@@ -38,16 +38,14 @@ data:
# List of FAST5 files # List of FAST5 files
files: [] files: []
binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"] # binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
############################################################ ############################################################
# TOOLS # TOOLS
# Basecalling # Preprocessing: LR: Basecalling
guppy: guppy:
config: config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
methylation_aware: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
# not_methylation_aware: "dna_r9.4.1_450bps_hac.cfg"
# cpu: # cpu:
# path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin" # path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
# bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller" # bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
...@@ -63,8 +61,7 @@ guppy: ...@@ -63,8 +61,7 @@ guppy:
num_callers: 4 num_callers: 4
runners_per_device: 2 runners_per_device: 2
gpu_device: "cuda:0" gpu_device: "cuda:0"
# threads: 28 threads: 20
threads: 5
# barcoder: # barcoder:
# path: "/home/users/sbusi/apps/ont-guppy/bin" # path: "/home/users/sbusi/apps/ont-guppy/bin"
# bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder" # bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
...@@ -72,132 +69,147 @@ guppy: ...@@ -72,132 +69,147 @@ guppy:
# records_per_fastq: 8000 # records_per_fastq: 8000
# threads: 8 # threads: 8
# assemblers: ["flye"] # Preprocessing: SR
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye"]
hy: ["metaspadeshybrid"]
p7zip:
bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
threads: 4
ont_fast5_api:
single_to_multi_fast5:
bin: "single_to_multi_fast5"
batch: 8000
threads: 8
nanostats:
fastp: fastp:
threads: 10
min_length: 40 min_length: 40
minimap2: # QC
threads: 16 fastqc:
threads: 10
igc: params: "-q -f fastq"
uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
hg38:
uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
genomecov:
bin: "bedtools genomecov"
compute_avg_coverage:
bin: "scripts/coverage.awk"
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
samtools: # Assembly
sort: assemblers:
threads: 4 sr: ["megahit", "metaspades"]
chunk_size: "4G" lr: ["flye"]
view: hy: ["metaspadeshybrid"]
threads: 4
flye: flye:
bin: "flye" threads: 10
threads: 27
genome_size: "1g" genome_size: "1g"
operams: metaspades:
bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl" threads: 10
threads: 28
megahit: megahit:
threads: 28 threads: 10
nonpareil: # p7zip:
memory: 4096 # bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
threads: 14 # threads: 4
# ont_fast5_api:
medaka: # single_to_multi_fast5:
threads: 28 # bin: "single_to_multi_fast5"
# batch: 8000
racon: # threads: 8
threads: 28
# minimap2:
rebaler: # threads: 16
threads: 28
# igc:
diamond: # uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
threads: 28
#db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd" # hg38:
db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd" # uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
metaspades: # genomecov:
threads: 28 # bin: "bedtools genomecov"
mmseq2: # compute_avg_coverage:
threads: 24 # bin: "scripts/coverage.awk"
# Hybrid assembler # bwa:
hybrid_assembler: "metaspades_hybrid" # threads: 24
# long_reads_index:
# Number of cpus or threads to use # opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
threads: 28
# samtools:
# kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/" # sort:
kraken2: # threads: 4
db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/" # chunk_size: "4G"
# view:
# Binning # threads: 4
DAS_Tool:
path: "/home/users/sbusi/apps/DAS_Tool-master" # flye:
bin: "/home/users/sbusi/apps/DAS_Tool-master/src/" # bin: "flye"
db: "/home/users/sbusi/apps/DAS_Tool-master/db/" # threads: 27
Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/" # genome_size: "1g"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/" # operams:
# bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
# XXX # threads: 28
GTDBTK:
DATA: "/home/users/sbusi/apps/db/gtdbtk/release89" # megahit:
# threads: 28
# XXX
mmseqs: # nonpareil:
path: "/home/users/sbusi/apps/mmseqs/bin" # memory: 4096
createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb" # threads: 14
rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis" # medaka:
# threads: 28
# CRISPR
CASC: # racon:
PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin" # threads: 28
PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
minced: # rebaler:
PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/" # threads: 28
# Plasmid prediction # diamond:
plasflow: # threads: 28
threshold: 0.7 # class. prob. threshold # #db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
minlen: 1000 # rm contigs with length below this threshold # db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
# AMR prediction # metaspades:
rgi: # threads: 28
db_url: "https://card.mcmaster.ca/latest/data"
alignment_tool: "DIAMOND" # DIAMOND or BLAST # mmseq2:
# threads: 24
# # Hybrid assembler
# hybrid_assembler: "metaspades_hybrid"
# # Number of cpus or threads to use
# threads: 28
# # kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# kraken2:
# db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# # Binning
# DAS_Tool:
# path: "/home/users/sbusi/apps/DAS_Tool-master"
# bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
# db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# # XXX
# GTDBTK:
# DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# # XXX
# mmseqs:
# path: "/home/users/sbusi/apps/mmseqs/bin"
# createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
# rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
# convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
# # CRISPR
# CASC:
# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
# PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
# minced:
# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
# # Plasmid prediction
# plasflow:
# threshold: 0.7 # class. prob. threshold
# minlen: 1000 # rm contigs with length below this threshold
# # AMR prediction
# rgi:
# db_url: "https://card.mcmaster.ca/latest/data"
# alignment_tool: "DIAMOND" # DIAMOND or BLAST
__default__: __default__:
time: "2-00:00:00" time: "0-02:00:00"
partition: "batch" partition: "batch"
qos: "qos-batch" qos: "qos-batch"
nodes: 1 nodes: 1
n: 1 n: 1
# ncpus: 1 explicit: ""
job-name: "ONT_pilot.{rule}" # job-name: "ONT_pilot.{rule}"
# output: "slurm-%j.%N-%x.out" # output: "slurm-%j.%N-%x.out"
# error: "slurm-%j.%N-%x.err" # error: "slurm-%j.%N-%x.err"
explicit: ""
# mail-type: "end" # mail-type: "end"
# Preprocessing
guppy_gpu_basecalling: guppy_gpu_basecalling:
time: "01-00:00:00" time: "01-00:00:00"
partition: "gpu" partition: "gpu"
qos: "qos-gpu" qos: "qos-gpu"
nodes: 1 nodes: 1
n: 1 n: 1
# ncpus: 28
explicit: "--gres=gpu:1" explicit: "--gres=gpu:1"
# "run_fastp_on_short_reads": fastp_sr:
# { time: "00-04:00:00"
# "time": "00-04:00:00", partition: "batch"
# "n": 1, qos: "qos-batch"
# "ncpus": 3, nodes: 1
# "partition": "batch", n: 1
# "qos": "qos-batch", explicit: ""
# "mail-type": "ALL"
# }, # Assembly
assembly_lr_flye:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
assembly_sr_megahit:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
assembly_sr_metaspades:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
assemble_hy_metaspades:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
# "mmseq2_compare": # "mmseq2_compare":
# { # {
# "n": 1, # "n": 1,
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
# conda env name # conda env name
ONTP_ENV="ONT_pilot" ONTP_ENV="ONT_pilot"
# number of cores for snakemake # number of cores for snakemake
ONTP_CORES=30 ONTP_CORES=10
# IMP config file # IMP config file
ONTP_CONFIG="config/config.yaml" # USER INPUT REQUIRED ONTP_CONFIG="config/config.yaml" # USER INPUT REQUIRED
# slurm config file # slurm config file
......
# Assembly
# Long reads
# Long-read metagenome assembly with Flye (--meta mode, raw nanopore reads).
# Consumes the preprocessed long-read FASTQ and publishes Flye's "assembly.fasta"
# as "assembly.fna" through a relative symlink created inside the output directory.
rule assembly_lr_flye:
    input: os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
    output: os.path.join(RESULTS_DIR, "assembly/lr/flye/assembly.fna")
    log:
        out="logs/assembly_lr.flye.out.log",
        err="logs/assembly_lr.flye.err.log"
    threads: config["flye"]["threads"]
    # NOTE(review): the other assembly rules build their environment path from ENV_DIR;
    # this one is relative to the rules file - confirm both resolve to the same directory.
    conda: "../envs/flye_v2_7.yaml"
    message: "Assembly: long reads: Flye"
    # Only the parenthesised part (time stamps + Flye) is redirected to the log files;
    # the symlink step runs afterwards and only if Flye succeeded.
    shell:
        "(date && flye --nano-raw {input} --meta --out-dir $(dirname {output}) --genome-size {config[flye][genome_size]} --threads {threads} && date) 2> {log.err} > {log.out} && "
        "cd $(dirname {output}) && ln -sf assembly.fasta $(basename {output})"
# Short reads
# Short-read metagenome assembly with MEGAHIT.
# Consumes the fastp-processed read pair and publishes MEGAHIT's "final.contigs.fa"
# as "assembly.fna" through a relative symlink created inside the output directory.
rule assembly_sr_megahit:
    input:
        r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
        r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
    output:
        os.path.join(RESULTS_DIR, "assembly/sr/megahit/assembly.fna")
    log:
        out="logs/assembly_sr.megahit.out.log",
        err="logs/assembly_sr.megahit.err.log"
    threads:
        config["megahit"]["threads"]
    conda:
        os.path.join(ENV_DIR, "megahit.yaml")
    message:
        "Assembly: short reads: MEGAHIT"
    # MEGAHIT refuses to start when its -o directory already exists, while Snakemake
    # creates the parent directory of every declared output before the job runs
    # (and a previously failed run may have left one behind). The directory is
    # therefore removed first so that MEGAHIT can create it itself; the log files
    # live under logs/ and are not affected.
    shell:
        "(date && rm -rf $(dirname {output}) && megahit -1 {input.r1} -2 {input.r2} -t {threads} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
        "cd $(dirname {output}) && ln -sf final.contigs.fa $(basename {output})"
# Short-read metagenome assembly with MetaSPAdes (k-mer sizes 21,33,55,77).
# Consumes the fastp-processed read pair and publishes SPAdes' "contigs.fasta"
# as "assembly.fna" through a relative symlink created inside the output directory.
rule assembly_sr_metaspades:
    input:
        r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
        r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
    output: os.path.join(RESULTS_DIR, "assembly/sr/metaspades/assembly.fna")
    threads: config["metaspades"]["threads"]
    conda: os.path.join(ENV_DIR, "spades.yaml")
    log:
        out="logs/assembly_sr.metaspades.out.log",
        err="logs/assembly_sr.metaspades.err.log"
    message: "Assembly: short reads: MetaSPAdes"
    # Only the parenthesised part (time stamps + assembler) is redirected to the logs.
    shell:
        "(date && metaspades.py -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
        "cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
# Hybrid
# Hybrid metagenome assembly: SPAdes in --meta mode with the short-read pair plus
# the long reads passed via --nanopore. Publishes "contigs.fasta" as "assembly.fna"
# through a relative symlink created inside the output directory.
rule assemble_hy_metaspades:
    input:
        lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"),
        r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
        r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
    output: os.path.join(RESULTS_DIR, "assembly/hy/metaspadeshybrid/assembly.fna")
    # The thread count is taken from the same "metaspades" config entry that the
    # short-read-only MetaSPAdes rule uses.
    threads: config["metaspades"]["threads"]
    conda: os.path.join(ENV_DIR, "spades.yaml")
    log:
        out="logs/assembly_hy.metaspades.out.log",
        err="logs/assembly_hy.metaspades.err.log"
    message: "Assembly: hybrid: MetaSPAdes"
    # Only the parenthesised part (time stamps + assembler) is redirected to the logs.
    shell:
        "(date && spades.py --meta -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} --nanopore {input.lr} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
        "cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
# Preprocessing of long reads
# Basecalling # Basecalling
# Guppy
checkpoint guppy_gpu_basecalling: checkpoint guppy_gpu_basecalling:
input: input:
DATA_FAST5 DATA_FAST5
output: output:
directory(os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints")) directory(os.path.join(RESULTS_DIR, "preproc/lr/checkpoints"))
log: log:
out="logs/basecalling.{guppy_config}.out.log", out="logs/preproc_lr.guppy.out.log",
err="logs/basecalling.{guppy_config}.err.log" err="logs/preproc_lr.guppy.err.log"
wildcard_constraints:
guppy_config="|".join(config["guppy"]["config"].keys())
threads: threads:
config["guppy"]["gpu"]["threads"] config["guppy"]["gpu"]["threads"]
params: message:
config=lambda wildcards: config["guppy"]["config"][wildcards.guppy_config] "Preprocessing long reads: Basecalling w/ Guppy"
shell: shell:
""" """
(date && \ (date && \
{config[guppy][gpu][bin]} --input_path $(dirname {input[0]}) --save_path $(dirname {output}) \ {config[guppy][gpu][bin]} --input_path $(dirname {input[0]}) --save_path $(dirname {output}) \
--config {params.config} \ --config {config[guppy][config]} \
--disable_pings --compress_fastq \ --disable_pings --compress_fastq \
--cpu_threads_per_caller {threads} \ --cpu_threads_per_caller {threads} \
-x {config[guppy][gpu][gpu_device]} \ -x {config[guppy][gpu][gpu_device]} \
...@@ -33,8 +32,7 @@ checkpoint guppy_gpu_basecalling: ...@@ -33,8 +32,7 @@ checkpoint guppy_gpu_basecalling:
def aggregate_guppy_basecalling(wildcards): def aggregate_guppy_basecalling(wildcards):
checkpoint_output = checkpoints.guppy_gpu_basecalling.get(**wildcards).output[0] checkpoint_output = checkpoints.guppy_gpu_basecalling.get(**wildcards).output[0]
return expand( return expand(
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"), os.path.join(RESULTS_DIR, "preproc/lr/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"),
guppy_config=wildcards.guppy_config,
runid_i_j=glob_wildcards(os.path.join(checkpoint_output, "fastq_runid_{runid_i_j}.fastq.gz")).runid_i_j, runid_i_j=glob_wildcards(os.path.join(checkpoint_output, "fastq_runid_{runid_i_j}.fastq.gz")).runid_i_j,
) )
...@@ -42,22 +40,24 @@ rule merge_guppy_basecalling: ...@@ -42,22 +40,24 @@ rule merge_guppy_basecalling:
input: input:
aggregate_guppy_basecalling aggregate_guppy_basecalling
output: output:
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz") os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
wildcard_constraints: message:
guppy_config="|".join(config["guppy"]["config"].keys()) "Preprocessing long reads: Cat FASTQ"
shell: shell:
"cat $(echo \"{input}\" | sort) > {output}" "cat $(echo \"{input}\" | sort) > {output}"
# QC # QC
rule nanostat: rule nanostat_guppy_basecalling:
input: input:
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz") os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
output: output:
os.path.join(RESULTS_DIR, "qc/lr/{guppy_config}/NanoStats.txt") os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt")
log: log:
out="logs/nanostats.{guppy_config}.out.log", out="logs/preproc_lr.nanostats.out.log",
err="logs/nanostats.{guppy_config}.err.log" err="logs/preproc_lr.nanostats.err.log"
conda: conda:
os.path.join(ENV_DIR, "nanostat.yaml") os.path.join(ENV_DIR, "nanostat.yaml")
message:
"Preprocessing long reads: NanoStats"
shell: shell:
"(date && NanoStat --fastq {input} --outdir $(dirname {output}) -n $(basename {output}) && date) 2> {log.err} > {log.out}" "(date && NanoStat --fastq {input} --outdir $(dirname {output}) -n $(basename {output}) && date) 2> {log.err} > {log.out}"
# Preprocessing of short reads
# Preprocess the short reads using fastp
# Short-read preprocessing with fastp.
# Takes the linked input read pair (DATA_SR) and writes the processed pair together
# with fastp's HTML and JSON reports; the minimum read length comes from the config.
rule fastp_sr:
    input:
        r1=DATA_SR["r1"],
        r2=DATA_SR["r2"]
    output:
        r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
        r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz"),
        html=os.path.join(RESULTS_DIR, "preproc/sr/fastp.html"),
        json=os.path.join(RESULTS_DIR, "preproc/sr/fastp.json")
    threads: config["fastp"]["threads"]
    conda: os.path.join(ENV_DIR, "fastp.yaml")
    log:
        out="logs/preproc_sr.fastp.out.log",
        err="logs/preproc_sr.fastp.err.log"
    message: "Preprocessing short reads: FastP"
    # stdout/stderr of the time stamps and of fastp go to the two log files.
    shell:
        "(date && fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} -w {threads} && date) 2> {log.err} > {log.out}"
# FastQC report for one fastp-processed short-read file.
# The {rid} wildcard selects the mate and is restricted to "R1" or "R2"; extra
# FastQC options are read from config["fastqc"]["params"].
rule fastqc_fastp_sr:
    input: os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz")
    output:
        html=os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_fastqc.html"),
        zip=os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_fastqc.zip")
    wildcard_constraints:
        rid="|".join(["R1", "R2"])
    threads: config["fastqc"]["threads"]
    conda: os.path.join(ENV_DIR, "fastqc.yaml")
    log:
        out="logs/preproc_sr.fastqc.{rid}.out.log",
        err="logs/preproc_sr.fastqc.{rid}.err.log"
    message: "Preprocessing short reads: FastQC"
    # FastQC is only given an output directory (the one holding the HTML report).
    shell:
        "(date && fastqc {config[fastqc][params]} -t {threads} -o $(dirname {output.html}) {input} && date) 2> {log.err} > {log.out}"
# Assembly
include:
'../rules/assembly.smk'
# NOTE: Using "shell: touch ..." to avoid the rule from being autodetected as `localrule`.
# This is needed so that an email can be sent upon event changes for this rule.
# Aggregation target of the assembly step: requests "assembly.fna" for every
# assembler listed under config["assemblers"] (long-read, short-read and hybrid)
# and marks completion by touching the status file.
rule ASSEMBLY:
    input:
        lr=expand(
            os.path.join(RESULTS_DIR, "assembly/lr/{tool}/assembly.fna"),
            tool=config["assemblers"]["lr"]
        ),
        sr=expand(
            os.path.join(RESULTS_DIR, "assembly/sr/{tool}/assembly.fna"),
            tool=config["assemblers"]["sr"]
        ),
        hy=expand(
            os.path.join(RESULTS_DIR, "assembly/hy/{tool}/assembly.fna"),
            tool=config["assemblers"]["hy"]
        )
    output: "status/assembly.done"
    shell: "touch {output}"
\ No newline at end of file
# Preprocessing reads before de novo assembly # Preprocessing reads before de novo assembly
include: include:
'../rules/preprocessing_lr.smk', '../rules/preprocessing_lr.smk'
include:
'../rules/preprocessing_sr.smk' '../rules/preprocessing_sr.smk'
# NOTE: Using "shell: touch ..." to avoid the rule from being autodetected as `localrule`. # NOTE: Using "shell: touch ..." to avoid the rule from being autodetected as `localrule`.
...@@ -9,9 +10,18 @@ include: ...@@ -9,9 +10,18 @@ include:
rule PREPROCESSING_LR: rule PREPROCESSING_LR:
input: input:
basecalling=expand(os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz"), guppy_config=config["guppy"]["config"].keys()), basecalling=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"),
nanostats=expand(os.path.join(RESULTS_DIR, "qc/lr/{guppy_config}/NanoStats.txt"), guppy_config=config["guppy"]["config"].keys()) nanostats=os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt")
output: output:
"status/preprocessing_lr.done" "status/preprocessing_lr.done"
shell: shell:
"touch {output}" "touch {output}"
# Aggregation target of short-read preprocessing: requests the fastp-processed
# R1/R2 files plus the FastQC HTML and ZIP report of each, and marks completion
# by touching the status file.
rule PREPROCESSING_SR:
    input:
        fastp=expand(
            os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz"),
            rid=["R1", "R2"]
        ),
        fastqc=expand(
            os.path.join(RESULTS_DIR, "qc/sr/{rid}.fastp_{ext}"),
            rid=["R1", "R2"],
            ext=["fastqc.html", "fastqc.zip"]
        )
    output: "status/preprocessing_sr.done"
    shell: "touch {output}"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment