Skip to content
Snippets Groups Projects

WIP: Checkpoint snakefile

Merged Susheel Busi requested to merge checkpoint_snakefile into master
Compare and Show latest version
15 files
+ 2438
0
Compare changes
  • Side-by-side
  • Inline
Files
15
# For running only the BASECALLING part of the workflow for ONT data
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
RUNS=[config["runs"]["first"],
config["runs"]["second"]]
# config["runs"]["third"]]
BARCODES=config["barcodes"]
SAMPLES=["ONT3_MG_xx_Rashi_S11"]
sr_sample=["ONT3_MG_xx_Rashi_S11"]
# SAMPLES_ALL=config["samples"]
REFERENCES=["igc", "hg38"]
IGC_URI=config["igc"]["uri"]
HG38_URI=config["hg38"]["uri"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa_mem", "minimap2"]
# Rule all for running the assembly and annotation
rule all:
input:
"basecall_merge_qc.done",
"basecall_merge_qc_NO_MOD.done",
"dummy_folders_created.done"
######
# WRAPPER RULES
######
# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule.
rule BASECALL_MERGE_QC:
input:
expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt"), run=RUNS, barcode=BARCODES)
output: "basecall_merge_qc.done"
shell:
"""
touch {output}
"""
rule BASECALL_MERGE_QC_NO_MOD:
input:
expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES)
output: "basecall_merge_qc_NO_MOD.done"
shell:
"""
touch {output}
"""
rule DUMMY_FOLDER:
input:
# expand(os.path.join(RESULTS_DIR, "basecalled/{run}/dummy_folder.done"), run=RUNS),
expand(os.path.join(RESULTS_DIR, "basecalled{type}/{run}/dummy_folder.done"), run=RUNS, type=["", "_NO_MOD"]),
expand(os.path.join(RESULTS_DIR, "basecalled{type}/{run}/linking.done"), run=RUNS, type=["", "_NO_MOD"])
output: "dummy_foliders_created.done"
shell:
"""
touch {output}
"""
######
# RULES
######
# Basecalling the multi-FAST5 files.
# Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence.
checkpoint guppy_gpu_basecall:
input: os.path.join(DATA_DIR, "multifast5/{run}")
output: directory(os.path.join(RESULTS_DIR, "basecalled/{run}/checkpoint_bc"))
log: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling")
threads: config["guppy_gpu"]["cpu_threads"]
shell:
"""
(date &&\
{config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
checkpoint guppy_gpu_basecall_NO_MOD:
input: os.path.join(DATA_DIR, "multifast5/{run}")
output: directory(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/checkpoint_bc_NM"))
log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling")
wildcard_constraints:
# run='S.*'
# run='S.*^\/'
# run='\w+'
# run="^no.*"
threads: config["guppy_gpu"]["cpu_threads"]
shell:
"""
(date &&\
{config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][hac_config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
## Demultiplex.
## Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence.
#rule guppy_gpu_demux:
# input: rules.guppy_gpu_basecall.output
# output: os.path.join(RESULTS_DIR, "basecalled/{run}/demux.done")
# log: os.path.join(RESULTS_DIR, "basecalled/{run}/demux")
# threads: config["guppy_barcoder"]["threads"]
# shell:
# """
# (date &&\
# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
# touch {output} &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
#rule guppy_gpu_demux_NO_MOD:
# input: rules.guppy_gpu_basecall_NO_MOD.output
# output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux_NO_MOD.done")
# log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux")
# threads: config["guppy_barcoder"]["threads"]
# shell:
# """
# (date &&\
# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
# touch {output} &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
# Modifications (IMPORTANT)
# Since 2019_GDB sample was "NOT" barcoded the demux folders are obsolete.
# The below section gathers all the fastq.gz files from basecalling into a "dummy" folder
# The "dummy" folder will be used downstream similar to the "barcode" folder used in the 2018 analyses
# The barcode in the CONFIG file will be given the name of this dummy folder
#rule intermediate_basecalling:
# input: os.path.join(RESULTS_DIR, "basecalled/{run}/{i}.fastq.gz")
# output: os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{i}.fastq.gz")
# log: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode_{i}")
# shell:
# """
# (date &&\
# ln -s {input} {output} &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
#
#rule intermediate_NO_MOD_basecalling:
# input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{i}.fastq.gz")
# output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_nobarcode/{i}.fastq.gz")
# log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode_{i}")
# shell:
# """
# (date &&\
# ln -s {input} {output} &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
rule intermediate_basecalling:
input: os.path.join(RESULTS_DIR, "basecalled/{run}/checkpoint_bc/{i}.fastq.gz")
output: os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{i}.fastq.gz")
log: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode_{i}")
shell:
"""
(date &&\
ln -s {input} {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule intermediate_NO_MOD_basecalling:
input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/checkpoint_bc_NM/{i}.fastq.gz")
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_nobarcode/{i}.fastq.gz")
log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode_{i}")
shell:
"""
(date &&\
ln -s {input} {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
def aggregate_dummy_basecalling(wildcards):
checkpoint_output = checkpoints.guppy_gpu_basecall.get(**wildcards).output[0]
return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{id}.fastq.gz"),
run=wildcards.run,
i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fastq.gz")).i)
def aggregate_dummy_basecalling_NO_MOD(wildcards):
dummy_output=checkpoints.guppy_gpu_basecall_NO_MOD.get(**wildcards).output[0]
return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{i}.fastq.gz"),
run=wildcards.run,
i=glob_wildcards(os.path.join(dummy_output, "{i}.fastq.gz")).i)
# Merge multiple basecalled files, i.e., FASTQ files per barcode and run. Each multi-FAST5 file will produce a respective FASTQ file.
# TODO: Simplify output path to `basecalled/{run}/{barcode}.fastq.gz`. This will require to change `$(dirname {output})` to `$(dirname {output})/{wildcards.barcode}`
rule merge_individual_fastq_per_barcode:
input: aggregate_dummy_basecalling
output: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz")
shell:
"""
date
cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
touch {output}
date
"""
rule merge_individual_fastq_per_barcode_NO_MOD:
input: aggregate_dummy_basecalling_NO_MOD
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz")
shell:
"""
date
cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
touch {output}
date
"""
# Helper function to get the input in order to create a *single* fastq file for all runs with the *same* barcode
# TODO: Adjust according to TODO-merge_individual_fastq_per_barcode
def get_all_fastq_for_barcode(wildcards):
return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
def get_all_fastq_for_barcode_NO_MOD(wildcards):
return expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
# Merge basecalled files (FASTQ) per barcode, i.e., combine reads from multiple runs according to their barcode.
rule get_single_fastq_per_barcode:
input: get_all_fastq_for_barcode
output: os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz")
shell:
"""
date
cat {input} > {output}
date
"""
rule get_single_fastq_per_barcode_NO_MOD:
input: get_all_fastq_for_barcode_NO_MOD
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz")
shell:
"""
date
cat {input} > {output}
date
"""
# Get basic statistics of the ONT data; over all runs
# TODO: Simplify top of path: replace `lr` by `nanostat`
# TODO: Write more generic: have patterned `input: os.path.join(RESULTS_DIR, "basecalled/{prefix}/{barcode}.fastq.gz")` and patterned `output: os.path.join(RESULTS_DIR, "qc/nanostat/{prefix}/{barcode}/{barcode}NanoStats.txt")
#
rule nanostat:
input: rules.get_single_fastq_per_barcode.output
output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt")
log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats")
conda: "../envs/nanostat.yaml"
shell:
"""
(date &&\
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule nanostat_NO_MOD:
input: rules.get_single_fastq_per_barcode_NO_MOD.output
output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt")
log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD")
conda: "../envs/nanostat.yaml"
shell:
"""
(date &&\
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# TODO: Deprecated if TODO-nanostat is completed
# Get basic statistics of the ONT data; per run
rule nanostat_per_run:
input: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz")
output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt")
wildcard_constraints:
run='S.*'
conda: "../envs/nanostat.yaml"
shell:
"""
date
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output})
date
"""
rule nanostat_per_run_NO_MOD:
input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz")
output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt")
wildcard_constraints:
run='S.*'
conda: "../envs/nanostat.yaml"
shell:
"""
date
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output})
date
"""
Loading