Susheel Busi · b40e3bfb · 8b1ece35 · aeb752cd · e98663ed · 43c3dfab
--- a/2019_GDB/test_troubleshooting_snakefiles/WORKING_checkpoint_snakefile 0 → 100755

+ 302

− 0
+++ b/2019_GDB/test_troubleshooting_snakefiles/WORKING_checkpoint_snakefile 0 → 100755

+ 302

− 0
+# For running only the BASECALLING part of the workflow for ONT data
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "config/CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+RUNS=[config["runs"]["first"],
+    config["runs"]["second"]]
+#    config["runs"]["third"]]
+BARCODES=config["barcodes"]
+SAMPLES=["ONT3_MG_xx_Rashi_S11"]
+sr_sample=["ONT3_MG_xx_Rashi_S11"]
+# SAMPLES_ALL=config["samples"]
+REFERENCES=["igc", "hg38"]
+IGC_URI=config["igc"]["uri"]
+HG38_URI=config["hg38"]["uri"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa_mem", "minimap2"]
+
+# Rule all for running the assembly and annotation
+rule all:
+    input:
+        "basecall_merge_qc.done",
+        "basecall_merge_qc_NO_MOD.done",
+        "dummy_folders_created.done"
+
+######
+# WRAPPER RULES
+######
+# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule.
+rule BASECALL_MERGE_QC:
+    input:
+        expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
+        expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=BARCODES), 
+        expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt"), barcode=BARCODES),
+        expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt"), run=RUNS, barcode=BARCODES)
+    output: "basecall_merge_qc.done"
+    shell:
+        """
+        touch {output}
+        """
+
+rule BASECALL_MERGE_QC_NO_MOD:
+    input:
+        expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz"), barcode=BARCODES),
+        expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=BARCODES),
+        expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt"), barcode=BARCODES),
+        expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES)
+    output: "basecall_merge_qc_NO_MOD.done"
+    shell:
+        """
+        touch {output}
+        """
+
+rule DUMMY_FOLDER:
+    input:
+#         expand(os.path.join(RESULTS_DIR, "basecalled/{run}/dummy_folder.done"), run=RUNS),
+         expand(os.path.join(RESULTS_DIR, "basecalled{type}/{run}/dummy_folder.done"), run=RUNS, type=["", "_NO_MOD"]),
+         expand(os.path.join(RESULTS_DIR, "basecalled{type}/{run}/linking.done"), run=RUNS, type=["", "_NO_MOD"])
+    output: "dummy_foliders_created.done"
+    shell:
+        """
+        touch {output}
+        """      
+
+######
+# RULES
+######
+# Basecalling the multi-FAST5 files.
+# Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence.
+checkpoint guppy_gpu_basecall:
+    input: os.path.join(DATA_DIR, "multifast5/{run}")
+    output: directory(os.path.join(RESULTS_DIR, "basecalled/{run}/checkpoint_bc"))
+    log: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling")
+    threads: config["guppy_gpu"]["cpu_threads"]
+    shell:
+        """
+        (date &&\
+        {config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
+        touch {output} &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
+
+checkpoint guppy_gpu_basecall_NO_MOD:
+    input: os.path.join(DATA_DIR, "multifast5/{run}")
+    output: directory(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/checkpoint_bc_NM"))
+    log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling")
+    wildcard_constraints:
+#        run='S.*'
+#        run='S.*^\/'
+#        run='\w+'
+#        run="^no.*"
+    threads: config["guppy_gpu"]["cpu_threads"]
+    shell:
+        """
+        (date &&\
+        {config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][hac_config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
+        touch {output} &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
+
+## Demultiplex.
+## Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence.
+#rule guppy_gpu_demux:
+#    input: rules.guppy_gpu_basecall.output
+#    output: os.path.join(RESULTS_DIR, "basecalled/{run}/demux.done")
+#    log: os.path.join(RESULTS_DIR, "basecalled/{run}/demux")
+#    threads: config["guppy_barcoder"]["threads"]
+#    shell:
+#        """
+#        (date &&\
+#        {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
+#        touch {output} &&\
+#        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+#        """
+#rule guppy_gpu_demux_NO_MOD:
+#    input: rules.guppy_gpu_basecall_NO_MOD.output
+#    output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux_NO_MOD.done")
+#    log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux")
+#    threads: config["guppy_barcoder"]["threads"]
+#    shell:
+#        """
+#        (date &&\
+#        {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
+#        touch {output} &&\
+#        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+#        """
+
+# Modifications (IMPORTANT)
+# Since 2019_GDB sample was "NOT" barcoded the demux folders are obsolete. 
+# The below section gathers all the fastq.gz files from basecalling into a "dummy" folder
+# The "dummy" folder will be used downstream similar to the "barcode" folder used in the 2018 analyses
+# The barcode in the CONFIG file will be given the name of this dummy folder
+#rule intermediate_basecalling:
+#    input: os.path.join(RESULTS_DIR, "basecalled/{run}/{i}.fastq.gz")
+#    output: os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{i}.fastq.gz")
+#    log: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode_{i}")
+#    shell:
+#        """
+#        (date &&\ 
+#        ln -s {input} {output}  &&\
+#        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+#        """
+#
+#rule intermediate_NO_MOD_basecalling:
+#    input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{i}.fastq.gz")
+#    output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_nobarcode/{i}.fastq.gz")
+#    log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode_{i}")
+#    shell:
+#        """
+#        (date &&\
+#        ln -s {input} {output}  &&\
+#        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+#        """
+rule intermediate_basecalling:
+    input: os.path.join(RESULTS_DIR, "basecalled/{run}/checkpoint_bc/{i}.fastq.gz")
+    output: os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{i}.fastq.gz")
+    log: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode_{i}")
+    shell:
+        """
+        (date &&\
+        ln -s {input} {output}  &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
+
+rule intermediate_NO_MOD_basecalling:
+    input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/checkpoint_bc_NM/{i}.fastq.gz")
+    output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_nobarcode/{i}.fastq.gz")
+    log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode_{i}")
+    shell:
+        """
+        (date &&\
+        ln -s {input} {output}  &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
+
+def aggregate_dummy_basecalling(wildcards):
+    checkpoint_output = checkpoints.guppy_gpu_basecall.get(**wildcards).output[0]
+    return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{id}.fastq.gz"),
+        run=wildcards.run,
+        i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fastq.gz")).i)
+
+
+def aggregate_dummy_basecalling_NO_MOD(wildcards):
+    dummy_output=checkpoints.guppy_gpu_basecall_NO_MOD.get(**wildcards).output[0]
+    return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{i}.fastq.gz"),
+        run=wildcards.run, 
+        i=glob_wildcards(os.path.join(dummy_output, "{i}.fastq.gz")).i)
+
+
+# Merge multiple basecalled files, i.e., FASTQ files per barcode and run. Each multi-FAST5 file will produce a respective FASTQ file.
+# TODO: Simplify output path to `basecalled/{run}/{barcode}.fastq.gz`. This will require to change `$(dirname {output})` to `$(dirname {output})/{wildcards.barcode}`
+rule merge_individual_fastq_per_barcode:
+    input: aggregate_dummy_basecalling
+    output: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz")
+    shell:
+        """
+        date
+        cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
+        touch {output}
+        date
+        """
+
+rule merge_individual_fastq_per_barcode_NO_MOD:
+    input: aggregate_dummy_basecalling_NO_MOD
+    output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz")
+    shell:
+        """
+        date
+        cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
+        touch {output}
+        date
+        """
+
+# Helper function to get the input in order to create a *single* fastq file for all runs with the *same* barcode
+# TODO: Adjust according to TODO-merge_individual_fastq_per_barcode
+def get_all_fastq_for_barcode(wildcards):
+    return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
+
+def get_all_fastq_for_barcode_NO_MOD(wildcards):
+    return expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
+
+# Merge basecalled files (FASTQ) per barcode, i.e., combine reads from multiple runs according to their barcode.
+rule get_single_fastq_per_barcode:
+    input: get_all_fastq_for_barcode
+    output: os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz")
+    shell:
+        """
+        date
+        cat {input} > {output}
+        date
+        """
+rule get_single_fastq_per_barcode_NO_MOD:
+    input: get_all_fastq_for_barcode_NO_MOD
+    output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz")
+    shell:
+        """
+        date
+        cat {input} > {output}
+        date
+        """
+
+# Get basic statistics of the ONT data; over all runs
+# TODO: Simplify top of path: replace `lr` by `nanostat`
+# TODO: Write more generic: have patterned `input: os.path.join(RESULTS_DIR, "basecalled/{prefix}/{barcode}.fastq.gz")` and patterned `output: os.path.join(RESULTS_DIR, "qc/nanostat/{prefix}/{barcode}/{barcode}NanoStats.txt")
+#
+rule nanostat:
+    input: rules.get_single_fastq_per_barcode.output
+    output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt")
+    log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats")
+    conda: "../envs/nanostat.yaml"
+    shell:
+        """
+        (date &&\
+        NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
+
+rule nanostat_NO_MOD:
+    input: rules.get_single_fastq_per_barcode_NO_MOD.output
+    output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt")
+    log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD")
+    conda: "../envs/nanostat.yaml"
+    shell:
+        """
+        (date &&\
+        NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
+
+# TODO: Deprecated if TODO-nanostat is completed
+# Get basic statistics of the ONT data; per run
+rule nanostat_per_run:
+    input: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz")
+    output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt")
+    wildcard_constraints:
+        run='S.*'
+    conda: "../envs/nanostat.yaml"
+    shell:
+        """
+        date
+        NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output})
+        date
+        """
+
+rule nanostat_per_run_NO_MOD:
+    input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz")
+    output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt")
+    wildcard_constraints:
+        run='S.*'
+    conda: "../envs/nanostat.yaml"
+    shell:
+        """
+        date
+        NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output})
+        date
+        """
+
+