From 915b2d7527d34913903bf315c2918471212174bd Mon Sep 17 00:00:00 2001 From: "Susheel Bhanu Busi susheel.busi@uni.lu" <susheel.busi@uni.lu> Date: Wed, 27 May 2020 07:23:54 +0200 Subject: [PATCH] collected miscellaneous files into one folder --- .../basecalling_snakefile | 269 +++++++++++++++++ .../checkpoint_SNAKEFILE | 57 ++++ test_troubleshooting_snakefiles/map_snakefile | 165 +++++++++++ .../no_barcode_snakefile | 219 ++++++++++++++ .../test_snakefile | 280 ++++++++++++++++++ 5 files changed, 990 insertions(+) create mode 100755 test_troubleshooting_snakefiles/basecalling_snakefile create mode 100755 test_troubleshooting_snakefiles/checkpoint_SNAKEFILE create mode 100644 test_troubleshooting_snakefiles/map_snakefile create mode 100755 test_troubleshooting_snakefiles/no_barcode_snakefile create mode 100755 test_troubleshooting_snakefiles/test_snakefile diff --git a/test_troubleshooting_snakefiles/basecalling_snakefile b/test_troubleshooting_snakefiles/basecalling_snakefile new file mode 100755 index 0000000..57e5ecf --- /dev/null +++ b/test_troubleshooting_snakefiles/basecalling_snakefile @@ -0,0 +1,269 @@ +# For running only the BASECALLING part of the workflow for ONT data + +import os +from tempfile import TemporaryDirectory + +configfile: "config/CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +RUNS=[config["runs"]["first"], + config["runs"]["second"]] +# config["runs"]["third"]] +BARCODES=config["barcodes"] +SAMPLES=["ONT3_MG_xx_Rashi_S11"] +sr_sample=["ONT3_MG_xx_Rashi_S11"] +# SAMPLES_ALL=config["samples"] +REFERENCES=["igc", "hg38"] +IGC_URI=config["igc"]["uri"] +HG38_URI=config["hg38"]["uri"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa_mem", "minimap2"] + +# Rule all for running the assembly and annotation +rule all: + input: + "basecall_merge_qc.done", + "basecall_merge_qc_NO_MOD.done", + "dummy_folders_created.done" + +###### +# WRAPPER RULES +###### +# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule. +rule BASECALL_MERGE_QC: + input: + expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES), + expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt"), barcode=BARCODES), + expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt"), run=RUNS, barcode=BARCODES) + output: "basecall_merge_qc.done" + shell: + """ + touch {output} + """ + +rule BASECALL_MERGE_QC_NO_MOD: + input: + expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz"), barcode=BARCODES), + expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt"), barcode=BARCODES), + expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES) + output: "basecall_merge_qc_NO_MOD.done" + shell: + """ + touch {output} + """ + +rule DUMMY_FOLDER: + input: +# expand(os.path.join(RESULTS_DIR, "basecalled/{run}/dummy_folder.done"), run=RUNS), + expand(os.path.join(RESULTS_DIR, "basecalled{type}/{run}/dummy_folder.done"), run=RUNS, type=["", "_NO_MOD"]) + output: "dummy_folders_created.done" + shell: + """ + touch {output} + """ + +###### +# RULES +###### +# Basecalling the multi-FAST5 files. +# Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence. +rule guppy_gpu_basecall: + input: os.path.join(DATA_DIR, "multifast5/{run}") + output: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling.done") + log: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling") + threads: config["guppy_gpu"]["cpu_threads"] + shell: + """ + (date &&\ + {config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \ + touch {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule guppy_gpu_basecall_NO_MOD: + input: os.path.join(DATA_DIR, "multifast5/{run}") + output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling.done") + log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling") + threads: config["guppy_gpu"]["cpu_threads"] + shell: + """ + (date &&\ + {config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][hac_config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \ + touch {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +# Demultiplex. +# Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence. +rule guppy_gpu_demux: + input: rules.guppy_gpu_basecall.output + output: os.path.join(RESULTS_DIR, "basecalled/{run}/demux.done") + log: os.path.join(RESULTS_DIR, "basecalled/{run}/demux") + threads: config["guppy_barcoder"]["threads"] + shell: + """ + (date &&\ + {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \ + touch {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ +rule guppy_gpu_demux_NO_MOD: + input: rules.guppy_gpu_basecall_NO_MOD.output + output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux_NO_MOD.done") + log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux") + threads: config["guppy_barcoder"]["threads"] + shell: + """ + (date &&\ + {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \ + touch {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +# Modifications (IMPORTANT) +# Since 2019_GDB sample was "NOT" barcoded the demux folders are obsolete. +# The below section gathers all the fastq.gz files from basecalling into a "dummy" folder +# The "dummy" folder will be used downstream similar to the "barcode" folder used in the 2018 analyses +# The barcode in the CONFIG file will be given the name of this dummy folder +rule create_dummy_folder_basecalling: + input: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling.done") + output: + out2=os.path.join(RESULTS_DIR, "basecalled/{run}/dummy_folder.done") + log: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode") + shell: + """ + (date &&\ + mkdir $(dirname {input})/no_barcode &&\ + cp -vrf $(dirname {input})/*.gz $(dirname {input})/no_barcode/. &&\ + touch {output.out2} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ +# for file in $(dirname {input})/*.gz; do ln -s "$file" $(dirname {input})/no_barcode/`basename "$file"`; done &&\ + +rule create_dummy_folder_NO_MOD_basecalling: + input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling.done") + output: + out2=os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/dummy_folder.done") + log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode") + shell: + """ + (date &&\ + mkdir $(dirname {input})/no_barcode &&\ + cp -vrf $(dirname {input})/*.gz $(dirname {input})/no_barcode/. &&\ + touch {output.out2} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ +# for file in $(dirname {input})/*.gz; do ln -s "$file" $(dirname {input})/no_barcode/`basename "$file"`; done &&\ +# +# Merge multiple basecalled files, i.e., FASTQ files per barcode and run. Each multi-FAST5 file will produce a respective FASTQ file. +# TODO: Simplify output path to `basecalled/{run}/{barcode}.fastq.gz`. This will require to change `$(dirname {output})` to `$(dirname {output})/{wildcards.barcode}` +rule merge_individual_fastq_per_barcode: +# input: rules.create_dummy_folder_basecalling.output.out1 + input: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode") + output: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz") + shell: + """ + date + cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output} + touch {output} + date + """ + +rule merge_individual_fastq_per_barcode_NO_MOD: +# input: rules.create_dummy_folder_NO_MOD_basecalling.output.out1 + input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode") + output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz") + shell: + """ + date + cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output} + touch {output} + date + """ + +# Helper function to get the input in order to create a *single* fastq file for all runs with the *same* barcode +# TODO: Adjust according to TODO-merge_individual_fastq_per_barcode +def get_all_fastq_for_barcode(wildcards): + return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode) + +def get_all_fastq_for_barcode_NO_MOD(wildcards): + return expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode) + +# Merge basecalled files (FASTQ) per barcode, i.e., combine reads from multiple runs according to their barcode. +rule get_single_fastq_per_barcode: + input: get_all_fastq_for_barcode + output: os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz") + shell: + """ + date + cat {input} > {output} + date + """ +rule get_single_fastq_per_barcode_NO_MOD: + input: get_all_fastq_for_barcode_NO_MOD + output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz") + shell: + """ + date + cat {input} > {output} + date + """ + +# Get basic statistics of the ONT data; over all runs +# TODO: Simplify top of path: replace `lr` by `nanostat` +# TODO: Write more generic: have patterned `input: os.path.join(RESULTS_DIR, "basecalled/{prefix}/{barcode}.fastq.gz")` and patterned `output: os.path.join(RESULTS_DIR, "qc/nanostat/{prefix}/{barcode}/{barcode}NanoStats.txt") +# +rule nanostat: + input: rules.get_single_fastq_per_barcode.output + output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt") + log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats") + conda: "../envs/nanostat.yaml" + shell: + """ + (date &&\ + NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule nanostat_NO_MOD: + input: rules.get_single_fastq_per_barcode_NO_MOD.output + output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt") + log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD") + conda: "../envs/nanostat.yaml" + shell: + """ + (date &&\ + NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +# TODO: Deprecated if TODO-nanostat is completed +# Get basic statistics of the ONT data; per run +rule nanostat_per_run: + input: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz") + output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt") + wildcard_constraints: + run='S.*' + conda: "../envs/nanostat.yaml" + shell: + """ + date + NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) + date + """ + +rule nanostat_per_run_NO_MOD: + input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz") + output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt") + wildcard_constraints: + run='S.*' + conda: "../envs/nanostat.yaml" + shell: + """ + date + NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) + date + """ + + diff --git a/test_troubleshooting_snakefiles/checkpoint_SNAKEFILE b/test_troubleshooting_snakefiles/checkpoint_SNAKEFILE new file mode 100755 index 0000000..498e49c --- /dev/null +++ b/test_troubleshooting_snakefiles/checkpoint_SNAKEFILE @@ -0,0 +1,57 @@ +# File for running ONT analyses + +# default configuration file +configfile:"config/CONFIG.yaml" + +# default executable for snakmake +shell.executable("bash") + +# input settings +RUNS=config['runs']['first'] +STEPS=config['steps'] + +# include rules for the workflows based on "steps" in the CONFIG.yaml file +# ONT analyses workflow +TARGETS = [] + +if 'assembly_annotation' in STEPS: + include: "workflows/checkpoint_assembly_annotation.smk" + TARGETS += ["assemble_and_coverage.done", + "annotate.done", + "basecall_merge_qc.done", + "coverage_of_references.done", + "prodigal_gene_call.done", + "diamond_proteins.done"] + +if 'mmseq' in STEPS: + include: "workflows/mmseq.smk" + TARGETS += ["mmseq_comparison_for_ont.done"] + +if 'metaT' in STEPS: + include: "workflows/metat.smk" + TARGETS += ["metaT_mapping_for_ONT.done"] + +if 'mapping' in STEPS: + include: "workflows/mapping.smk" + TARGETS += ["mapping_for_binning.done"] + +if 'binning' in STEPS: + include: "workflows/binning.smk" + TARGETS += ["binning_for_ont.done"] + +if 'taxonomy' in STEPS: + include: "workflows/taxonomy.smk" + TARGETS += ["taxonomy_for_ont.done"] + +if 'analysis' in STEPS: + include: "workflows/analysis.smk" + TARGETS += ["analysis.done"] + +#else: +# raise Exception('You are not serious. No input data') +# print("No input data provided") + + +rule all: + input: + TARGETS diff --git a/test_troubleshooting_snakefiles/map_snakefile b/test_troubleshooting_snakefiles/map_snakefile new file mode 100644 index 0000000..f05d94e --- /dev/null +++ b/test_troubleshooting_snakefiles/map_snakefile @@ -0,0 +1,165 @@ +# Workflow for running mapping steps of different assemblers, and mappers for the Binning workflow + +#shell.executable("/bin/bash") +#shell.prefix("source ~/.bashrc; ") +import os +from tempfile import TemporaryDirectory + +configfile: "config/CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mmi"] +SAMPLES=["flye", "megahit", "metaspades_hybrid"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + +# specify which rules to run +#include: +# '../rules/MAPPING_RULES' + +# Rule all for mapping +rule MAPPING: + input: + expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"), hybrid_assembler=HYBRID_ASSEMBLER), + expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_{reads}_{hybrid_assembler}/{mapper}_{reads}_{hybrid_assembler}.bam"), mapper=MAPPERS, reads=["sr", "lr"], hybrid_assembler=HYBRID_ASSEMBLER), + expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"), hybrid_assembler=HYBRID_ASSEMBLER), + expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam"), mapper=MAPPERS, hybrid_assembler=HYBRID_ASSEMBLER) + output: + touch('mapping_for_binning.done') + +####################### +## Mapping to hybrid ## +####################### +rule build_hybrid_bwa_index: + input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER) + output: +# done=touch("bwa_index.done") + index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.amb"), + index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.ann"), + index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwt"), + index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.pac"), + index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.sa") + params: + prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}") + log: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwa_index") + conda: "../envs/bwa.yaml" + shell: """ + (date &&\ + bwa index {input} -p {params.prefix} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule hybrid_map_short_reads_to_hybrid_contigs_w_bwa_mem: + input: + query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"), + query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"), + index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"), + index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"), + index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"), + index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"), + index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"), + index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER) + output: + os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}.bam") + params: + index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"), + out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}") + log: os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/sr_contigs.bwa_mem_samtools") + conda: "../envs/bwa.yaml" + threads: + config["bwa"]["threads"] +# wildcard_constraints: +# sample='\w+' + shell: """ + (date &&\ + bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule hybrid_map_long_reads_to_hybrid_contigs_w_bwa_mem: + input: + query_lr=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES), + index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"), + index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"), + index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"), + index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"), + index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"), + index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER) + output: + os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}.bam") + params: + index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"), + out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}"), + log: os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/lr_contigs.bwa_mem_samtools") + conda: "../envs/bwa.yaml" + threads: config["bwa"]["threads"] +# wildcard_constraints: +# sample='\w+' + shell: """ + (date &&\ + bwa mem -x ont2d -t {threads} {params.index_prefix} {input.query_lr} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule hybrid_build_minimap2_long_read_contigs_ont_index: + input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER) + output: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi") + conda: "../envs/minimap2.yaml" + shell: + """ + date + minimap2 -x map-ont -d {output} {input} + date + """ + +rule hybrid_map_long_reads_to_hybrid_contigs_w_minimap2: + input: + query=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode="no_barcode"), + index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"), + index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER) + output: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}.bam") + params: + prefix=os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}") + conda: "../envs/minimap2.yaml" + threads: config["minimap2"]["threads"] + log: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/metaspades.minimap2_samtools") + shell: """ + (date &&\ + minimap2 -a -t {threads} {input.index} {input.query} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule hybrid_map_short_reads_to_hybrid_contigs_w_minimap2: + input: + query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"), + query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"), + index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"), + index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER) + output: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}.bam") + params: + prefix=os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}") + conda: "../envs/minimap2.yaml" + threads: config["minimap2"]["threads"] + log: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/metaspades.minimap2_samtools") + shell: + """ + (date &&\ + minimap2 -a -t {threads} {input.index} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule hybrid_merge_bam_files_for_metaspades: + input: + bam_sr=os.path.join(RESULTS_DIR, "mapping/{mapper}_sr_{hybrid_assembler}/{mapper}_sr_{hybrid_assembler}.bam"), + bam_lr=os.path.join(RESULTS_DIR, "mapping/{mapper}_lr_{hybrid_assembler}/{mapper}_lr_{hybrid_assembler}.bam") + output: os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam") + conda: "../envs/binning.yaml" + shell: + """ + (date &&\ + samtools merge {output} {input.bam_sr} {input.bam_lr} &&\ + date) + """ diff --git a/test_troubleshooting_snakefiles/no_barcode_snakefile b/test_troubleshooting_snakefiles/no_barcode_snakefile new file mode 100755 index 0000000..d7c8681 --- /dev/null +++ b/test_troubleshooting_snakefiles/no_barcode_snakefile @@ -0,0 +1,219 @@ +# For running only the BASECALLING part of the workflow for ONT data + +import os +from tempfile import TemporaryDirectory + +configfile: "config/CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +RUNS=[config["runs"]["first"], + config["runs"]["second"]] +# config["runs"]["third"]] +BARCODES=config["barcodes"] +SAMPLES=["ONT3_MG_xx_Rashi_S11"] +sr_sample=["ONT3_MG_xx_Rashi_S11"] +# SAMPLES_ALL=config["samples"] +REFERENCES=["igc", "hg38"] +IGC_URI=config["igc"]["uri"] +HG38_URI=config["hg38"]["uri"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa_mem", "minimap2"] + +# Rule all for running the assembly and annotation +rule all: + input: + "basecall_merge_qc.done", + "basecall_merge_qc_NO_MOD.done" + +###### +# WRAPPER RULES +###### +# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule. +rule BASECALL_MERGE_QC: + input: + expand(os.path.join(RESULTS_DIR, "basecalled/merged/merged.fastq.gz"), barcode=BARCODES), + expand(os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats.txt"), barcode=BARCODES), + expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{run}NanoStats.txt"), run=RUNS, barcode=BARCODES) + output: "basecall_merge_qc.done" + shell: + """ + touch {output} + """ + +rule BASECALL_MERGE_QC_NO_MOD: + input: + expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/merged.fastq.gz"), barcode=BARCODES), + expand(os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats_NO_MOD.txt"), barcode=BARCODES), + expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{run}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES) + output: "basecall_merge_qc_NO_MOD.done" + shell: + """ + touch {output} + """ + + +###### +# RULES +###### +# Basecalling the multi-FAST5 files. +# Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence. +rule guppy_gpu_basecall: + input: os.path.join(DATA_DIR, "multifast5/{run}") + output: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling.done") + log: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling") + threads: config["guppy_gpu"]["cpu_threads"] + shell: + """ + (date &&\ + {config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \ + touch {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule guppy_gpu_basecall_NO_MOD: + input: os.path.join(DATA_DIR, "multifast5/{run}") + output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling.done") + log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling") + threads: config["guppy_gpu"]["cpu_threads"] + shell: + """ + (date &&\ + {config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][hac_config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \ + touch {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +# Demultiplex. +# Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence. +# Since the 2019_GDB run did not have any barcodes, the demultiplexing step is not requried +#rule guppy_gpu_demux: +# input: rules.guppy_gpu_basecall.output +# output: os.path.join(RESULTS_DIR, "basecalled/{run}/demux.done") +# log: os.path.join(RESULTS_DIR, "basecalled/{run}/demux") +# threads: config["guppy_barcoder"]["threads"] +# shell: +# """ +# (date &&\ +# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \ +# touch {output} &&\ +# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) +# """ +#rule guppy_gpu_demux_NO_MOD: +# input: rules.guppy_gpu_basecall_NO_MOD.output +# output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux_NO_MOD.done") +# log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux") +# threads: config["guppy_barcoder"]["threads"] +# shell: +# """ +# (date &&\ +# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \ +# touch {output} &&\ +# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) +# """ + +# Merge multiple basecalled files, i.e., FASTQ files per barcode and run. Each multi-FAST5 file will produce a respective FASTQ file. +# TODO: Simplify output path to `basecalled/{run}/{barcode}.fastq.gz`. This will require to change `$(dirname {output})` to `$(dirname {output})/{wildcards.barcode}` +rule merge_individual_fastq_per_barcode: + input: os.path.join(RESULTS_DIR, "basecalled/{run}") + output: os.path.join(RESULTS_DIR, "basecalled/{run}/{run}.fastq.gz") + shell: + """ + date + cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output} + touch {output} + date + """ + +rule merge_individual_fastq_per_barcode_NO_MOD: + input: os.path.join(RESULTS_DIR, "basecalled/{run}") + output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{run}.fastq.gz") + shell: + """ + date + cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output} + touch {output} + date + """ + +## Helper function to get the input in order to create a *single* fastq file for all runs with the *same* barcode +## TODO: Adjust according to TODO-merge_individual_fastq_per_barcode +#def get_all_fastq_for_barcode(wildcards): +# return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode) +# +#def get_all_fastq_for_barcode_NO_MOD(wildcards): +# return expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode) + +# Merge basecalled files (FASTQ) per barcode, i.e., combine reads from multiple runs according to their barcode. +rule get_single_fastq_per_barcode: + input: os.path.join(RESULTS_DIR, "basecalled/{run}/{run}.fastq.gz") + output: os.path.join(RESULTS_DIR, "basecalled/{run}/../merged/merged.fastq.gz") + shell: + """ + date + cat {input} > {output} + date + """ +rule get_single_fastq_per_barcode_NO_MOD: + input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{run}.fastq.gz") + output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/../merged/merged.fastq.gz") + shell: + """ + date + cat {input} > {output} + date + """ + +# Get basic statistics of the ONT data; over all runs +# TODO: Simplify top of path: replace `lr` by `nanostat` +# TODO: Write more generic: have patterned `input: os.path.join(RESULTS_DIR, "basecalled/{prefix}/{barcode}.fastq.gz")` and patterned `output: os.path.join(RESULTS_DIR, "qc/nanostat/{prefix}/{barcode}/{barcode}NanoStats.txt") +# +rule nanostat: + input: os.path.join(RESULTS_DIR, "basecalled/merged/merged.fastq.gz") + output: os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats.txt") + log: os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats") + conda: "../envs/nanostat.yaml" + shell: + """ + (date &&\ + NanoStat --fastq {input} --outdir $(dirname {output}) --prefix merged -n $(basename -s "" {output}) &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule nanostat_NO_MOD: + input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/merged.fastq.gz") + output: os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats_NO_MOD.txt") + log: os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats_NO_MOD") + conda: "../envs/nanostat.yaml" + shell: + """ + (date &&\ + NanoStat --fastq {input} --outdir $(dirname {output}) --prefix merged -n $(basename -s "" {output}) &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +# TODO: Deprecated if TODO-nanostat is completed +# Get basic statistics of the ONT data; per run +rule nanostat_per_run: + input: os.path.join(RESULTS_DIR, "basecalled/{run}/{run}.fastq.gz") + output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{run}NanoStats.txt") + conda: "../envs/nanostat.yaml" + shell: + """ + date + NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.run} -n $(basename -s "" {output}) + date + """ + +rule nanostat_per_run_NO_MOD: + input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{run}.fastq.gz") + output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{run}NanoStats_NO_MOD.txt") + conda: "../envs/nanostat.yaml" + shell: + """ + date + NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.run} -n $(basename -s "" {output}) + date + """ + + diff --git a/test_troubleshooting_snakefiles/test_snakefile b/test_troubleshooting_snakefiles/test_snakefile new file mode 100755 index 0000000..5c0e2fd --- /dev/null +++ b/test_troubleshooting_snakefiles/test_snakefile @@ -0,0 +1,280 @@ +# For running only the BASECALLING part of the workflow for ONT data + +import os +from tempfile import TemporaryDirectory + +configfile: "config/CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +RUNS=[config["runs"]["first"], + config["runs"]["second"]] +# config["runs"]["third"]] +BARCODES=config["barcodes"] +SAMPLES=["ONT3_MG_xx_Rashi_S11"] +sr_sample=["ONT3_MG_xx_Rashi_S11"] +# SAMPLES_ALL=config["samples"] +REFERENCES=["igc", "hg38"] +IGC_URI=config["igc"]["uri"] +HG38_URI=config["hg38"]["uri"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa_mem", "minimap2"] + +# Rule all for running the assembly and annotation +rule all: + input: + "basecall_merge_qc.done", + "basecall_merge_qc_NO_MOD.done", + "dummy_folders_created.done" + +###### +# WRAPPER RULES +###### +# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule. +rule BASECALL_MERGE_QC: + input: +# expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES) + expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}.fastq.gz"), run=RUNS, barcode=BARCODES) +# expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt"), barcode=BARCODES), +# expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt"), run=RUNS, barcode=BARCODES) + output: "basecall_merge_qc.done" + shell: + """ + touch {output} + """ + +rule BASECALL_MERGE_QC_NO_MOD: + input: +# expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz"), barcode=BARCODES) + expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}.fastq.gz"), run=RUNS, barcode=BARCODES) + # expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt"), barcode=BARCODES), + # expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES) + output: "basecall_merge_qc_NO_MOD.done" + shell: + """ + touch {output} + """ + +rule DUMMY_FOLDER: + input: +# expand(os.path.join(RESULTS_DIR, "basecalled/{run}/dummy_folder.done"), run=RUNS), + expand(os.path.join(RESULTS_DIR, "basecalled{type}/{run}/dummy_folder.done"), run=RUNS, type=["", "_NO_MOD"]) + output: "dummy_folders_created.done" + shell: + """ + touch {output} + """ + +###### +# RULES +###### +# Basecalling the multi-FAST5 files. +# Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence. +checkpoint guppy_gpu_basecall: + input: os.path.join(DATA_DIR, "multifast5/{run}") + output: directory(os.path.join(RESULTS_DIR, "basecalled/{run}")) + log: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling") + threads: config["guppy_gpu"]["cpu_threads"] + shell: + """ + (date &&\ + {config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \ + touch {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +checkpoint guppy_gpu_basecall_NO_MOD: + input: os.path.join(DATA_DIR, "multifast5/{run}") + output: directory(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}")) + log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling") + wildcard_constraints: +# run='S.*' +# run='S.*^\/' +# run='\w+' +# run="^no.*" + threads: config["guppy_gpu"]["cpu_threads"] + shell: + """ + (date &&\ + {config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][hac_config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \ + touch {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +## Demultiplex. +## Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence. +#rule guppy_gpu_demux: +# input: rules.guppy_gpu_basecall.output +# output: os.path.join(RESULTS_DIR, "basecalled/{run}/demux.done") +# log: os.path.join(RESULTS_DIR, "basecalled/{run}/demux") +# threads: config["guppy_barcoder"]["threads"] +# shell: +# """ +# (date &&\ +# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \ +# touch {output} &&\ +# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) +# """ +#rule guppy_gpu_demux_NO_MOD: +# input: rules.guppy_gpu_basecall_NO_MOD.output +# output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux_NO_MOD.done") +# log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux") +# threads: config["guppy_barcoder"]["threads"] +# shell: +# """ +# (date &&\ +# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \ +# touch {output} &&\ +# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) +# """ + +# Modifications (IMPORTANT) +# Since 2019_GDB sample was "NOT" barcoded the demux folders are obsolete. +# The below section gathers all the fastq.gz files from basecalling into a "dummy" folder +# The "dummy" folder will be used downstream similar to the "barcode" folder used in the 2018 analyses +# The barcode in the CONFIG file will be given the name of this dummy folder +rule intermediate_basecalling: + input: os.path.join(RESULTS_DIR, "basecalled/{run}/{i}.fastq.gz") + output: os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{i}.fastq.gz") + log: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode_{i}") + shell: + """ + (date &&\ + ln -s {input} {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule intermediate_NO_MOD_basecalling: + input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{i}.fastq.gz") + output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_nobarcode/{i}.fastq.gz") + log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode_{i}") + shell: + """ + (date &&\ + ln -s {input} {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +def aggregate_dummy_basecalling(wildcards): + checkpoint_output = checkpoints.guppy_gpu_basecall.get(**wildcards).output[0] + return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{id}.fastq.gz"), + run=wildcards.run, + i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fastq.gz")).i) + + +def aggregate_dummy_basecalling_NO_MOD(wildcards): + dummy_output=checkpoints.guppy_gpu_basecall_NO_MOD(**wildcards).output[0] + return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{i}.fastq.gz"), + run=wildcards.run, + i=glob_wildcards(os.path.join(dummy_output, "{i}.fastq.gz")).i) + + +# Merge multiple basecalled files, i.e., FASTQ files per barcode and run. Each multi-FAST5 file will produce a respective FASTQ file. +# TODO: Simplify output path to `basecalled/{run}/{barcode}.fastq.gz`. This will require to change `$(dirname {output})` to `$(dirname {output})/{wildcards.barcode}` +rule merge_individual_fastq_per_barcode: + input: aggregate_dummy_basecalling + output: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz") + shell: + """ + date + cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output} + touch {output} + date + """ + +rule merge_individual_fastq_per_barcode_NO_MOD: + input: aggregate_dummy_basecalling_NO_MOD + output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz") + shell: + """ + date + cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output} + touch {output} + date + """ + +## Helper function to get the input in order to create a *single* fastq file for all runs with the *same* barcode +## TODO: Adjust according to TODO-merge_individual_fastq_per_barcode +#def get_all_fastq_for_barcode(wildcards): +# return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode) +# +#def get_all_fastq_for_barcode_NO_MOD(wildcards): +# return expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode) +# +## Merge basecalled files (FASTQ) per barcode, i.e., combine reads from multiple runs according to their barcode. +#rule get_single_fastq_per_barcode: +# input: get_all_fastq_for_barcode +# output: os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz") +# shell: +# """ +# date +# cat {input} > {output} +# date +# """ +#rule get_single_fastq_per_barcode_NO_MOD: +# input: get_all_fastq_for_barcode_NO_MOD +# output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz") +# shell: +# """ +# date +# cat {input} > {output} +# date +# """ +# +## Get basic statistics of the ONT data; over all runs +## TODO: Simplify top of path: replace `lr` by `nanostat` +## TODO: Write more generic: have patterned `input: os.path.join(RESULTS_DIR, "basecalled/{prefix}/{barcode}.fastq.gz")` and patterned `output: os.path.join(RESULTS_DIR, "qc/nanostat/{prefix}/{barcode}/{barcode}NanoStats.txt") +## +#rule nanostat: +# input: rules.get_single_fastq_per_barcode.output +# output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt") +# log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats") +# conda: "../envs/nanostat.yaml" +# shell: +# """ +# (date &&\ +# NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\ +# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) +# """ +# +#rule nanostat_NO_MOD: +# input: rules.get_single_fastq_per_barcode_NO_MOD.output +# output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt") +# log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD") +# conda: "../envs/nanostat.yaml" +# shell: +# """ +# (date &&\ +# NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\ +# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) +# """ +# +## TODO: Deprecated if TODO-nanostat is completed +## Get basic statistics of the ONT data; per run +#rule nanostat_per_run: +# input: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz") +# output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt") +# wildcard_constraints: +# run='S.*' +# conda: "../envs/nanostat.yaml" +# shell: +# """ +# date +# NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) +# date +# """ +# +#rule nanostat_per_run_NO_MOD: +# input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz") +# output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt") +# wildcard_constraints: +# run='S.*' +# conda: "../envs/nanostat.yaml" +# shell: +# """ +# date +# NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) +# date +# """ +# +# -- GitLab