Commit 915b2d75 authored by Susheel Busi (parent 9beaa36c)
collected miscellaneous files into one folder
Merge request !67 (WIP: Checkpoint snakefile)
# For running only the BASECALLING part of the workflow for ONT data
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
RUNS=[config["runs"]["first"],
config["runs"]["second"]]
# config["runs"]["third"]]
BARCODES=config["barcodes"]
SAMPLES=["ONT3_MG_xx_Rashi_S11"]
sr_sample=["ONT3_MG_xx_Rashi_S11"]
# SAMPLES_ALL=config["samples"]
REFERENCES=["igc", "hg38"]
IGC_URI=config["igc"]["uri"]
HG38_URI=config["hg38"]["uri"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa_mem", "minimap2"]
# Rule all for running the assembly and annotation
rule all:
input:
"basecall_merge_qc.done",
"basecall_merge_qc_NO_MOD.done",
"dummy_folders_created.done"
######
# WRAPPER RULES
######
# Added the `shell:` directive to keep this rule from being auto-detected as a `localrule`; this is needed so that an email can be sent upon event changes for this rule.
rule BASECALL_MERGE_QC:
input:
expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt"), run=RUNS, barcode=BARCODES)
output: "basecall_merge_qc.done"
shell:
"""
touch {output}
"""
rule BASECALL_MERGE_QC_NO_MOD:
input:
expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES)
output: "basecall_merge_qc_NO_MOD.done"
shell:
"""
touch {output}
"""
rule DUMMY_FOLDER:
input:
# expand(os.path.join(RESULTS_DIR, "basecalled/{run}/dummy_folder.done"), run=RUNS),
expand(os.path.join(RESULTS_DIR, "basecalled{type}/{run}/dummy_folder.done"), run=RUNS, type=["", "_NO_MOD"])
output: "dummy_folders_created.done"
shell:
"""
touch {output}
"""
######
# RULES
######
# Basecalling the multi-FAST5 files.
# Currently requires a dummy target; ideally this would be a "real" file target. However, the files are filled incrementally, so checking only for file presence cannot tell whether the whole process succeeded.
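# A possible alternative (untested sketch): mark the run directory itself as the output
# with Snakemake's directory() flag, so the rule only completes once Guppy has finished
# writing into it:
#   output: directory(os.path.join(RESULTS_DIR, "basecalled/{run}"))
# This still cannot verify per-file integrity; the checkpoint-based variant of this
# workflow (further below in this commit) takes that route.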
rule guppy_gpu_basecall:
input: os.path.join(DATA_DIR, "multifast5/{run}")
output: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling.done")
log: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling")
threads: config["guppy_gpu"]["cpu_threads"]
shell:
"""
(date &&\
{config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule guppy_gpu_basecall_NO_MOD:
input: os.path.join(DATA_DIR, "multifast5/{run}")
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling.done")
log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling")
threads: config["guppy_gpu"]["cpu_threads"]
shell:
"""
(date &&\
{config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][hac_config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# Demultiplex.
# Currently requires a dummy target; ideally this would be a "real" file target. However, the files are filled incrementally, so checking only for file presence cannot tell whether the whole process succeeded.
rule guppy_gpu_demux:
input: rules.guppy_gpu_basecall.output
output: os.path.join(RESULTS_DIR, "basecalled/{run}/demux.done")
log: os.path.join(RESULTS_DIR, "basecalled/{run}/demux")
threads: config["guppy_barcoder"]["threads"]
shell:
"""
(date &&\
{config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule guppy_gpu_demux_NO_MOD:
input: rules.guppy_gpu_basecall_NO_MOD.output
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux_NO_MOD.done")
log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux")
threads: config["guppy_barcoder"]["threads"]
shell:
"""
(date &&\
{config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# Modifications (IMPORTANT)
# Since the 2019_GDB sample was NOT barcoded, the demux folders are obsolete.
# The section below gathers all the fastq.gz files from basecalling into a "dummy" folder.
# The "dummy" folder is used downstream in the same way as the "barcode" folders from the 2018 analyses.
# The barcode entry in the CONFIG file is given the name of this dummy folder.
rule create_dummy_folder_basecalling:
input: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling.done")
output:
out2=os.path.join(RESULTS_DIR, "basecalled/{run}/dummy_folder.done")
log: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode")
shell:
"""
(date &&\
mkdir $(dirname {input})/no_barcode &&\
cp -vrf $(dirname {input})/*.gz $(dirname {input})/no_barcode/. &&\
touch {output.out2} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# for file in $(dirname {input})/*.gz; do ln -s "$file" $(dirname {input})/no_barcode/`basename "$file"`; done &&\
rule create_dummy_folder_NO_MOD_basecalling:
input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling.done")
output:
out2=os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/dummy_folder.done")
log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode")
shell:
"""
(date &&\
mkdir $(dirname {input})/no_barcode &&\
cp -vrf $(dirname {input})/*.gz $(dirname {input})/no_barcode/. &&\
touch {output.out2} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# for file in $(dirname {input})/*.gz; do ln -s "$file" $(dirname {input})/no_barcode/`basename "$file"`; done &&\
#
# Merge multiple basecalled files, i.e., FASTQ files per barcode and run. Each multi-FAST5 file will produce a respective FASTQ file.
# TODO: Simplify output path to `basecalled/{run}/{barcode}.fastq.gz`. This will require changing `$(dirname {output})` to `$(dirname {output})/{wildcards.barcode}`.
rule merge_individual_fastq_per_barcode:
# input: rules.create_dummy_folder_basecalling.output.out1
input: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode")
output: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz")
shell:
"""
date
cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
touch {output}
date
"""
rule merge_individual_fastq_per_barcode_NO_MOD:
# input: rules.create_dummy_folder_NO_MOD_basecalling.output.out1
input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode")
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz")
shell:
"""
date
cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
touch {output}
date
"""
# Helper function to get the input in order to create a *single* fastq file for all runs with the *same* barcode
# TODO: Adjust according to TODO-merge_individual_fastq_per_barcode
def get_all_fastq_for_barcode(wildcards):
return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
def get_all_fastq_for_barcode_NO_MOD(wildcards):
return expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
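# For illustration (hypothetical values): with RUNS = ["run1", "run2"], a request for
# basecalled/merged/no_barcode.fastq.gz calls get_all_fastq_for_barcode with
# wildcards.barcode == "no_barcode", which expands to
#   basecalled/run1/no_barcode/no_barcode.fastq.gz
#   basecalled/run2/no_barcode/no_barcode.fastq.gz
# i.e., one merged input per run for the rule below.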
# Merge basecalled files (FASTQ) per barcode, i.e., combine reads from multiple runs according to their barcode.
rule get_single_fastq_per_barcode:
input: get_all_fastq_for_barcode
output: os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz")
shell:
"""
date
cat {input} > {output}
date
"""
rule get_single_fastq_per_barcode_NO_MOD:
input: get_all_fastq_for_barcode_NO_MOD
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz")
shell:
"""
date
cat {input} > {output}
date
"""
# Get basic statistics of the ONT data; over all runs
# TODO: Simplify top of path: replace `lr` by `nanostat`
# TODO: Write more generic: have patterned `input: os.path.join(RESULTS_DIR, "basecalled/{prefix}/{barcode}.fastq.gz")` and patterned `output: os.path.join(RESULTS_DIR, "qc/nanostat/{prefix}/{barcode}/{barcode}NanoStats.txt")` (a sketch follows the two merged-QC rules below)
#
rule nanostat:
input: rules.get_single_fastq_per_barcode.output
output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt")
log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats")
conda: "../envs/nanostat.yaml"
shell:
"""
(date &&\
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule nanostat_NO_MOD:
input: rules.get_single_fastq_per_barcode_NO_MOD.output
output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt")
log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD")
conda: "../envs/nanostat.yaml"
shell:
"""
(date &&\
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# TODO: Deprecated if TODO-nanostat is completed
# Get basic statistics of the ONT data; per run
rule nanostat_per_run:
input: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz")
output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt")
wildcard_constraints:
run='S.*'
conda: "../envs/nanostat.yaml"
shell:
"""
date
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output})
date
"""
rule nanostat_per_run_NO_MOD:
input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz")
output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt")
wildcard_constraints:
run='S.*'
conda: "../envs/nanostat.yaml"
shell:
"""
date
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output})
date
"""
# File for running ONT analyses
# default configuration file
configfile:"config/CONFIG.yaml"
# default executable for snakmake
shell.executable("bash")
# input settings
RUNS=config['runs']['first']
STEPS=config['steps']
# include rules for the workflows based on "steps" in the CONFIG.yaml file
# ONT analyses workflow
TARGETS = []
if 'assembly_annotation' in STEPS:
include: "workflows/checkpoint_assembly_annotation.smk"
TARGETS += ["assemble_and_coverage.done",
"annotate.done",
"basecall_merge_qc.done",
"coverage_of_references.done",
"prodigal_gene_call.done",
"diamond_proteins.done"]
if 'mmseq' in STEPS:
include: "workflows/mmseq.smk"
TARGETS += ["mmseq_comparison_for_ont.done"]
if 'metaT' in STEPS:
include: "workflows/metat.smk"
TARGETS += ["metaT_mapping_for_ONT.done"]
if 'mapping' in STEPS:
include: "workflows/mapping.smk"
TARGETS += ["mapping_for_binning.done"]
if 'binning' in STEPS:
include: "workflows/binning.smk"
TARGETS += ["binning_for_ont.done"]
if 'taxonomy' in STEPS:
include: "workflows/taxonomy.smk"
TARGETS += ["taxonomy_for_ont.done"]
if 'analysis' in STEPS:
include: "workflows/analysis.smk"
TARGETS += ["analysis.done"]
#else:
# raise Exception('You are not serious. No input data')
# print("No input data provided")
rule all:
input:
TARGETS
# Workflow for running the mapping steps of different assemblers and mappers for the binning workflow
#shell.executable("/bin/bash")
#shell.prefix("source ~/.bashrc; ")
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
SAMPLES=["flye", "megahit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
# specify which rules to run
#include:
# '../rules/MAPPING_RULES'
# Rule all for mapping
rule MAPPING:
input:
expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"), hybrid_assembler=HYBRID_ASSEMBLER),
expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_{reads}_{hybrid_assembler}/{mapper}_{reads}_{hybrid_assembler}.bam"), mapper=MAPPERS, reads=["sr", "lr"], hybrid_assembler=HYBRID_ASSEMBLER),
expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"), hybrid_assembler=HYBRID_ASSEMBLER),
expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam"), mapper=MAPPERS, hybrid_assembler=HYBRID_ASSEMBLER)
output:
touch('mapping_for_binning.done')
#######################
## Mapping to hybrid ##
#######################
rule build_hybrid_bwa_index:
input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER)
output:
# done=touch("bwa_index.done")
index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.amb"),
index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.ann"),
index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwt"),
index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.pac"),
index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.sa")
params:
prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}")
log: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwa_index")
conda: "../envs/bwa.yaml"
shell: """
(date &&\
bwa index {input} -p {params.prefix} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_map_short_reads_to_hybrid_contigs_w_bwa_mem:
input:
query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"),
query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"),
index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"),
index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"),
index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"),
index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"),
index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"),
index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER)
output:
os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}.bam")
params:
index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"),
out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}")
log: os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/sr_contigs.bwa_mem_samtools")
conda: "../envs/bwa.yaml"
threads:
config["bwa"]["threads"]
# wildcard_constraints:
# sample='\w+'
shell: """
(date &&\
bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_map_long_reads_to_hybrid_contigs_w_bwa_mem:
input:
query_lr=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"),
index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"),
index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"),
index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"),
index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"),
index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER)
output:
os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}.bam")
params:
index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"),
out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}"),
log: os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/lr_contigs.bwa_mem_samtools")
conda: "../envs/bwa.yaml"
threads: config["bwa"]["threads"]
# wildcard_constraints:
# sample='\w+'
shell: """
(date &&\
bwa mem -x ont2d -t {threads} {params.index_prefix} {input.query_lr} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_build_minimap2_long_read_contigs_ont_index:
input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER)
output: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi")
conda: "../envs/minimap2.yaml"
shell:
"""
date
minimap2 -x map-ont -d {output} {input}
date
"""
rule hybrid_map_long_reads_to_hybrid_contigs_w_minimap2:
input:
query=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode="no_barcode"),
index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"),
index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER)
output: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}.bam")
params:
prefix=os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}")
conda: "../envs/minimap2.yaml"
threads: config["minimap2"]["threads"]
log: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/metaspades.minimap2_samtools")
shell: """
(date &&\
minimap2 -a -t {threads} {input.index} {input.query} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_map_short_reads_to_hybrid_contigs_w_minimap2:
input:
query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"),
query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"),
index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"),
index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode="no_barcode", hybrid_assembler=HYBRID_ASSEMBLER)
output: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}.bam")
params:
prefix=os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}")
conda: "../envs/minimap2.yaml"
threads: config["minimap2"]["threads"]
log: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/metaspades.minimap2_samtools")
shell:
"""
(date &&\
minimap2 -a -t {threads} {input.index} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_merge_bam_files_for_metaspades:
input:
bam_sr=os.path.join(RESULTS_DIR, "mapping/{mapper}_sr_{hybrid_assembler}/{mapper}_sr_{hybrid_assembler}.bam"),
bam_lr=os.path.join(RESULTS_DIR, "mapping/{mapper}_lr_{hybrid_assembler}/{mapper}_lr_{hybrid_assembler}.bam")
output: os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam")
conda: "../envs/binning.yaml"
shell:
"""
(date &&\
samtools merge {output} {input.bam_sr} {input.bam_lr} &&\
date)
"""
# For running only the BASECALLING part of the workflow for ONT data
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
RUNS=[config["runs"]["first"],
config["runs"]["second"]]
# config["runs"]["third"]]
BARCODES=config["barcodes"]
SAMPLES=["ONT3_MG_xx_Rashi_S11"]
sr_sample=["ONT3_MG_xx_Rashi_S11"]
# SAMPLES_ALL=config["samples"]
REFERENCES=["igc", "hg38"]
IGC_URI=config["igc"]["uri"]
HG38_URI=config["hg38"]["uri"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa_mem", "minimap2"]
# Rule all for running the assembly and annotation
rule all:
input:
"basecall_merge_qc.done",
"basecall_merge_qc_NO_MOD.done"
######
# WRAPPER RULES
######
# Added the `shell:` directive to keep this rule from being auto-detected as a `localrule`; this is needed so that an email can be sent upon event changes for this rule.
rule BASECALL_MERGE_QC:
input:
expand(os.path.join(RESULTS_DIR, "basecalled/merged/merged.fastq.gz"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats.txt"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{run}NanoStats.txt"), run=RUNS, barcode=BARCODES)
output: "basecall_merge_qc.done"
shell:
"""
touch {output}
"""
rule BASECALL_MERGE_QC_NO_MOD:
input:
expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/merged.fastq.gz"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats_NO_MOD.txt"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{run}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES)
output: "basecall_merge_qc_NO_MOD.done"
shell:
"""
touch {output}
"""
######
# RULES
######
# Basecalling the multi-FAST5 files.
# Currently requires a dummy target; ideally this would be a "real" file target. However, the files are filled incrementally, so checking only for file presence cannot tell whether the whole process succeeded.
rule guppy_gpu_basecall:
input: os.path.join(DATA_DIR, "multifast5/{run}")
output: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling.done")
log: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling")
threads: config["guppy_gpu"]["cpu_threads"]
shell:
"""
(date &&\
{config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule guppy_gpu_basecall_NO_MOD:
input: os.path.join(DATA_DIR, "multifast5/{run}")
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling.done")
log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling")
threads: config["guppy_gpu"]["cpu_threads"]
shell:
"""
(date &&\
{config[guppy_gpu][bin]} --input_path {input} --save_path $(dirname {output}) --config {config[guppy_gpu][hac_config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# Demultiplex.
# Currently requires a dummy target; ideally this would be a "real" file target. However, the files are filled incrementally, so checking only for file presence cannot tell whether the whole process succeeded.
# Since the 2019_GDB run did not have any barcodes, the demultiplexing step is not required
#rule guppy_gpu_demux:
# input: rules.guppy_gpu_basecall.output
# output: os.path.join(RESULTS_DIR, "basecalled/{run}/demux.done")
# log: os.path.join(RESULTS_DIR, "basecalled/{run}/demux")
# threads: config["guppy_barcoder"]["threads"]
# shell:
# """
# (date &&\
# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
# touch {output} &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
#rule guppy_gpu_demux_NO_MOD:
# input: rules.guppy_gpu_basecall_NO_MOD.output
# output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux_NO_MOD.done")
# log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux")
# threads: config["guppy_barcoder"]["threads"]
# shell:
# """
# (date &&\
# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
# touch {output} &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
# Merge multiple basecalled files, i.e., FASTQ files per barcode and run. Each multi-FAST5 file will produce a respective FASTQ file.
# TODO: Simplify output path to `basecalled/{run}/{barcode}.fastq.gz`. This will require changing `$(dirname {output})` to `$(dirname {output})/{wildcards.barcode}`.
rule merge_individual_fastq_per_barcode:
input: os.path.join(RESULTS_DIR, "basecalled/{run}")
output: os.path.join(RESULTS_DIR, "basecalled/{run}/{run}.fastq.gz")
shell:
"""
date
cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
touch {output}
date
"""
rule merge_individual_fastq_per_barcode_NO_MOD:
input: os.path.join(RESULTS_DIR, "basecalled/{run}")
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{run}.fastq.gz")
shell:
"""
date
cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
touch {output}
date
"""
## Helper function to get the input in order to create a *single* fastq file for all runs with the *same* barcode
## TODO: Adjust according to TODO-merge_individual_fastq_per_barcode
#def get_all_fastq_for_barcode(wildcards):
# return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
#
#def get_all_fastq_for_barcode_NO_MOD(wildcards):
# return expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
# Merge basecalled files (FASTQ) per barcode, i.e., combine reads from multiple runs according to their barcode.
rule get_single_fastq_per_barcode:
input: os.path.join(RESULTS_DIR, "basecalled/{run}/{run}.fastq.gz")
output: os.path.join(RESULTS_DIR, "basecalled/{run}/../merged/merged.fastq.gz")
shell:
"""
date
cat {input} > {output}
date
"""
rule get_single_fastq_per_barcode_NO_MOD:
input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{run}.fastq.gz")
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/../merged/merged.fastq.gz")
shell:
"""
date
cat {input} > {output}
date
"""
# Get basic statistics of the ONT data; over all runs
# TODO: Simplify top of path: replace `lr` by `nanostat`
# TODO: Write more generic: have patterned `input: os.path.join(RESULTS_DIR, "basecalled/{prefix}/{barcode}.fastq.gz")` and patterned `output: os.path.join(RESULTS_DIR, "qc/nanostat/{prefix}/{barcode}/{barcode}NanoStats.txt")`
#
rule nanostat:
input: os.path.join(RESULTS_DIR, "basecalled/merged/merged.fastq.gz")
output: os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats.txt")
log: os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats")
conda: "../envs/nanostat.yaml"
shell:
"""
(date &&\
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix merged -n $(basename -s "" {output}) &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule nanostat_NO_MOD:
input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/merged.fastq.gz")
output: os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats_NO_MOD.txt")
log: os.path.join(RESULTS_DIR, "qc/lr/merged/merged_NanoStats_NO_MOD")
conda: "../envs/nanostat.yaml"
shell:
"""
(date &&\
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix merged -n $(basename -s "" {output}) &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# TODO: Deprecated if TODO-nanostat is completed
# Get basic statistics of the ONT data; per run
rule nanostat_per_run:
input: os.path.join(RESULTS_DIR, "basecalled/{run}/{run}.fastq.gz")
output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{run}NanoStats.txt")
conda: "../envs/nanostat.yaml"
shell:
"""
date
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.run} -n $(basename -s "" {output})
date
"""
rule nanostat_per_run_NO_MOD:
input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{run}.fastq.gz")
output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{run}NanoStats_NO_MOD.txt")
conda: "../envs/nanostat.yaml"
shell:
"""
date
NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.run} -n $(basename -s "" {output})
date
"""
# For running only the BASECALLING part of the workflow for ONT data
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
RUNS=[config["runs"]["first"],
config["runs"]["second"]]
# config["runs"]["third"]]
BARCODES=config["barcodes"]
SAMPLES=["ONT3_MG_xx_Rashi_S11"]
sr_sample=["ONT3_MG_xx_Rashi_S11"]
# SAMPLES_ALL=config["samples"]
REFERENCES=["igc", "hg38"]
IGC_URI=config["igc"]["uri"]
HG38_URI=config["hg38"]["uri"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa_mem", "minimap2"]
# Rule all for running the assembly and annotation
rule all:
input:
"basecall_merge_qc.done",
"basecall_merge_qc_NO_MOD.done",
"dummy_folders_created.done"
######
# WRAPPER RULES
######
# Added the `shell:` directive to keep this rule from being auto-detected as a `localrule`; this is needed so that an email can be sent upon event changes for this rule.
rule BASECALL_MERGE_QC:
input:
# expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES)
expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}.fastq.gz"), run=RUNS, barcode=BARCODES)
# expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt"), barcode=BARCODES),
# expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt"), run=RUNS, barcode=BARCODES)
output: "basecall_merge_qc.done"
shell:
"""
touch {output}
"""
rule BASECALL_MERGE_QC_NO_MOD:
input:
# expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz"), barcode=BARCODES)
expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}.fastq.gz"), run=RUNS, barcode=BARCODES)
# expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt"), barcode=BARCODES),
# expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES)
output: "basecall_merge_qc_NO_MOD.done"
shell:
"""
touch {output}
"""
rule DUMMY_FOLDER:
input:
# expand(os.path.join(RESULTS_DIR, "basecalled/{run}/dummy_folder.done"), run=RUNS),
expand(os.path.join(RESULTS_DIR, "basecalled{type}/{run}/dummy_folder.done"), run=RUNS, type=["", "_NO_MOD"])
output: "dummy_folders_created.done"
shell:
"""
touch {output}
"""
######
# RULES
######
# Basecalling the multi-FAST5 files.
# Currently requires a dummy target; ideally this would be a "real" file target. However, the files are filled incrementally, so checking only for file presence cannot tell whether the whole process succeeded.
checkpoint guppy_gpu_basecall:
input: os.path.join(DATA_DIR, "multifast5/{run}")
output: directory(os.path.join(RESULTS_DIR, "basecalled/{run}"))
log: os.path.join(RESULTS_DIR, "basecalled/{run}/basecalling")
threads: config["guppy_gpu"]["cpu_threads"]
shell:
"""
(date &&\
{config[guppy_gpu][bin]} --input_path {input} --save_path {output} --config {config[guppy_gpu][config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
checkpoint guppy_gpu_basecall_NO_MOD:
input: os.path.join(DATA_DIR, "multifast5/{run}")
output: directory(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}"))
log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/NO_MOD_basecalling")
# wildcard_constraints:
# run='S.*'
# run='S.*^\/'
# run='\w+'
# run="^no.*"
threads: config["guppy_gpu"]["cpu_threads"]
shell:
"""
(date &&\
{config[guppy_gpu][bin]} --input_path {input} --save_path {output} --config {config[guppy_gpu][hac_config]} --cpu_threads_per_caller {threads} -x {config[guppy_gpu][gpu_device]} --disable_pings --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --chunk_size {config[guppy_gpu][chunk_size]} --chunks_per_runner {config[guppy_gpu][chunks_per_runner]} --gpu_runners_per_device {config[guppy_gpu][runners_per_device]} --num_callers {config[guppy_gpu][num_callers]} && \
touch {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
## Demultiplex.
## Currently requires dummy target, but would be nice to have this be a "real" file target. However, it seems that files are filled incrementally, hence there is no check if the whole process was successful or not if checking only for file presence.
#rule guppy_gpu_demux:
# input: rules.guppy_gpu_basecall.output
# output: os.path.join(RESULTS_DIR, "basecalled/{run}/demux.done")
# log: os.path.join(RESULTS_DIR, "basecalled/{run}/demux")
# threads: config["guppy_barcoder"]["threads"]
# shell:
# """
# (date &&\
# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
# touch {output} &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
#rule guppy_gpu_demux_NO_MOD:
# input: rules.guppy_gpu_basecall_NO_MOD.output
# output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux_NO_MOD.done")
# log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/demux")
# threads: config["guppy_barcoder"]["threads"]
# shell:
# """
# (date &&\
# {config[guppy_barcoder][bin]} --input_path $(dirname {input}) --save_path $(dirname {output}) -t {threads} --compress_fastq --records_per_fastq {config[guppy_gpu][records_per_fastq]} --trim_barcodes && \
# touch {output} &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
# Modifications (IMPORTANT)
# Since the 2019_GDB sample was NOT barcoded, the demux folders are obsolete.
# The section below gathers all the fastq.gz files from basecalling into a "dummy" folder.
# The "dummy" folder is used downstream in the same way as the "barcode" folders from the 2018 analyses.
# The barcode entry in the CONFIG file is given the name of this dummy folder.
rule intermediate_basecalling:
input: os.path.join(RESULTS_DIR, "basecalled/{run}/{i}.fastq.gz")
output: os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{i}.fastq.gz")
log: os.path.join(RESULTS_DIR, "basecalled/{run}/no_barcode_{i}")
shell:
"""
(date &&\
ln -s {input} {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule intermediate_NO_MOD_basecalling:
input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{i}.fastq.gz")
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_nobarcode/{i}.fastq.gz")
log: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode_{i}")
shell:
"""
(date &&\
ln -s {input} {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
def aggregate_dummy_basecalling(wildcards):
checkpoint_output = checkpoints.guppy_gpu_basecall.get(**wildcards).output[0]
return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/no_nobarcode/{id}.fastq.gz"),
run=wildcards.run,
i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fastq.gz")).i)
def aggregate_dummy_basecalling_NO_MOD(wildcards):
dummy_output=checkpoints.guppy_gpu_basecall_NO_MOD.get(**wildcards).output[0]
return expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/no_barcode/{i}.fastq.gz"),
run=wildcards.run,
i=glob_wildcards(os.path.join(dummy_output, "{i}.fastq.gz")).i)
# Merge multiple basecalled files, i.e., FASTQ files per barcode and run. Each multi-FAST5 file will produce a respective FASTQ file.
# TODO: Simplify output path to `basecalled/{run}/{barcode}.fastq.gz`. This will require changing `$(dirname {output})` to `$(dirname {output})/{wildcards.barcode}`.
rule merge_individual_fastq_per_barcode:
input: aggregate_dummy_basecalling
output: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz")
shell:
"""
date
cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
touch {output}
date
"""
rule merge_individual_fastq_per_barcode_NO_MOD:
input: aggregate_dummy_basecalling_NO_MOD
output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz")
shell:
"""
date
cat $(find $(dirname {output}) -name "*.fastq.gz" | sort) > {output}
touch {output}
date
"""
## Helper function to get the input in order to create a *single* fastq file for all runs with the *same* barcode
## TODO: Adjust according to TODO-merge_individual_fastq_per_barcode
#def get_all_fastq_for_barcode(wildcards):
# return expand(os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
#
#def get_all_fastq_for_barcode_NO_MOD(wildcards):
# return expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz"), run=RUNS, barcode=wildcards.barcode)
#
## Merge basecalled files (FASTQ) per barcode, i.e., combine reads from multiple runs according to their barcode.
#rule get_single_fastq_per_barcode:
# input: get_all_fastq_for_barcode
# output: os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz")
# shell:
# """
# date
# cat {input} > {output}
# date
# """
#rule get_single_fastq_per_barcode_NO_MOD:
# input: get_all_fastq_for_barcode_NO_MOD
# output: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz")
# shell:
# """
# date
# cat {input} > {output}
# date
# """
#
## Get basic statistics of the ONT data; over all runs
## TODO: Simplify top of path: replace `lr` by `nanostat`
## TODO: Write more generic: have patterned `input: os.path.join(RESULTS_DIR, "basecalled/{prefix}/{barcode}.fastq.gz")` and patterned `output: os.path.join(RESULTS_DIR, "qc/nanostat/{prefix}/{barcode}/{barcode}NanoStats.txt")`
##
#rule nanostat:
# input: rules.get_single_fastq_per_barcode.output
# output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt")
# log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats")
# conda: "../envs/nanostat.yaml"
# shell:
# """
# (date &&\
# NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
#
#rule nanostat_NO_MOD:
# input: rules.get_single_fastq_per_barcode_NO_MOD.output
# output: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt")
# log: os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD")
# conda: "../envs/nanostat.yaml"
# shell:
# """
# (date &&\
# NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output}) &&\
# date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
# """
#
## TODO: Deprecated if TODO-nanostat is completed
## Get basic statistics of the ONT data; per run
#rule nanostat_per_run:
# input: os.path.join(RESULTS_DIR, "basecalled/{run}/{barcode}/{barcode}.fastq.gz")
# output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt")
# wildcard_constraints:
# run='S.*'
# conda: "../envs/nanostat.yaml"
# shell:
# """
# date
# NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output})
# date
# """
#
#rule nanostat_per_run_NO_MOD:
# input: os.path.join(RESULTS_DIR, "basecalled_NO_MOD/{run}/{barcode}/{barcode}.fastq.gz")
# output: os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt")
# wildcard_constraints:
# run='S.*'
# conda: "../envs/nanostat.yaml"
# shell:
# """
# date
# NanoStat --fastq {input} --outdir $(dirname {output}) --prefix {wildcards.barcode} -n $(basename -s "" {output})
# date
# """
#
#