-
Valentina Galata authoredValentina Galata authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
assembly_annotation.smk 8.93 KiB
# For running the ASSEMBLY and ANNOTATION part of the ONT pipeline
# specify which rules to run
include:
'../rules/assembly_annotation.smk'
# Rule all for running the assembly and annotation
# rule ASSEMBLY_ANNOTATION:
# input:
# "assemble_and_coverage.done",
# "basecall_merge_qc.done",
# "basecall_merge_qc_NO_MOD.done",
# "coverage_of_references.done",
# "annotate.done",
# "prodigal_gene_call.done",
# "diamond_proteins.done"
######
# WRAPPER RULES
######
# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule.
rule BASECALL_MERGE_QC:
input:
expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt"), run=RUNS, barcode=BARCODES),
# expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample=["ONT3_MG_xx_Rashi_S11"]),
expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}.fastp.{report_type}"), sr_sample=SAMPLES, report_type=["html", "json"]),
expand(os.path.join(RESULTS_DIR, "qc/sr/fastqc/{sr_sample}/{sr_sample}_R{orientation}_001.fastp.fastqc.html"), sr_sample=SAMPLES, orientation=["1", "2"])
output:
"basecall_merge_qc.done"
shell:
"touch {output}"
rule BASECALL_MERGE_QC_NO_MOD:
input:
expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES)
output:
"basecall_merge_qc_NO_MOD.done"
shell:
"touch {output}"
rule COVERAGE_OF_REFERENCES:
input:
expand(os.path.join(RESULTS_DIR, "genomecov/sr/minimap2/{sr_sample}-x-{reference}.avg_cov.txt"), sr_sample=SAMPLES, reference=REFERENCES),
expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_sample}-x-{reference}.avg_cov.txt"), sr_sample=SAMPLES, reference=REFERENCES),
expand(os.path.join(RESULTS_DIR, "genomecov/lr/merged/{barcode}/{barcode}-x-{reference}.avg_cov.txt"), barcode=BARCODES, reference=REFERENCES)
output:
"coverage_of_references.done"
shell:
"touch {output}"
rule TEST:
input:
# Commented out as it crashes due to a Key error
#expand(os.path.join(RESULTS_DIR, "assembly/rebaler/lr/merged/{barcode}/lr_{barcode}-sr_contigs_{sample}.fna"), barcode=BARCODES, sample=SAMPLES)
expand(os.path.join(RESULTS_DIR, "annotation/diamond/lr/merged/{barcode}.diamond.daa"), barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sample}/contigs.fna"), barcode=BARCODES, sample="ONT3_MG_xx_Rashi_S11", assembler="metaspades_hybrid")
rule TEST_PRODIGAL:
input:
expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr/merged/{barcode}/assembly.faa"), assembler="flye", barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/polished/{polisher}/round_{round}/lr/merged/{barcode}/assembly_polished.faa"), assembler="flye", polisher=["medaka", "racon"], round="01", barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{sample}/final.contigs.faa"), assembler=["megahit", "metaspades"], sample="ONT3_MG_xx_Rashi_S11"),
expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr_{barcode}-sr_{sample}/contigs.faa"), assembler="metaspades_hybrid", barcode=BARCODES, sample="ONT3_MG_xx_Rashi_S11")
output:
"prodigal_gene_call.done"
shell:
"touch {output}"
rule TEST_DIAMOND:
input:
expand(os.path.join(RESULTS_DIR, "annotation/diamond/{assembler}/lr/merged/{barcode}/assembly.tsv"), assembler="flye", barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "annotation/diamond/{assembler}/polished/{polisher}/round_{round}/lr/merged/{barcode}/assembly_polished.tsv"), assembler="flye", polisher=["medaka", "racon"], round="01", barcode=BARCODES),
expand(os.path.join(RESULTS_DIR, "annotation/diamond/{assembler}/{sample}/final.contigs.tsv"), assembler="megahit", sample="ONT3_MG_xx_Rashi_S11"),
expand(os.path.join(RESULTS_DIR, "annotation/diamond/{assembler}/lr_{barcode}-sr_{sample}/contigs.tsv"), assembler="metaspades_hybrid", barcode=BARCODES, sample="ONT3_MG_xx_Rashi_S11")
output:
"diamond_proteins.done"
shell:
"touch {output}"
rule ASSEMBLE_AND_COVERAGE:
input:
# long reads on long read contigs
expand(os.path.join(RESULTS_DIR, "genomecov/lr/merged/{barcode}/{barcode}_reads-x-{barcode}-{assembler}_contigs.avg_cov.txt"), barcode=BARCODES, assembler=ASSEMBLERS),
# short reads on long read contigs
expand(os.path.join(RESULTS_DIR, "genomecov/sr/minimap2/{sample}_reads-x-{barcode}-{assembler}_contigs.avg_cov.txt"), sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, assembler=ASSEMBLERS),
# short reads on long read contigs
expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sample}_reads-x-{barcode}-{assembler}_contigs.avg_cov.txt"), sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, assembler=ASSEMBLERS),
#expand(os.path.join(RESULTS_DIR, "assembly/operams/lr_{barcode}-sr_{sample}/contigs.fna"), sample=SAMPLES, barcode=BARCODES),
#expand(os.path.join(RESULTS_DIR, "mapping/lr/merged/{barcode}/{barcode}_reads-x-{barcode}-{assembler}_contigs.bam"), barcode=BARCODES, assembler="flye"),
# short reads on hybrid contigs
#expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sample}_reads-x-lr_{barcode}_sr_{sample}-{assembler}_contigs.avg_cov.txt"), sample=SAMPLES, barcode=BARCODES, assembler="operams"),
# long reads on hybrid contigs
#expand(os.path.join(RESULTS_DIR, "genomecov/lr/bwa_mem/{barcode}_reads-x-lr_{barcode}_sr_{sample}-{assembler}_contigs.avg_cov.txt"), barcode=BARCODES, sample=SAMPLES, assembler="operams"),
expand(os.path.join(RESULTS_DIR, "assembly/flye/lr/merged/{barcode}/assembly.fna"), barcode=BARCODES),
# short reads on short read contigs
expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_sample}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), sr_sample=SAMPLES, assembler=["megahit", "metaspades"]),
# assemble short reads with metaspades
# expand(os.path.join(RESULTS_DIR, "assembly/metaspades/{sr_sample}/contigs.fna"), sr_sample=SAMPLES),
# long reads on short read contigs
expand(os.path.join(RESULTS_DIR, "genomecov/lr/bwa_mem/{barcode}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), barcode=BARCODES, sr_sample=SAMPLES, assembler="megahit")
output:
"assemble_and_coverage.done"
shell:
"touch {output}"
rule ANNOTATE:
input:
# short reads coverage and diversity estimate
expand(os.path.join(RESULTS_DIR, "annotation/nonpareil/{sr_sample}_R{orientation}_001.fastp.npa"), sr_sample=["ONT3_MG_xx_Rashi_S11"], orientation=["1", "2"])
# compute methylation frequencies
# expand(os.path.join(RESULTS_DIR, "annotation/methylation/{assembler}/nanopolish/round_{round}/lr/merged/{barcode}/methylation_frequencies.tsv"), assembler="flye", barcode=BARCODES, round="01")
output:
"annotate.done"
shell:
"touch {output}"
rule POLISH_AND_COVERAGE:
input:
# short read-polished contigs from OPERA-MS
#expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sample}/contigs.fna"), sample=SAMPLES, barcode=BARCODES, assembler="operams"),
#long read polished contigs
#expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/polished/{polisher}/round_{round}/lr/merged/{barcode}/assembly_polished.fna"), assembler="flye", polisher="medaka", round="01", barcode=BARCODES),
# map long reads to long read polished contigs
expand(os.path.join(RESULTS_DIR, "genomecov/polished/{polisher}/round_{round}/lr/merged/{barcode}/{barcode}_reads-x-{barcode}-{assembler}_polished_contigs.avg_cov.txt"), assembler="flye", polisher="medaka", round="01", barcode=BARCODES),
# create nanonoplish index
#expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz.index.readdb"), barcode=BARCODES)
# polish flye contigs with nanopolish
#expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/polished/nanopolish/round_{round}/lr/merged/{barcode}/assembly_polished.fna"), assembler="flye", round="01", barcode=BARCODES),
# polish using racon
expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/polished/racon/round_{round}/lr/merged/{barcode}/assembly_polished.fna"), assembler="flye", round="01", barcode=BARCODES)