Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
assembly_annotation.smk 8.93 KiB
# For running the ASSEMBLY and ANNOTATION part of the ONT pipeline

# specify which rules to run
include:
    '../rules/assembly_annotation.smk'

# Rule all for running the assembly and annotation
# rule ASSEMBLY_ANNOTATION:
#     input:
#         "assemble_and_coverage.done",
#         "basecall_merge_qc.done",
#         "basecall_merge_qc_NO_MOD.done",
#         "coverage_of_references.done",
#         "annotate.done",
#         "prodigal_gene_call.done",
#         "diamond_proteins.done"

######
# WRAPPER RULES
######
# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule.
rule BASECALL_MERGE_QC:
    input:
        expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt"), barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt"), run=RUNS, barcode=BARCODES),
#        expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample=["ONT3_MG_xx_Rashi_S11"]),
        expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}.fastp.{report_type}"), sr_sample=SAMPLES, report_type=["html", "json"]),
        expand(os.path.join(RESULTS_DIR, "qc/sr/fastqc/{sr_sample}/{sr_sample}_R{orientation}_001.fastp.fastqc.html"), sr_sample=SAMPLES, orientation=["1", "2"])
    output:
        "basecall_merge_qc.done"
    shell:
        "touch {output}"

rule BASECALL_MERGE_QC_NO_MOD:
    input:
        expand(os.path.join(RESULTS_DIR, "basecalled_NO_MOD/merged/{barcode}.fastq.gz"), barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats_NO_MOD.txt"), barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats_NO_MOD.txt"), run=RUNS, barcode=BARCODES)
    output:
        "basecall_merge_qc_NO_MOD.done"
    shell:
        "touch {output}"

rule COVERAGE_OF_REFERENCES:
    input: 
        expand(os.path.join(RESULTS_DIR, "genomecov/sr/minimap2/{sr_sample}-x-{reference}.avg_cov.txt"), sr_sample=SAMPLES, reference=REFERENCES),
        expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_sample}-x-{reference}.avg_cov.txt"), sr_sample=SAMPLES, reference=REFERENCES),
        expand(os.path.join(RESULTS_DIR, "genomecov/lr/merged/{barcode}/{barcode}-x-{reference}.avg_cov.txt"), barcode=BARCODES, reference=REFERENCES)
    output:
        "coverage_of_references.done"
    shell:
        "touch {output}"

rule TEST:
    input:
        # Commented out as it crashes due to a Key error
        #expand(os.path.join(RESULTS_DIR, "assembly/rebaler/lr/merged/{barcode}/lr_{barcode}-sr_contigs_{sample}.fna"), barcode=BARCODES, sample=SAMPLES)
        expand(os.path.join(RESULTS_DIR, "annotation/diamond/lr/merged/{barcode}.diamond.daa"), barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sample}/contigs.fna"), barcode=BARCODES, sample="ONT3_MG_xx_Rashi_S11", assembler="metaspades_hybrid")

rule TEST_PRODIGAL:
    input:
        expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr/merged/{barcode}/assembly.faa"), assembler="flye", barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/polished/{polisher}/round_{round}/lr/merged/{barcode}/assembly_polished.faa"), assembler="flye", polisher=["medaka", "racon"], round="01", barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{sample}/final.contigs.faa"), assembler=["megahit", "metaspades"], sample="ONT3_MG_xx_Rashi_S11"),
        expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr_{barcode}-sr_{sample}/contigs.faa"), assembler="metaspades_hybrid", barcode=BARCODES, sample="ONT3_MG_xx_Rashi_S11")
    output:
        "prodigal_gene_call.done"
    shell:
        "touch {output}"

rule TEST_DIAMOND:
    input:
        expand(os.path.join(RESULTS_DIR, "annotation/diamond/{assembler}/lr/merged/{barcode}/assembly.tsv"), assembler="flye", barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "annotation/diamond/{assembler}/polished/{polisher}/round_{round}/lr/merged/{barcode}/assembly_polished.tsv"), assembler="flye", polisher=["medaka", "racon"], round="01", barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "annotation/diamond/{assembler}/{sample}/final.contigs.tsv"), assembler="megahit", sample="ONT3_MG_xx_Rashi_S11"),
        expand(os.path.join(RESULTS_DIR, "annotation/diamond/{assembler}/lr_{barcode}-sr_{sample}/contigs.tsv"), assembler="metaspades_hybrid", barcode=BARCODES, sample="ONT3_MG_xx_Rashi_S11")
    output:
        "diamond_proteins.done"
    shell:
        "touch {output}"

rule ASSEMBLE_AND_COVERAGE:
    input:
        # long reads on long read contigs
        expand(os.path.join(RESULTS_DIR, "genomecov/lr/merged/{barcode}/{barcode}_reads-x-{barcode}-{assembler}_contigs.avg_cov.txt"), barcode=BARCODES, assembler=ASSEMBLERS),
        # short reads on long read contigs
        expand(os.path.join(RESULTS_DIR, "genomecov/sr/minimap2/{sample}_reads-x-{barcode}-{assembler}_contigs.avg_cov.txt"), sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, assembler=ASSEMBLERS),
        # short reads on long read contigs
        expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sample}_reads-x-{barcode}-{assembler}_contigs.avg_cov.txt"), sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, assembler=ASSEMBLERS),
        #expand(os.path.join(RESULTS_DIR, "assembly/operams/lr_{barcode}-sr_{sample}/contigs.fna"), sample=SAMPLES, barcode=BARCODES),
        #expand(os.path.join(RESULTS_DIR, "mapping/lr/merged/{barcode}/{barcode}_reads-x-{barcode}-{assembler}_contigs.bam"), barcode=BARCODES, assembler="flye"),
        # short reads on hybrid contigs
        #expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sample}_reads-x-lr_{barcode}_sr_{sample}-{assembler}_contigs.avg_cov.txt"), sample=SAMPLES, barcode=BARCODES, assembler="operams"),
        # long reads on hybrid contigs
        #expand(os.path.join(RESULTS_DIR, "genomecov/lr/bwa_mem/{barcode}_reads-x-lr_{barcode}_sr_{sample}-{assembler}_contigs.avg_cov.txt"), barcode=BARCODES, sample=SAMPLES, assembler="operams"),
        expand(os.path.join(RESULTS_DIR, "assembly/flye/lr/merged/{barcode}/assembly.fna"), barcode=BARCODES),
        # short reads on short read contigs
        expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_sample}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), sr_sample=SAMPLES, assembler=["megahit", "metaspades"]),
        # assemble short reads with metaspades
       # expand(os.path.join(RESULTS_DIR, "assembly/metaspades/{sr_sample}/contigs.fna"), sr_sample=SAMPLES),
        # long reads on short read contigs
        expand(os.path.join(RESULTS_DIR, "genomecov/lr/bwa_mem/{barcode}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), barcode=BARCODES, sr_sample=SAMPLES, assembler="megahit")
    output:
        "assemble_and_coverage.done"
    shell:
        "touch {output}"

rule ANNOTATE:
    input:
        # short reads coverage and diversity estimate
        expand(os.path.join(RESULTS_DIR, "annotation/nonpareil/{sr_sample}_R{orientation}_001.fastp.npa"), sr_sample=["ONT3_MG_xx_Rashi_S11"], orientation=["1", "2"])
        # compute methylation frequencies
#        expand(os.path.join(RESULTS_DIR, "annotation/methylation/{assembler}/nanopolish/round_{round}/lr/merged/{barcode}/methylation_frequencies.tsv"), assembler="flye", barcode=BARCODES, round="01")
    output:
        "annotate.done"
    shell:
        "touch {output}"
        
rule POLISH_AND_COVERAGE:
    input:
        # short read-polished contigs from OPERA-MS
        #expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sample}/contigs.fna"), sample=SAMPLES, barcode=BARCODES, assembler="operams"),
        #long read polished contigs
        #expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/polished/{polisher}/round_{round}/lr/merged/{barcode}/assembly_polished.fna"), assembler="flye", polisher="medaka", round="01", barcode=BARCODES),
        # map long reads to long read polished contigs
        expand(os.path.join(RESULTS_DIR, "genomecov/polished/{polisher}/round_{round}/lr/merged/{barcode}/{barcode}_reads-x-{barcode}-{assembler}_polished_contigs.avg_cov.txt"), assembler="flye", polisher="medaka", round="01", barcode=BARCODES),
        # create nanonoplish index
        #expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz.index.readdb"), barcode=BARCODES)
        # polish flye contigs with nanopolish
        #expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/polished/nanopolish/round_{round}/lr/merged/{barcode}/assembly_polished.fna"), assembler="flye", round="01", barcode=BARCODES),
        # polish using racon
        expand(os.path.join(RESULTS_DIR, "assembly/{assembler}/polished/racon/round_{round}/lr/merged/{barcode}/assembly_polished.fna"), assembler="flye", round="01", barcode=BARCODES)