ESB / ONT_pilot_gitlab · Merge request !78 (merged): Add zymo workflow
Valentina Galata requested to merge zymo into bright_future 4 years ago
(39 commits, 1 changed file)

Commit 1b38a514: "zymo: updated snakemake (gene summary, issue #111)"
File: workflow_zymo/Snakefile (new file, +530 −0)
# Pipeline for extra analysis of the zymo data:
# to be used after the main workflow (i.e. workflow/) and the report workflow (i.e. workflow_report/)
#
# Example call:
# snakemake -s workflow_zymo/Snakefile --configfile config/zymo/config.yaml --use-conda --conda-prefix /scratch/users/vgalata/miniconda3/ONT_pilot --cores 1 -rpn
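#
# Expected config structure (a hedged sketch; key names are inferred from the
# rules below, all values are placeholders for the real config/zymo/config.yaml):
#   ref_genomes: {<org>: <FASTA URL>, ...}
#   diamond: {threads: <int>}
#   hmm: {kegg: <HMM file name under DB_DIR>}
#   kraken2: {db: {<db name>: <path>, ...}}
#   kaiju: {db: {<db name>: <path>, ...}}
# Globals such as RESULTS_DIR, DB_DIR, ENV_DIR, SRC_DIR, READ_TYPES, ASSEMBLERS,
# READ_ASSEMBLERS and META_TYPES are presumably defined in rules/init.smk.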
##############################
# MODULES
import os
import re
import pandas
from scripts.utils import assembler_pairs
##############################
# CONFIG
include: "rules/init.smk"
##################################################
# RULES
##################################################
rule all:
    input:
        refs=os.path.join(RESULTS_DIR, "ref_genomes", "zymo.dmnd"),
        kegg=os.path.join(DB_DIR, "kegg.tsv"),
        mummer=expand(
            os.path.join(RESULTS_DIR, "extra/zymo/mummer/{rtype_tool}.dnadiff.report"),
            rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
        ),
        diamond=expand(
            os.path.join(RESULTS_DIR, "extra/zymo/diamond/{rtype_tool}.tsv"),
            rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
        ),
        summary=expand(
            os.path.join(RESULTS_DIR, "extra/{rtype_tool}.genes.tsv"),
            rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
        ),
        summary_pdf=expand(
            os.path.join(RESULTS_DIR, "extra/{rtype_tool}.genes.pdf"),
            rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
        )
#####
# expand(os.path.join(RESULTS_DIR, "analysis/sibeliaz/{combi}_output/blocks_coords.gff"),
# combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS]),
# expand(os.path.join(RESULTS_DIR, "analysis/zymo_sibeliaz/zymo_{rtype_tool}_output/blocks_coords.gff"),
# rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS])
rule zymo_download:
    output:
        os.path.join(RESULTS_DIR, "ref_genomes", "{org}.fna")
    log:
        "logs/zymo.download.{org}.log"
    params:
        url=lambda wildcards: config["ref_genomes"][wildcards.org]
    message:
        "Zymo: download reference: {wildcards.org}"
    shell:
        "(date && wget {params.url} -O {output} && date) &> {log}"
rule zymo_prodigal:
    input:
        rules.zymo_download.output
    output:
        faa=os.path.join(RESULTS_DIR, "ref_genomes", "{org}.faa"),
        gbk=os.path.join(RESULTS_DIR, "ref_genomes", "{org}.gbk")
    log:
        "logs/zymo.prodigal.{org}.log"
    conda:
        os.path.join(ENV_DIR, "prodigal.yaml")
    message:
        "Zymo: predict proteins: {wildcards.org}"
    shell:
        "(date && prodigal -i {input} -o {output.gbk} -a {output.faa} && sed -i 's/^>/>{wildcards.org}__/' {output.faa} && date) &> {log}"
rule zymo_concat_fna:
    input:
        expand(os.path.join(RESULTS_DIR, "ref_genomes", "{org}.fna"), org=config["ref_genomes"].keys())
    output:
        os.path.join(RESULTS_DIR, "ref_genomes", "zymo.fna")
    message:
        "Zymo: concat references"
    shell:
        "for f in {input}; do sed \"s/^>/>$(basename -s '.fna' ${{f}})__/\" ${{f}}; done > {output}"
rule zymo_concat_faa:
    input:
        expand(os.path.join(RESULTS_DIR, "ref_genomes", "{org}.faa"), org=config["ref_genomes"].keys())
    output:
        os.path.join(RESULTS_DIR, "ref_genomes", "zymo.faa")
    message:
        "Zymo: concat reference proteins"
    shell:
        "cat {input} > {output}"
rule zymo_diamondDB:
    input:
        rules.zymo_concat_faa.output
    output:
        os.path.join(RESULTS_DIR, "ref_genomes", "zymo.dmnd")
    log:
        "logs/zymo.diamondDB.log"
    threads:
        config["diamond"]["threads"]
    conda:
        os.path.join(ENV_DIR, "diamond.yaml")
    message:
        "Zymo: reference DIAMOND DB"
    shell:
        "(date && db={output} && diamond makedb --in {input} --db ${{db%.*}} --threads {threads} && date) &> {log}"
rule zymo_diamond:
    input:
        faa=os.path.join(RESULTS_DIR, "annotation/prodigal/{rtype}/{tool}/proteins.faa"),
        db=rules.zymo_diamondDB.output
    output:
        daa=os.path.join(RESULTS_DIR, "extra/zymo/diamond/{rtype}_{tool}.daa"),
        tsv=os.path.join(RESULTS_DIR, "extra/zymo/diamond/{rtype}_{tool}.tsv")
    log:
        "logs/zymo.diamond.{rtype}.{tool}.log"
    wildcard_constraints:
        rtype="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    threads:
        config["diamond"]["threads"]
    params:
        blastp="--id 50 --query-cover 50 --subject-cover 50 --more-sensitive",
        outfmt="6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
    conda:
        os.path.join(ENV_DIR, "diamond.yaml")
    message:
        "Zymo: DIAMOND search: {input}"
    shell:
        "(date && "
        "db={input.db} && "
        "daa={output.daa} && "
        "diamond blastp -q {input.faa} --db ${{db%.*}} --out ${{daa}} --threads {threads} --outfmt 100 {params.blastp} && "
        # --max-target-seqs 1: keep only the best match passing the e-value threshold
        "diamond view --daa ${{daa%.*}} --max-target-seqs 1 --threads {threads} --outfmt {params.outfmt} --out {output.tsv} && "
        "date) &> {log}"
rule zymo_mummer_dnadiff:
    input:
        ref=rules.zymo_concat_fna.output,
        asm=os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.POLISHED.fasta")
    output:
        out=os.path.join(RESULTS_DIR, "extra/zymo/mummer/{rtype}_{tool}.dnadiff.report")
    log:
        "logs/zymo.mummer.dnadiff.{rtype}.{tool}.log"
    wildcard_constraints:
        rtype="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    threads: 1
    conda:
        os.path.join(ENV_DIR, "mummer.yaml")
    message:
        "Zymo: MUMMER: dnadiff: {input}"
    shell:
        "(date && "
        "pout={output.out} && "
        "dnadiff -p ${{pout%.*}} {input.ref} {input.asm} && "
        "date) &> {log}"
rule proc_hmmer_tblout:
    input:
        os.path.join(RESULTS_DIR, "annotation/hmm/kegg/{rtype}/{tool}/hmm.tblout")
    output:
        temp(os.path.join(RESULTS_DIR, "annotation/hmm/kegg/{rtype}/{tool}/hmm.tsv"))
    shell:
        "grep -v '^#' {input} | sed 's/\s\+/\\t/g' | cut -f1-18 > {output}"
rule kegg_ko_ids:
    input:
        os.path.join(DB_DIR, config["hmm"]["kegg"])
    output:
        os.path.join(DB_DIR, "kegg.ids")
    shell:
        "grep \"^NAME\" {input} | sed 's/^NAME\s\+//' > {output}"
rule kegg_api_ko:
    input:
        os.path.join(DB_DIR, "kegg.ids")
    output:
        os.path.join(DB_DIR, "kegg.tsv")
    run:
        def query_ko(kos, concat=False):
            # TODO: move to utils.py
            # https://www.kegg.jp/kegg/rest/keggapi.html
            # GET: the input is limited to at most 10 entries
            import re
            import requests
            from time import sleep
            from random import random
            assert 1 <= len(kos) <= 10, "Number of KOs must be between 1 and 10, not {}".format(len(kos))
            for ko in kos:
                assert re.match("K", ko), f"KO does not start with a K: {ko}"
            kegg_info = dict()
            for ko in kos:
                kegg_info[ko] = {"kegg_acc": ko, "kegg_name": "NA", "kegg_def": "NA", "kegg_pathways": set()}
            s = requests.Session()
            s_get = s.get("http://rest.kegg.jp/get/{kos}".format(kos="+".join(kos)))
            assert s_get.status_code == 200, f"Could not get data: HTTP {s_get.status_code}"
            s_pws = s.get("http://rest.kegg.jp/link/pathway/{kos}".format(kos="+".join(kos)))
            assert s_pws.status_code == 200, f"Could not get linked pathways: HTTP {s_pws.status_code}"
            ko = None
            for line in s_get.text.splitlines():
                line = line.strip()
                # if re.match("///", line):  # new entry
                #     ko = None
                if re.match("ENTRY", line):
                    ko = re.sub("\s+", "\t", line).split("\t")[1]
                    assert ko in kegg_info
                if re.match("NAME", line):
                    assert kegg_info[ko]["kegg_name"] == "NA"
                    kegg_info[ko]["kegg_name"] = re.sub("^NAME\s+", "", line)
                elif re.match("DEFINITION", line):
                    assert kegg_info[ko]["kegg_def"] == "NA"
                    kegg_info[ko]["kegg_def"] = re.sub("^DEFINITION\s+", "", line)
            for line in s_pws.text.split("\n"):
                if line and re.search("path:map", line):
                    ko, pathway = line.split("\t")
                    ko = re.sub("^ko:", "", ko)
                    assert ko in kegg_info
                    kegg_info[ko]["kegg_pathways"].add(pathway)
            if concat:
                for ko in kos:
                    kegg_info[ko]["kegg_pathways"] = ";".join(list(kegg_info[ko]["kegg_pathways"])) if len(kegg_info[ko]["kegg_pathways"]) > 0 else "NA"
            # sleep for at least 1 sec between requests to be polite to the API
            sleep(1 + random())
            return kegg_info
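        # Hypothetical usage sketch (KO ids are placeholders, output abridged):
        #   query_ko(["K00001", "K00002"], concat=True)
        #   -> {"K00001": {"kegg_acc": "K00001", "kegg_name": "...",
        #                  "kegg_def": "...", "kegg_pathways": "path:map00010;..."}, ...}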
        with open(input[0], "r") as ifile, open(output[0], "w") as ofile:
            header = ["kegg_acc", "kegg_name", "kegg_def", "kegg_pathways"]
            ofile.write("\t".join(header) + "\n")
            kos = []
            for line in ifile:
                # flush a full batch of 10 KOs (the KEGG API limit per GET request)
                if len(kos) == 10:
                    q = query_ko(kos=kos, concat=True)
                    for v in q.values():
                        ofile.write("\t".join([v[k] for k in header]) + "\n")
                        ofile.flush()
                    kos = []
                ko = line.strip().split("_")[0]
                kos.append(ko)
            # query the last, possibly incomplete batch
            if len(kos) > 0:
                q = query_ko(kos=kos, concat=True)
                for v in q.values():
                    ofile.write("\t".join([v[k] for k in header]) + "\n")
                    ofile.flush()
rule assembly_gene_summary:
    input:
        # FASTAs
        contigs=os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.POLISHED.fasta"),
        proteins=os.path.join(RESULTS_DIR, "annotation/prodigal/{rtype}/{tool}/proteins.faa"),
        # taxonomy
        kraken2=expand(
            os.path.join(RESULTS_DIR, "taxonomy/kraken2/{{rtype}}.{{tool}}.{db}.labels.txt"),
            db=config["kraken2"]["db"].keys()
        ),
        kaiju=expand(
            os.path.join(RESULTS_DIR, "taxonomy/kaiju/{{rtype}}.{{tool}}.{db}.tsv"),
            db=config["kaiju"]["db"].keys()
        ),
        # coverage
        genecov=expand(
            os.path.join(RESULTS_DIR, "mapping/{mtype}/{{rtype}}/{{tool}}/ASSEMBLY.POLISHED.sr.cov.pergene"),
            mtype=META_TYPES
        ),
        # reference
        refhits=os.path.join(RESULTS_DIR, "extra/zymo/diamond/{rtype}_{tool}.tsv"),
        # unique/matched proteins
        cdhit=lambda wildcards: expand(
            os.path.join(RESULTS_DIR, "analysis/cdhit/{rtype1_tool1}__{{rtype}}_{{tool}}.faa"),
            rtype1_tool1=["%s_%s" % (r, t) for r, t in READ_ASSEMBLERS if r != wildcards.rtype or t != wildcards.tool]
        ),
        diamond=lambda wildcards: expand(
            os.path.join(RESULTS_DIR, "analysis/diamond/{rtype1_tool1}__{{rtype}}_{{tool}}.tsv"),
            rtype1_tool1=["%s_%s" % (r, t) for r, t in READ_ASSEMBLERS if r != wildcards.rtype or t != wildcards.tool]
        ),
        kegg=os.path.join(RESULTS_DIR, "annotation/hmm/kegg/{rtype}/{tool}/hmm.tsv"),
        kegg_info=os.path.join(DB_DIR, "kegg.tsv")
    output:
        os.path.join(RESULTS_DIR, "extra/{rtype}_{tool}.genes.tsv")
    wildcard_constraints:
        rtype="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    message:
        "Zymo: Gene summary: {wildcards.rtype}, {wildcards.tool}"
    run:
        from scripts.utils import parse_diamond_tsv, parse_prodigal_faa_headers

        def parse_assembly_fna(ifile):
            # TODO: move to utils.py
            import re
            import pandas
            contigs = {}
            with open(ifile, "r") as fh:
                cid = None
                for line in fh:
                    line = line.strip()
                    if re.match(">", line):
                        cid = re.sub("^>", "", line.split(" ")[0])
                        assert cid not in contigs
                        contigs[cid] = {"length": 0}
                    else:
                        contigs[cid]["length"] += len(line)
            contigs = pandas.DataFrame.from_dict(contigs, orient='index')
            return contigs
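        # parse_assembly_fna returns a DataFrame indexed by contig id with a single
        # "length" column (sequence length in bp), e.g. a row "contig_1 -> 412301";
        # the example value is hypothetical.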
        def parse_kaiju_labels(ifile, columns=["class_flag", "seq_id", "tax_id", "best_match_length", "best_match_tax_ids", "best_match_accs", "best_match_seq", "tax_lineage"]):
            # TODO: move to utils.py
            # https://github.com/bioinformatics-centre/kaiju#output-format
            # either C or U, indicating whether the read is classified or unclassified
            # name of the read
            # NCBI taxon identifier of the assigned taxon
            # the length or score of the best match used for classification
            # the taxon identifiers of all database sequences with the best match
            # the accession numbers of all database sequences with the best match
            # matching fragment sequence(s)
            import pandas
            df = pandas.read_csv(ifile, sep="\t", header=None, names=columns)
            df = df[["class_flag", "seq_id", "tax_id", "tax_lineage"]]
            df.set_index(["seq_id"], drop=True, inplace=True, verify_integrity=True)
            return df

        def parse_kraken2_labels(ifile, columns=["class_flag", "seq_id", "taxon", "seq_length", "matches"]):
            # TODO: move to utils.py
            # https://github.com/DerrickWood/kraken2/wiki/Manual#standard-kraken-output-format
            # "C"/"U": a one-letter code indicating that the sequence was either classified or unclassified
            # the sequence ID, obtained from the FASTA/FASTQ header
            # the taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence is unclassified
            # the length of the sequence in bp
            # a space-delimited list indicating the LCA mapping of each k-mer in the sequence(s)
            import pandas
            df = pandas.read_csv(ifile, sep="\t", header=None, names=columns)
            df = df.assign(tax_name=df["taxon"].apply(lambda x: re.search("(?P<tname>^.*) \(taxid (?P<tid>[0-9]+)\)$", x).group("tname")))
            df = df.assign(tax_id=df["taxon"].apply(lambda x: re.search("(?P<tname>^.*) \(taxid (?P<tid>[0-9]+)\)$", x).group("tid")))
            df.drop(columns=["matches", "taxon"], inplace=True)
            df.set_index(["seq_id"], drop=True, inplace=True, verify_integrity=True)
            return df

        def parse_genecov_table(ifile):
            # TODO: move to utils.py
            import pandas
            df = pandas.read_csv(ifile, sep="\t", header=0)
            df.set_index(["prot_id"], drop=True, inplace=True, verify_integrity=True)
            return df

        def parse_hmmer_tblout(ifile, columns=["tname", "tacc", "qname", "qacc", "fs_evalue", "fs_score", "fs_bias", "b1d_evalue", "b1d_score", "b1d_bias", "exp", "reg", "clu", "ov", "env", "dom", "rep", "inc"]):
            # TODO: move to utils.py
            import pandas
            df = pandas.read_csv(ifile, sep="\t", header=None, names=columns)
            df.drop(columns=["tacc", "qacc", "exp", "reg", "clu", "ov", "env", "dom", "rep", "inc"], inplace=True)
            # keep only the best hit per target (lowest best-1-domain e-value)
            df = df.sort_values(by=["tname", "b1d_evalue"], axis=0, ascending=[True, True]).groupby("tname").head(1).reset_index(drop=True)
            df.set_index(["tname"], drop=True, inplace=True, verify_integrity=True)
            return df
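        # All parsers above return DataFrames indexed by protein or contig id; the
        # summary table below is filled by per-row lookups into those indexes
        # (None where an id has no entry) rather than by DataFrame merges.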
        # contig IDs and their length
        contigs = parse_assembly_fna(input.contigs)
        # print(contigs.head())
        # genes/proteins
        genes = parse_prodigal_faa_headers(input.proteins)
        genes["gene_length"] = abs(genes["end"] - genes["start"]) + 1
        genes = genes.assign(partial=genes["info"].apply(lambda x: not re.search(";partial=00;", x)))
        # print(genes.head())
        # gene coverage per mapping type
        genecov = {}
        for ifile in input.genecov:
            mtype = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(ifile))))
            genecov[mtype] = parse_genecov_table(ifile)[["ave_cov"]]
            # print(genecov[mtype].head())
        # kraken2
        kraken2 = {}
        for ifile in input.kraken2:
            db = os.path.basename(ifile).split(".")[2]
            kraken2[db] = parse_kraken2_labels(ifile)
            # print(kraken2[db].head())
        # kaiju
        kaiju = {}
        for ifile in input.kaiju:
            db = os.path.basename(ifile).split(".")[2]
            kaiju[db] = parse_kaiju_labels(ifile)
            # print(kaiju[db].head())
        # hits to reference genomes
        refhits = parse_diamond_tsv(input.refhits)
        refhits.set_index(["qseqid"], drop=True, inplace=True, verify_integrity=True)
        # print(refhits.head())
        # unique proteins from CD-HIT
        unique = {}
        for ifile in input.cdhit:
            tool2 = os.path.basename(ifile).split("__")[0].split("_")[1]
            unique[tool2] = list(parse_prodigal_faa_headers(ifile)["prot_id"].apply(lambda x: "__".join(x.split("__")[2:])))
            # print("cdhit: {}: {}: {}".format(tool2, len(unique[tool2]), unique[tool2][0:5]))
        # matched proteins from DIAMOND hits
        matched = {}
        for ifile in input.diamond:
            tool2 = os.path.basename(ifile).split("__")[0].split("_")[1]
            matched[tool2] = parse_diamond_tsv(ifile)
            matched[tool2].set_index(["qseqid"], drop=True, inplace=True, verify_integrity=True)
            # print(matched[tool2].head())
        # kegg
        kegg = parse_hmmer_tblout(input.kegg)
        kegg_info = pandas.read_csv(input.kegg_info, sep="\t", header=0, index_col=0)
        # print(kegg.head())
        # summary
        summary = genes.copy()
        summary.set_index(["prot_id"], drop=False, inplace=True, verify_integrity=True)
        summary["contig_length"] = summary["contig_id"].apply(lambda x: contigs.loc[x, "length"])
        for k, v in genecov.items():
            summary["ave_cov_%s" % k] = summary["prot_id"].apply(lambda x: v.loc[x, "ave_cov"])
        for k, v in kraken2.items():
            summary["kraken2_%s_name" % k] = summary["contig_id"].apply(lambda x: v.loc[x, "tax_name"])
            summary["kraken2_%s_id" % k] = summary["contig_id"].apply(lambda x: v.loc[x, "tax_id"])
        for k, v in kaiju.items():
            summary["kaiju_%s_lineage" % k] = summary["prot_id"].apply(lambda x: v.loc[x, "tax_lineage"])
            summary["kaiju_%s_id" % k] = summary["prot_id"].apply(lambda x: v.loc[x, "tax_id"])
        summary["ref_sseqid"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "sseqid"] if x in refhits.index else None)
        summary["ref_pident"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "pident"] if x in refhits.index else None)
        summary["ref_qslratio"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "qs_len_ratio"] if x in refhits.index else None)
        summary["ref_qcov"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "qcov"] if x in refhits.index else None)
        summary["ref_scov"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "scov"] if x in refhits.index else None)
        for k, v in unique.items():
            summary["cdhit_unique_%s" % k] = False
            summary.loc[v, "cdhit_unique_%s" % k] = True
        for k, v in matched.items():
            summary["diamond_unique_%s" % k] = True
            summary.loc[v.index, "diamond_unique_%s" % k] = False
        summary["kegg"] = summary["prot_id"].apply(lambda x: kegg.loc[x, "qname"] if x in kegg.index else None)
        summary["kegg_evalue"] = summary["prot_id"].apply(lambda x: kegg.loc[x, "b1d_evalue"] if x in kegg.index else None)
        summary["kegg_name"] = summary["kegg"].apply(lambda x: kegg_info.loc[x.split("_")[0], "kegg_name"] if x is not None else None)
        summary["kegg_def"] = summary["kegg"].apply(lambda x: kegg_info.loc[x.split("_")[0], "kegg_def"] if x is not None else None)
        summary["kegg_pathways"] = summary["kegg"].apply(lambda x: kegg_info.loc[x.split("_")[0], "kegg_pathways"] if x is not None else None)
        # print(summary.head())
        # save
        summary.to_csv(output[0], sep="\t", header=True, index=False, na_rep="NA")
rule assembly_gene_summary_plots:
    input:
        genes=os.path.join(RESULTS_DIR, "extra/{rtype}_{tool}.genes.tsv")
    output:
        pdf=os.path.join(RESULTS_DIR, "extra/{rtype}_{tool}.genes.pdf")
    log:
        out=os.path.join(RESULTS_DIR, "extra/{rtype}_{tool}.genes.pdf.log")
    params:
        const=os.path.join(SRC_DIR, "const.R"),
        utils=os.path.join(SRC_DIR, "utils.R")
    conda:
        os.path.join(ENV_DIR, "r.yaml")
    script:
        os.path.join(SRC_DIR, "genes.R")
# #####################
# # Genome comparison #
# #####################
# rule sibeliaz:
# input:
# asm1=os.path.join(DATA_DIR, "assembly/{rtype1}/{tool1}/ASSEMBLY.POLISHED.fasta"),
# asm2=os.path.join(DATA_DIR, "assembly/{rtype2}/{tool2}/ASSEMBLY.POLISHED.fasta")
# output:
# os.path.join(RESULTS_DIR, "analysis/sibeliaz/{rtype1}_{tool1}__{rtype2}_{tool2}_output/blocks_coords.gff")
# log:
# out="logs/sibeliaz.{rtype1}.{tool1}.{rtype2}.{tool2}.out.log",
# err="logs/sibeliaz.{rtype1}.{tool1}.{rtype2}.{tool2}.err.log"
# threads:
# config["sibeliaz"]["threads"]
# params:
# kmers=config["sibeliaz"]["kmer"],
# memory=config["sibeliaz"]["mem"]
# wildcard_constraints:
# rtype1="|".join(READ_TYPES),
# rtype2="|".join(READ_TYPES),
# tool1="|".join(ASSEMBLERS),
# tool2="|".join(ASSEMBLERS)
# conda:
# os.path.join(ENV_DIR, "sibeliaz.yaml")
# message:
# "Genome comparision: Sibeliaz: {input.asm1} {input.asm2}"
# shell:
# "(date && "
# "sibeliaz -n -t {threads} -k {params.kmers} -f {params.memory} -o $(dirname {output}) {input.asm1} {input.asm2} && "
# "date) 2> {log.err} > {log.out}"
# rule concatenate:
# input:
# expand(os.path.join(REF_GENOMES_DIR, "{org}.fna"), org=config["ref_genomes"].keys())
# output:
# os.path.join(RESULTS_DIR,"reference/mock_genomes.fasta")
# log:
# out="logs/concat.out.log",
# err="logs/concat.err.log"
# shell:
# "(date && "
# "cat {input} > {output} && "
# "date) 2> {log.err} > {log.out}"
# rule zymo_sibeliaz:
# input:
# zymo=rules.concatenate.output,
# asm2=os.path.join(DATA_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.POLISHED.fasta")
# output:
# os.path.join(RESULTS_DIR, "analysis/zymo_sibeliaz/zymo_{rtype}_{tool}_output/blocks_coords.gff")
# log:
# out="logs/sibeliaz.{rtype}.{tool}.out.log",
# err="logs/sibeliaz.{rtype}.{tool}.err.log"
# threads:
# config["sibeliaz"]["threads"]
# params:
# kmers=config["sibeliaz"]["kmer"],
# memory=config["sibeliaz"]["mem"]
# wildcard_constraints:
# rtype="|".join(READ_TYPES),
# tool="|".join(ASSEMBLERS)
# conda:
# os.path.join(ENV_DIR, "sibeliaz.yaml")
# message:
# "Genome comparision: Sibeliaz: {input.zymo} {input.asm2}"
# shell:
# "(date && "
# "sibeliaz -n -t {threads} -k {params.kmers} -f {params.memory} -o $(dirname {output}) {input.zymo} {input.asm2} && "
# "date) 2> {log.err} > {log.out}"