report: added mappability (flagstat) (issue #27, issue #23)

75c540bc · Valentina Galata · 1fdd1f1e · 75c540bc · 75c540bc · 75c540bc
Commit 75c540bc authored 4 years ago by Valentina Galata
--- a/workflow/scripts/utils.py
+++ b/workflow/scripts/utils.py
@@ -209,4 +209,20 @@ def merge_rgi_reports(dfs, summary_col, summary_type):
            )
    summary["col"]  = summary_col
    summary["type"] = summary_type
+    return summary
+
+def parse_samtools_flagstat_report(ifile_path):
+    import re
+    summary = {}
+    with open(ifile_path, "r") as ifile:
+        for line in ifile:
+            if re.search("^\d+ \+ \d+ in total \(", line):
+                line_re = re.search(re.compile("^(?P<passed>\d+) \+ (?P<failed>\d+) .*$"), line)
+                assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path)
+                summary["total"] = int(line_re.group("passed"))
+            elif re.search("^\d+ \+ \d+ mapped \(", line):
+                line_re = re.search(re.compile("^(?P<passed>\d+) \+ (?P<failed>\d+) .*$"), line)
+                assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path)
+                summary["mapped"] = int(line_re.group("passed"))
+        summary["mapped_pct"] = 100 * summary["mapped"] / summary["total"]
    return summary
\ No newline at end of file
--- a/workflow_report/Snakefile
+++ b/workflow_report/Snakefile
-# snakemake -s report/Snakefile --configfile config/GDB/config.yaml --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline --cores 1 -rpn
+# snakemake -s workflow_report/Snakefile --configfile config/GDB/config.yaml --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline --cores 1 -rpn
 # References:
 #   rmarkdown: https://bookdown.org/yihui/rmarkdown/

@@ -31,8 +31,15 @@ workdir:
 RESULTS_DIR = config["results_dir"]
 DB_DIR = config["db_dir"]

+# Input
+# INPUT_G_FAST5 = find_fast5(config["data"]["metag"]["ont"]["files"], config["data"]["metag"]["ont"]["dirs"]) # TODO: consider a different approach
+INPUT_G_SR    = list(config["data"]["metag"]["sr"].values())
+INPUT_T_SR    = []
+if config["data"]["metat"]["sr"]["r1"] and config["data"]["metat"]["sr"]["r2"]:
+    INPUT_T_SR = list(config["data"]["metat"]["sr"].values())
+
 # Assemblers and read types
-# META_TYPES      = ["metag", "metat"] if INPUT_T_SR else ["metag"]
+META_TYPES      = ["metag", "metat"] if INPUT_T_SR else ["metag"]
 READ_TYPES      = list(config["assemblers"].keys()) # list of read types
 ASSEMBLERS      = [y for x in config["assemblers"].values() for y in x] # list of all assemblers
 READ_ASSEMBLERS = [y for x in [[(k, vv) for vv in v] for k, v in config["assemblers"].items()] for y in x] # list of (read type, assembler)
@@ -78,7 +85,9 @@ rule render_report:
        plasflow=os.path.join(RESULTS_DIR, "annotation/plasflow/summary.tsv"),
        rgi=os.path.join(RESULTS_DIR, "annotation/rgi/summary.tsv"),
        casc=os.path.join(RESULTS_DIR, "annotation/casc/summary.tsv"),
-        minced=os.path.join(RESULTS_DIR, "annotation/minced/summary.tsv")
+        minced=os.path.join(RESULTS_DIR, "annotation/minced/summary.tsv"),
+        mappability_metag=os.path.join(RESULTS_DIR, "mapping/metag/mappability.tsv"),
+        mappability_metat=os.path.join(RESULTS_DIR, "mapping/metat/mappability.tsv"),
    output:
        html=os.path.join(RESULTS_DIR, "report/report.html"),
        pdf_quast=os.path.join(RESULTS_DIR, "report/fig_quast.pdf"),
@@ -95,91 +104,3 @@ rule render_report:
        os.path.join(ENV_DIR, "r.yaml")
    script:
        os.path.join(SRC_DIR, "report_render.R")
-
-# rule fig_mmseq_upsetr:
-#     input:
-#         overlap_sizes=config["fig_mmseq_upsetr"]["input"]["overlap_sizes"]
-#     output:
-#         pdf=FIG_MMSEQ_UPSETR
-#     log:
-#         FIG_MMSEQ_UPSETR + ".log"
-#     params:
-#         utils=config["utils"],
-#         width=config["fig_mmseq_upsetr"]["width"],
-#         height=config["fig_mmseq_upsetr"]["height"]
-#     conda:
-#         "envs/r.yml"
-#     script:
-#         config["fig_mmseq_upsetr"]["script"]
-
-# rule fig_partial_genes:
-#     input:
-#         counts=config["fig_partial_genes"]["input"]["counts"]
-#     output:
-#         pdf=FIG_PARTIAL_GENES
-#     log:
-#         FIG_PARTIAL_GENES + ".log"
-#     params:
-#         utils=config["utils"],
-#         width=config["fig_partial_genes"]["width"],
-#         height=config["fig_partial_genes"]["height"]
-#     conda:
-#         "envs/r.yml"
-#     script:
-#         config["fig_partial_genes"]["script"]
-
-# rule fig_crispr_data:
-#     input:
-#         expand(
-#             os.path.join(config["data_path"], "results/analysis/crispr/minced/{asm_tool}.txt"),
-#             asm_tool=config["assemblers"]
-#         ),
-#         expand(
-#             os.path.join(config["data_path"], "results/analysis/crispr/casc/{asm_tool}_casc_output/{asm_tool}.results.txt"),
-#             asm_tool=config["assemblers"]
-#         )
-#     output:
-#         config["fig_crispr"]["input"]["stats"]
-#     run:
-#         import os
-#         import re
-#         import pandas
-#         from src.utils import parse_minced_report, parse_casc_report
-#         summary = []
-#         for ifile_path in input:
-#             ifile_df = None
-#             # info from file path
-#             crispr_tool = None
-#             if re.search("minced", ifile_path):
-#                 crispr_tool = "minced"
-#             elif re.search("casc", ifile_path):
-#                 crispr_tool = "casc"
-#             asm_tool = os.path.basename(ifile_path).split(".")[0]
-#             # file content
-#             if crispr_tool == "minced":
-#                 ifile_df = pandas.DataFrame(parse_minced_report(ifile_path))
-#             elif crispr_tool == "casc":
-#                 ifile_df = parse_casc_report(ifile_path)
-#             ifile_df = ifile_df.assign(crispr_tool=crispr_tool)
-#             ifile_df = ifile_df.assign(asm_tool=asm_tool)
-#             summary.append(ifile_df)
-#         # concat
-#         summary = pandas.concat(objs=summary, axis="index")
-#         # wrote to file
-#         summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False)
-
-# rule fig_crispr:
-#     input:
-#         stats=config["fig_crispr"]["input"]["stats"]
-#     output:
-#         pdf=FIG_CRISPR
-#     log:
-#         FIG_CRISPR + ".log"
-#     params:
-#         utils=config["utils"],
-#         width=config["fig_crispr"]["width"],
-#         height=config["fig_crispr"]["height"]
-#     conda:
-#         "envs/r.yml"
-#     script:
-#         config["fig_crispr"]["script"]
--- a/workflow_report/rules/collect_data.smk
+++ b/workflow_report/rules/collect_data.smk
+##############################
+# Preprocessing
+
 rule collect_nanostats:
    input:
        os.path.join(RESULTS_DIR, "qc/metag/lr/NanoStats.txt")
@@ -9,35 +12,8 @@ rule collect_nanostats:
        summary = parse_nanostats_report(input[0])
        summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False)

-
-rule collect_quast:
-    input:
-        expand(
-            os.path.join(RESULTS_DIR, "analysis/quast/{rtype_tool}/report.tsv"),
-            rtype_tool=["%s/%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
-        )
-    output:
-        os.path.join(RESULTS_DIR, "analysis/quast/summary_report.tsv")
-    run:
-        import pandas
-        summary = None
-        for ifile in input:
-            ifile_rtype = os.path.basename(os.path.dirname(os.path.dirname(ifile)))
-            ifile_tool  = os.path.basename(os.path.dirname(ifile))
-            ifile_df    = pandas.read_csv(ifile, sep="\t", header=0, index_col=0)
-            ifile_df.rename(columns={"ASSEMBLY": ifile_tool}, inplace=True)
-            print(ifile_df)
-            if summary is None:
-                summary = ifile_df.copy()
-            else:
-                summary = summary.merge(
-                    right=ifile_df,
-                    how="outer",
-                    left_index=True,
-                    right_index=True,
-                    sort=False
-                )
-        summary.to_csv(output[0], sep="\t", header=True, index=True, index_label=False)
+##############################
+# Annotation

 rule collect_plasflow:
    input:
@@ -144,4 +120,71 @@ rule collect_minced:
        summary = pandas.concat(objs=summary, axis="index")
        summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False)

-# TODO: https://git-r3lab.uni.lu/susheel.busi/ont_pilot_gitlab/-/issues/23
\ No newline at end of file
+##############################
+# Analysis
+
+rule collect_quast:
+    input:
+        expand(
+            os.path.join(RESULTS_DIR, "analysis/quast/{rtype_tool}/report.tsv"),
+            rtype_tool=["%s/%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
+        )
+    output:
+        os.path.join(RESULTS_DIR, "analysis/quast/summary_report.tsv")
+    run:
+        import pandas
+        summary = None
+        for ifile in input:
+            ifile_rtype = os.path.basename(os.path.dirname(os.path.dirname(ifile)))
+            ifile_tool  = os.path.basename(os.path.dirname(ifile))
+            ifile_df    = pandas.read_csv(ifile, sep="\t", header=0, index_col=0)
+            ifile_df.rename(columns={"ASSEMBLY": ifile_tool}, inplace=True)
+            print(ifile_df)
+            if summary is None:
+                summary = ifile_df.copy()
+            else:
+                summary = summary.merge(
+                    right=ifile_df,
+                    how="outer",
+                    left_index=True,
+                    right_index=True,
+                    sort=False
+                )
+        summary.to_csv(output[0], sep="\t", header=True, index=True, index_label=False)
+
+# TODO: https://git-r3lab.uni.lu/susheel.busi/ont_pilot_gitlab/-/issues/23
+rule collect_samtools_flagstat:
+    input:
+        metag=expand(
+            os.path.join(RESULTS_DIR, "mapping/metag/{rtype_tool}.flagstat.txt"),
+            rtype_tool=["{r}/{t}/ASSEMBLY_{r}".format(r=r, t=t) for r, t in READ_ASSEMBLERS]
+        ),
+        metat=expand(
+            os.path.join(RESULTS_DIR, "mapping/metat/{rtype_tool}.flagstat.txt"),
+            rtype_tool=["{r}/{t}/ASSEMBLY_sr".format(r=r, t=t) for r, t in READ_ASSEMBLERS]
+        ) if "metat" in META_TYPES else []
+    output:
+        metag=os.path.join(RESULTS_DIR, "mapping/metag/mappability.tsv"),
+        metat=os.path.join(RESULTS_DIR, "mapping/metat/mappability.tsv")
+    run:
+        import pandas
+        from scripts.utils import parse_samtools_flagstat_report
+        
+        def collect(ifiles):
+            summary = {}
+            for ifile in ifiles:
+                ifile_rtype = os.path.basename(os.path.dirname(os.path.dirname(ifile)))
+                ifile_tool  = os.path.basename(os.path.dirname(ifile))
+                summary[ifile_tool] = parse_samtools_flagstat_report(ifile)
+            if len(summary) > 0:
+                return pandas.DataFrame.from_dict(summary)
+            return None
+        
+        metag_summary = collect(input.metag)
+        metat_summary = collect(input.metat)
+        metag_summary.to_csv(output.metag, sep="\t", header=True, index=True, index_label=False)
+        if metat_summary is not None:
+            metat_summary.to_csv(output.metat, sep="\t", header=True, index=True, index_label=False)
+        else:
+            from pathlib import Path
+            Path(output.metat).touch()
--- a/workflow_report/scripts/report.R
+++ b/workflow_report/scripts/report.R
@@ -8,6 +8,12 @@
 # NanoStats
 TABS$nanostats <- read_nanostats(snakemake@input$nanostats)

+##############################
+# Mappability
+
+TABS$mappability_metag <- read_mappability(snakemake@input$mappability_metag)
+TABS$mappability_metat <- read_mappability(snakemake@input$mappability_metat)
+
 ##############################
 # QAUST: assembly quality


--- a/workflow_report/scripts/report.Rmd
+++ b/workflow_report/scripts/report.Rmd
@@ -26,6 +26,16 @@ knitr::kable(TABS$nanostats, caption='NanoStats')

 # Assembly

+## Mappability
+
+```{r tables-mappability-metag, echo=FALSE}
+knitr::kable(TABS$mappability_metag, caption='MetaG')
+```
+
+```{r tables-mappability-metat, echo=FALSE}
+knitr::kable(TABS$mappability_metat, caption='MetaT')
+```
+
 ## Assembly quality

 ```{r tables-quast, echo=FALSE}

--- a/workflow_report/scripts/utils.R
+++ b/workflow_report/scripts/utils.R
@@ -77,6 +77,20 @@ read_crispr <- function(fname){
    return(df)
 }

+read_mappability <- function(fname){
+    df <- read.csv(
+        file=fname,
+        sep="\t",
+        header=TRUE,
+        row.names=1,
+        check.names=FALSE,
+        stringsAsFactors=FALSE
+    )
+    testit::assert(all(colnames(df) %in% names(ASM_TOOL_NAMES)))
+    colnames(df) <- ASM_TOOL_NAMES[colnames(df)]
+    return(df)
+}
+
 ##############################
 # PLOTS