Skip to content
Snippets Groups Projects
Commit 75c540bc authored by Valentina Galata's avatar Valentina Galata
Browse files

report: added mappability (flagstat) (issue #27, issue #23)

parent 1fdd1f1e
No related branches found
No related tags found
1 merge request!76Merge "cleanup" branch with "master" branch
......@@ -209,4 +209,20 @@ def merge_rgi_reports(dfs, summary_col, summary_type):
)
summary["col"] = summary_col
summary["type"] = summary_type
return summary
def parse_samtools_flagstat_report(ifile_path):
import re
summary = {}
with open(ifile_path, "r") as ifile:
for line in ifile:
if re.search("^\d+ \+ \d+ in total \(", line):
line_re = re.search(re.compile("^(?P<passed>\d+) \+ (?P<failed>\d+) .*$"), line)
assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path)
summary["total"] = int(line_re.group("passed"))
elif re.search("^\d+ \+ \d+ mapped \(", line):
line_re = re.search(re.compile("^(?P<passed>\d+) \+ (?P<failed>\d+) .*$"), line)
assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path)
summary["mapped"] = int(line_re.group("passed"))
summary["mapped_pct"] = 100 * summary["mapped"] / summary["total"]
return summary
\ No newline at end of file
# snakemake -s report/Snakefile --configfile config/GDB/config.yaml --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline --cores 1 -rpn
# snakemake -s workflow_report/Snakefile --configfile config/GDB/config.yaml --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline --cores 1 -rpn
# References:
# rmarkdown: https://bookdown.org/yihui/rmarkdown/
......@@ -31,8 +31,15 @@ workdir:
RESULTS_DIR = config["results_dir"]
DB_DIR = config["db_dir"]
# Input
# INPUT_G_FAST5 = find_fast5(config["data"]["metag"]["ont"]["files"], config["data"]["metag"]["ont"]["dirs"]) # TODO: consider a different approach
INPUT_G_SR = list(config["data"]["metag"]["sr"].values())
INPUT_T_SR = []
if config["data"]["metat"]["sr"]["r1"] and config["data"]["metat"]["sr"]["r2"]:
INPUT_T_SR = list(config["data"]["metat"]["sr"].values())
# Assemblers and read types
# META_TYPES = ["metag", "metat"] if INPUT_T_SR else ["metag"]
META_TYPES = ["metag", "metat"] if INPUT_T_SR else ["metag"]
READ_TYPES = list(config["assemblers"].keys()) # list of read types
ASSEMBLERS = [y for x in config["assemblers"].values() for y in x] # list of all assemblers
READ_ASSEMBLERS = [y for x in [[(k, vv) for vv in v] for k, v in config["assemblers"].items()] for y in x] # list of (read type, assembler)
......@@ -78,7 +85,9 @@ rule render_report:
plasflow=os.path.join(RESULTS_DIR, "annotation/plasflow/summary.tsv"),
rgi=os.path.join(RESULTS_DIR, "annotation/rgi/summary.tsv"),
casc=os.path.join(RESULTS_DIR, "annotation/casc/summary.tsv"),
minced=os.path.join(RESULTS_DIR, "annotation/minced/summary.tsv")
minced=os.path.join(RESULTS_DIR, "annotation/minced/summary.tsv"),
mappability_metag=os.path.join(RESULTS_DIR, "mapping/metag/mappability.tsv"),
mappability_metat=os.path.join(RESULTS_DIR, "mapping/metat/mappability.tsv"),
output:
html=os.path.join(RESULTS_DIR, "report/report.html"),
pdf_quast=os.path.join(RESULTS_DIR, "report/fig_quast.pdf"),
......@@ -95,91 +104,3 @@ rule render_report:
os.path.join(ENV_DIR, "r.yaml")
script:
os.path.join(SRC_DIR, "report_render.R")
# rule fig_mmseq_upsetr:
# input:
# overlap_sizes=config["fig_mmseq_upsetr"]["input"]["overlap_sizes"]
# output:
# pdf=FIG_MMSEQ_UPSETR
# log:
# FIG_MMSEQ_UPSETR + ".log"
# params:
# utils=config["utils"],
# width=config["fig_mmseq_upsetr"]["width"],
# height=config["fig_mmseq_upsetr"]["height"]
# conda:
# "envs/r.yml"
# script:
# config["fig_mmseq_upsetr"]["script"]
# rule fig_partial_genes:
# input:
# counts=config["fig_partial_genes"]["input"]["counts"]
# output:
# pdf=FIG_PARTIAL_GENES
# log:
# FIG_PARTIAL_GENES + ".log"
# params:
# utils=config["utils"],
# width=config["fig_partial_genes"]["width"],
# height=config["fig_partial_genes"]["height"]
# conda:
# "envs/r.yml"
# script:
# config["fig_partial_genes"]["script"]
# rule fig_crispr_data:
# input:
# expand(
# os.path.join(config["data_path"], "results/analysis/crispr/minced/{asm_tool}.txt"),
# asm_tool=config["assemblers"]
# ),
# expand(
# os.path.join(config["data_path"], "results/analysis/crispr/casc/{asm_tool}_casc_output/{asm_tool}.results.txt"),
# asm_tool=config["assemblers"]
# )
# output:
# config["fig_crispr"]["input"]["stats"]
# run:
# import os
# import re
# import pandas
# from src.utils import parse_minced_report, parse_casc_report
# summary = []
# for ifile_path in input:
# ifile_df = None
# # info from file path
# crispr_tool = None
# if re.search("minced", ifile_path):
# crispr_tool = "minced"
# elif re.search("casc", ifile_path):
# crispr_tool = "casc"
# asm_tool = os.path.basename(ifile_path).split(".")[0]
# # file content
# if crispr_tool == "minced":
# ifile_df = pandas.DataFrame(parse_minced_report(ifile_path))
# elif crispr_tool == "casc":
# ifile_df = parse_casc_report(ifile_path)
# ifile_df = ifile_df.assign(crispr_tool=crispr_tool)
# ifile_df = ifile_df.assign(asm_tool=asm_tool)
# summary.append(ifile_df)
# # concat
# summary = pandas.concat(objs=summary, axis="index")
# # wrote to file
# summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False)
# rule fig_crispr:
# input:
# stats=config["fig_crispr"]["input"]["stats"]
# output:
# pdf=FIG_CRISPR
# log:
# FIG_CRISPR + ".log"
# params:
# utils=config["utils"],
# width=config["fig_crispr"]["width"],
# height=config["fig_crispr"]["height"]
# conda:
# "envs/r.yml"
# script:
# config["fig_crispr"]["script"]
##############################
# Preprocessing
rule collect_nanostats:
input:
os.path.join(RESULTS_DIR, "qc/metag/lr/NanoStats.txt")
......@@ -9,35 +12,8 @@ rule collect_nanostats:
summary = parse_nanostats_report(input[0])
summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False)
rule collect_quast:
input:
expand(
os.path.join(RESULTS_DIR, "analysis/quast/{rtype_tool}/report.tsv"),
rtype_tool=["%s/%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
)
output:
os.path.join(RESULTS_DIR, "analysis/quast/summary_report.tsv")
run:
import pandas
summary = None
for ifile in input:
ifile_rtype = os.path.basename(os.path.dirname(os.path.dirname(ifile)))
ifile_tool = os.path.basename(os.path.dirname(ifile))
ifile_df = pandas.read_csv(ifile, sep="\t", header=0, index_col=0)
ifile_df.rename(columns={"ASSEMBLY": ifile_tool}, inplace=True)
print(ifile_df)
if summary is None:
summary = ifile_df.copy()
else:
summary = summary.merge(
right=ifile_df,
how="outer",
left_index=True,
right_index=True,
sort=False
)
summary.to_csv(output[0], sep="\t", header=True, index=True, index_label=False)
##############################
# Annotation
rule collect_plasflow:
input:
......@@ -144,4 +120,71 @@ rule collect_minced:
summary = pandas.concat(objs=summary, axis="index")
summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False)
# TODO: https://git-r3lab.uni.lu/susheel.busi/ont_pilot_gitlab/-/issues/23
\ No newline at end of file
##############################
# Analysis
rule collect_quast:
input:
expand(
os.path.join(RESULTS_DIR, "analysis/quast/{rtype_tool}/report.tsv"),
rtype_tool=["%s/%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
)
output:
os.path.join(RESULTS_DIR, "analysis/quast/summary_report.tsv")
run:
import pandas
summary = None
for ifile in input:
ifile_rtype = os.path.basename(os.path.dirname(os.path.dirname(ifile)))
ifile_tool = os.path.basename(os.path.dirname(ifile))
ifile_df = pandas.read_csv(ifile, sep="\t", header=0, index_col=0)
ifile_df.rename(columns={"ASSEMBLY": ifile_tool}, inplace=True)
print(ifile_df)
if summary is None:
summary = ifile_df.copy()
else:
summary = summary.merge(
right=ifile_df,
how="outer",
left_index=True,
right_index=True,
sort=False
)
summary.to_csv(output[0], sep="\t", header=True, index=True, index_label=False)
# TODO: https://git-r3lab.uni.lu/susheel.busi/ont_pilot_gitlab/-/issues/23
rule collect_samtools_flagstat:
input:
metag=expand(
os.path.join(RESULTS_DIR, "mapping/metag/{rtype_tool}.flagstat.txt"),
rtype_tool=["{r}/{t}/ASSEMBLY_{r}".format(r=r, t=t) for r, t in READ_ASSEMBLERS]
),
metat=expand(
os.path.join(RESULTS_DIR, "mapping/metat/{rtype_tool}.flagstat.txt"),
rtype_tool=["{r}/{t}/ASSEMBLY_sr".format(r=r, t=t) for r, t in READ_ASSEMBLERS]
) if "metat" in META_TYPES else []
output:
metag=os.path.join(RESULTS_DIR, "mapping/metag/mappability.tsv"),
metat=os.path.join(RESULTS_DIR, "mapping/metat/mappability.tsv")
run:
import pandas
from scripts.utils import parse_samtools_flagstat_report
def collect(ifiles):
summary = {}
for ifile in ifiles:
ifile_rtype = os.path.basename(os.path.dirname(os.path.dirname(ifile)))
ifile_tool = os.path.basename(os.path.dirname(ifile))
summary[ifile_tool] = parse_samtools_flagstat_report(ifile)
if len(summary) > 0:
return pandas.DataFrame.from_dict(summary)
return None
metag_summary = collect(input.metag)
metat_summary = collect(input.metat)
metag_summary.to_csv(output.metag, sep="\t", header=True, index=True, index_label=False)
if metat_summary is not None:
metat_summary.to_csv(output.metat, sep="\t", header=True, index=True, index_label=False)
else:
from pathlib import Path
Path(output.metat).touch()
......@@ -8,6 +8,12 @@
# NanoStats
TABS$nanostats <- read_nanostats(snakemake@input$nanostats)
##############################
# Mappability
TABS$mappability_metag <- read_mappability(snakemake@input$mappability_metag)
TABS$mappability_metat <- read_mappability(snakemake@input$mappability_metat)
##############################
# QAUST: assembly quality
......
......@@ -26,6 +26,16 @@ knitr::kable(TABS$nanostats, caption='NanoStats')
# Assembly
## Mappability
```{r tables-mappability-metag, echo=FALSE}
knitr::kable(TABS$mappability_metag, caption='MetaG')
```
```{r tables-mappability-metat, echo=FALSE}
knitr::kable(TABS$mappability_metat, caption='MetaT')
```
## Assembly quality
```{r tables-quast, echo=FALSE}
......
......@@ -77,6 +77,20 @@ read_crispr <- function(fname){
return(df)
}
read_mappability <- function(fname){
df <- read.csv(
file=fname,
sep="\t",
header=TRUE,
row.names=1,
check.names=FALSE,
stringsAsFactors=FALSE
)
testit::assert(all(colnames(df) %in% names(ASM_TOOL_NAMES)))
colnames(df) <- ASM_TOOL_NAMES[colnames(df)]
return(df)
}
##############################
# PLOTS
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment