Skip to content
Snippets Groups Projects
Commit 629dc3fc authored by Valentina Galata's avatar Valentina Galata
Browse files

updated rule for nanostats summary (issue #18); rule for crispr summary (issue #19)

parent bc7c5db7
No related branches found
No related tags found
2 merge requests!71Master,!68Figures valentina
This commit is part of merge request !68. Comments created here will be created in the context of that merge request.
This diff is collapsed.
......@@ -24,7 +24,8 @@ rule all:
FIG_MMSEQ_UPSETR,
FIG_QUAST,
FIG_PARTIAL_GENES,
FIG_NANOSTATS
FIG_NANOSTATS,
"data/crispr_summary.tsv"
rule fig_mmseq_upsetr:
input:
......@@ -76,7 +77,12 @@ rule fig_partial_genes:
rule fig_nanostats_data:
input:
config["fig_nanostats_data"]["input"]
# config["fig_nanostats_data"]["input"]
expand(
"/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/{meth_flag}/qc/lr/{sel_flag}/no_barcode/no_barcodeNanoStats.txt",
meth_flag=["results", "non_methylation_aware_results"],
sel_flag=["S1_SizeSelected", "S3_Gtube", "merged"]
)
output:
config["fig_nanostats"]["input"]["stats"]
run:
......@@ -118,7 +124,7 @@ rule fig_nanostats_data:
else:
continue
# dict to DataFrame
summary = pandas. DataFrame.from_dict(summary, orient="index")
summary = pandas.DataFrame.from_dict(summary, orient="index")
# write to file
summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False)
......@@ -136,4 +142,61 @@ rule fig_nanostats:
conda:
"envs/r.yml"
script:
config["fig_nanostats"]["script"]
\ No newline at end of file
config["fig_nanostats"]["script"]
rule fig_crispr_data:
input:
expand(
"/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/analysis/crispr/minced/{asm_tool}.txt",
asm_tool=["flye", "megahit", "metaspades", "metaspades_hybrid"]
),
expand(
"/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/analysis/crispr/casc/{asm_tool}_casc_output/{asm_tool}.results.txt",
asm_tool=["flye", "megahit", "metaspades", "metaspades_hybrid"]
)
output:
config["fig_crispr"]["input"]["stats"]
run:
import os
import re
import pandas
df_cols = ["seq_id", "array_start", "array_stop"]
df_cols_ = ["#seq_id", "array_start", "array_stop"]
summary = []
print(input)
for ifile_path in input:
with open(ifile_path, "r") as ifile:
ifile_df = None
# info from file path
crispr_tool, asm_tool = None, None
if re.search("minced", ifile_path):
crispr_tool = "minced"
elif re.search("casc", ifile_path):
crispr_tool = "casc"
asm_tool = os.path.basename(ifile_path).split(".")[0]
# file content
# minced
if crispr_tool == "minced":
ifile_df = []
for line in ifile:
if re.match("Sequence ", line):
line_re = re.search(re.compile("^Sequence '(?P<seq_id>[\w\.]+)' \((?P<seq_len>\d+) bp\)$"), line)
assert line_re, "Could not extract info from \"{}\"".format(line)
ifile_df.append(dict.fromkeys(df_cols))
ifile_df[len(ifile_df)-1]["seq_id"] = line_re.group("seq_id")
elif re.match("CRISPR ", line):
line_re = re.search(re.compile("^CRISPR (?P<crispr_id>\d+)\s+Range: (?P<array_start>\d+) - (?P<array_stop>\d+)$"), line)
assert line_re, "Could not extract info from \"{}\"".format(line)
ifile_df[len(ifile_df)-1]["array_start"] = int(line_re.group("array_start"))
ifile_df[len(ifile_df)-1]["array_stop"] = int(line_re.group("array_stop"))
ifile_df = pandas.DataFrame(ifile_df)
elif crispr_tool == "casc":
ifile_df = pandas.read_csv(ifile, sep="\t", header=0, usecols=df_cols_, comment=None)
ifile_df.rename(columns={"#seq_id": "seq_id"}, inplace=True)
ifile_df = ifile_df.assign(crispr_tool=crispr_tool)
ifile_df = ifile_df.assign(asm_tool=asm_tool)
summary.append(ifile_df)
# concat
summary = pandas.concat(objs=summary, axis="index")
# wrote to file
summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False)
......@@ -27,19 +27,18 @@ fig_partial_genes:
width: 7
height: 5
fig_nanostats_data:
input:
- "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/qc/lr/S1_SizeSelected/no_barcode/no_barcodeNanoStats.txt"
- "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/qc/lr/S3_Gtube/no_barcode/no_barcodeNanoStats.txt"
- "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/qc/lr/merged/no_barcode/no_barcodeNanoStats.txt"
- "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/non_methylation_aware_results/qc/lr/S1_SizeSelected/no_barcode/no_barcodeNanoStats.txt"
- "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/non_methylation_aware_results/qc/lr/S3_Gtube/no_barcode/no_barcodeNanoStats.txt"
- "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/non_methylation_aware_results/qc/lr/merged/no_barcode/no_barcodeNanoStats.txt"
fig_nanostats:
script: "src/fig_nanostats.R"
input:
stats: "data/nanostats_summary.tsv"
output: "fig_nanostats.pdf"
width: 7
height: 9
fig_crispr:
script: "src/fig_crispr.R"
input:
stats: "data/crispr_summary.tsv"
output: "fig_crispr.pdf"
width: 7
height: 9
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment