Skip to content
Snippets Groups Projects
Commit 1990f776 authored by Valentina Galata's avatar Valentina Galata
Browse files

updated rule to collect CRISPR data (issue #19); added figures/src/utils.py

parent 845ab336
No related branches found
No related tags found
2 merge requests!71Master,!70Add latest figure updates
......@@ -164,42 +164,27 @@ rule fig_crispr_data:
import os
import re
import pandas
df_cols = ["seq_id", "array_start", "array_stop"]
df_cols_ = ["#seq_id", "array_start", "array_stop"]
from src.utils import parse_minced_report
summary = []
print(input)
for ifile_path in input:
with open(ifile_path, "r") as ifile:
ifile_df = None
# info from file path
crispr_tool, asm_tool = None, None
if re.search("minced", ifile_path):
crispr_tool = "minced"
elif re.search("casc", ifile_path):
crispr_tool = "casc"
asm_tool = os.path.basename(ifile_path).split(".")[0]
# file content
# minced
if crispr_tool == "minced":
ifile_df = []
for line in ifile:
if re.match("Sequence ", line):
line_re = re.search(re.compile("^Sequence '(?P<seq_id>[\w\.]+)' \((?P<seq_len>\d+) bp\)$"), line)
assert line_re, "Could not extract info from \"{}\"".format(line)
ifile_df.append(dict.fromkeys(df_cols))
ifile_df[len(ifile_df)-1]["seq_id"] = line_re.group("seq_id")
elif re.match("CRISPR ", line):
line_re = re.search(re.compile("^CRISPR (?P<crispr_id>\d+)\s+Range: (?P<array_start>\d+) - (?P<array_stop>\d+)$"), line)
assert line_re, "Could not extract info from \"{}\"".format(line)
ifile_df[len(ifile_df)-1]["array_start"] = int(line_re.group("array_start"))
ifile_df[len(ifile_df)-1]["array_stop"] = int(line_re.group("array_stop"))
ifile_df = pandas.DataFrame(ifile_df)
elif crispr_tool == "casc":
ifile_df = pandas.read_csv(ifile, sep="\t", header=0, usecols=df_cols_, comment=None)
ifile_df.rename(columns={"#seq_id": "seq_id"}, inplace=True)
ifile_df = ifile_df.assign(crispr_tool=crispr_tool)
ifile_df = ifile_df.assign(asm_tool=asm_tool)
summary.append(ifile_df)
ifile_df = None
# info from file path
crispr_tool = None
if re.search("minced", ifile_path):
crispr_tool = "minced"
elif re.search("casc", ifile_path):
crispr_tool = "casc"
asm_tool = os.path.basename(ifile_path).split(".")[0]
# file content
if crispr_tool == "minced":
ifile_df = parse_minced_report(ifile_path)
ifile_df = pandas.DataFrame(ifile_df)
elif crispr_tool == "casc":
ifile_df = pandas.read_csv(ifile_path, sep="\t", header=0, usecols=["#seq_id", "spacers", "array_start", "array_stop"], comment=None)
ifile_df.rename(columns={"#seq_id": "seq_id"}, inplace=True)
ifile_df = ifile_df.assign(crispr_tool=crispr_tool)
ifile_df = ifile_df.assign(asm_tool=asm_tool)
summary.append(ifile_df)
# concat
summary = pandas.concat(objs=summary, axis="index")
# wrote to file
......
#!/usr/bin/python
def parse_minced_report(ifile_path):
"""
Parse the MINCED report: for each sequence and found CRISPR array, extract
- array start and end
- number of spacers = number of unique spacer sequences
Entry (sequence/CRISPR array) format:
Sequence '<string>' (<int> bp)
CRISPR <int> Range: <int> - <int>
POSITION REPEAT SPACER
-------- ------------------------------------ ------------------------------
<int> <string: ACTG> <string: ACTG> [ <int>, <int> ]
<...>
<int> <string: ACTG>
-------- ------------------------------------ ------------------------------
Repeats: <int> Average Length: <int> Average Length: <int>
CRISPR <int> Range: <int> - <int>
<...>
Time to find repeats: <int> ms
"""
import re
summary = []
with open(ifile_path, "r") as ifile:
new_seq = 0
seq_id = None
for line in ifile:
if re.match("Sequence ", line):
assert new_seq == 0, "Unexpected parsing error in {} in {}".format(line, ifile_path)
line_re = re.search(re.compile("^Sequence '(?P<seq_id>[\w\.]+)' \((?P<seq_len>\d+) bp\)$"), line)
assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path)
seq_id = line_re.group("seq_id")
new_seq = 1
elif re.match("CRISPR ", line):
assert new_seq == 1, "No seq before {} in {}".format(line, ifile_path)
line_re = re.search(re.compile("^CRISPR (?P<crispr_id>\d+)\s+Range: (?P<array_start>\d+) - (?P<array_stop>\d+)$"), line)
assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path)
summary.append({
"seq_id": seq_id,
"array_start": int(line_re.group("array_start")),
"array_stop": int(line_re.group("array_stop")),
"spacers": set()
})
new_seq += 1
elif re.match("POSITION", line):
assert new_seq == 2, "No new seq/CRISPR before {} in {}".format(line, ifile_path)
new_seq += 1
elif re.match("[0-9]+\s+", line):
assert re.match("[0-9]+\s+[ACTG]+", line), "Unexpected repeat/spacer format in {} in {}".format(line, ifile_path)
assert new_seq == 3, "No new seq./CRISPR before {} in {}".format(line, ifile_path)
if re.match("[0-9]+\s+[ACTG]+\s+[ACTG]+\s+", line):
line_re = re.search(re.compile("^[0-9]+\s+(?P<repeat>[ACTG]+)\s+(?P<spacer>[ACTG]+)\s+.*$"), line)
assert line_re, "Could not extract repeat/spacer from \"{}\" in {}".format(line, ifile_path)
summary[len(summary)-1]["spacers"].add(line_re.group("spacer"))
elif re.match("Repeats:", line):
new_seq = 1
summary[len(summary)-1]["spacers"] = len(summary[len(summary)-1]["spacers"])
elif re.match("Time to find repeats", line):
assert new_seq == 1 and summary[len(summary)-1]["spacers"] > 0, "No new seq./CRISPR/spacers before {} in {}".format(line, ifile_path)
new_seq = 0
seq_id = None
return summary
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment