updated rule to collect CRISPR data (issue #19); added figures/src/utils.py

1990f776 · Valentina Galata · 845ab336 · 1990f776 · 1990f776
Commit 1990f776 authored 4 years ago by Valentina Galata
--- a/figures/figures.smk
+++ b/figures/figures.smk
@@ -164,42 +164,27 @@ rule fig_crispr_data:
        import os
        import re
        import pandas
-        df_cols  = ["seq_id", "array_start", "array_stop"]
-        df_cols_ = ["#seq_id", "array_start", "array_stop"]
+        from src.utils import parse_minced_report
        summary = []
-        print(input)
        for ifile_path in input:
-            with open(ifile_path, "r") as ifile:
-                ifile_df = None
-                # info from file path
-                crispr_tool, asm_tool = None, None
-                if re.search("minced", ifile_path):
-                    crispr_tool = "minced"
-                elif re.search("casc", ifile_path):
-                    crispr_tool = "casc"
-                asm_tool = os.path.basename(ifile_path).split(".")[0]
-                # file content
-                # minced
-                if crispr_tool == "minced":
-                    ifile_df = []
-                    for line in ifile:
-                        if re.match("Sequence ", line):
-                            line_re = re.search(re.compile("^Sequence '(?P<seq_id>[\w\.]+)' \((?P<seq_len>\d+) bp\)$"), line)
-                            assert line_re, "Could not extract info from \"{}\"".format(line)
-                            ifile_df.append(dict.fromkeys(df_cols))
-                            ifile_df[len(ifile_df)-1]["seq_id"] = line_re.group("seq_id")
-                        elif re.match("CRISPR ", line):
-                            line_re = re.search(re.compile("^CRISPR (?P<crispr_id>\d+)\s+Range: (?P<array_start>\d+) - (?P<array_stop>\d+)$"), line)
-                            assert line_re, "Could not extract info from \"{}\"".format(line)
-                            ifile_df[len(ifile_df)-1]["array_start"] = int(line_re.group("array_start"))
-                            ifile_df[len(ifile_df)-1]["array_stop"]  = int(line_re.group("array_stop"))
-                    ifile_df = pandas.DataFrame(ifile_df)
-                elif crispr_tool == "casc":
-                    ifile_df = pandas.read_csv(ifile, sep="\t", header=0, usecols=df_cols_, comment=None)
-                    ifile_df.rename(columns={"#seq_id": "seq_id"}, inplace=True)
-                ifile_df = ifile_df.assign(crispr_tool=crispr_tool)
-                ifile_df = ifile_df.assign(asm_tool=asm_tool)
-                summary.append(ifile_df)
+            ifile_df = None
+            # info from file path
+            crispr_tool = None
+            if re.search("minced", ifile_path):
+                crispr_tool = "minced"
+            elif re.search("casc", ifile_path):
+                crispr_tool = "casc"
+            asm_tool = os.path.basename(ifile_path).split(".")[0]
+            # file content
+            if crispr_tool == "minced":
+                ifile_df = parse_minced_report(ifile_path)
+                ifile_df = pandas.DataFrame(ifile_df)
+            elif crispr_tool == "casc":
+                ifile_df = pandas.read_csv(ifile_path, sep="\t", header=0, usecols=["#seq_id", "spacers", "array_start", "array_stop"], comment=None)
+                ifile_df.rename(columns={"#seq_id": "seq_id"}, inplace=True)
+            ifile_df = ifile_df.assign(crispr_tool=crispr_tool)
+            ifile_df = ifile_df.assign(asm_tool=asm_tool)
+            summary.append(ifile_df)
        # concat
        summary = pandas.concat(objs=summary, axis="index")
        # wrote to file

--- a/figures/src/utils.py
+++ b/figures/src/utils.py
+#!/usr/bin/python
+
+def parse_minced_report(ifile_path):
+    """
+    Parse the MINCED report: for each sequence and found CRISPR array, extract
+    - array start and end
+    - number of spacers = number of unique spacer sequences
+
+    Entry (sequence/CRISPR array) format:
+        Sequence '<string>' (<int> bp)
+
+        CRISPR <int>   Range: <int> - <int>
+        POSITION        REPEAT                                  SPACER
+        --------        ------------------------------------    ------------------------------
+        <int>           <string: ACTG>                          <string: ACTG>                 [ <int>, <int> ]
+        <...>
+        <int>           <string: ACTG>    
+        --------        ------------------------------------    ------------------------------
+        Repeats: <int>      Average Length: <int>              Average Length: <int>
+
+        CRISPR <int>   Range: <int> - <int>
+        <...>
+
+        Time to find repeats: <int> ms
+    """
+    import re
+    summary = []
+    with open(ifile_path, "r") as ifile:
+        new_seq = 0
+        seq_id = None
+        for line in ifile:
+            if re.match("Sequence ", line):
+                assert new_seq == 0, "Unexpected parsing error in {} in {}".format(line, ifile_path)
+                line_re = re.search(re.compile("^Sequence '(?P<seq_id>[\w\.]+)' \((?P<seq_len>\d+) bp\)$"), line)
+                assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path)
+                seq_id = line_re.group("seq_id")
+                new_seq = 1
+            elif re.match("CRISPR ", line):
+                assert new_seq == 1, "No seq before {} in {}".format(line, ifile_path)
+                line_re = re.search(re.compile("^CRISPR (?P<crispr_id>\d+)\s+Range: (?P<array_start>\d+) - (?P<array_stop>\d+)$"), line)
+                assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path)
+                summary.append({
+                    "seq_id": seq_id,
+                    "array_start": int(line_re.group("array_start")),
+                    "array_stop": int(line_re.group("array_stop")),
+                    "spacers": set()
+                })
+                new_seq += 1
+            elif re.match("POSITION", line):
+                assert new_seq == 2, "No new seq/CRISPR before {} in {}".format(line, ifile_path)
+                new_seq += 1
+            elif re.match("[0-9]+\s+", line):
+                assert re.match("[0-9]+\s+[ACTG]+", line), "Unexpected repeat/spacer format in {} in {}".format(line, ifile_path)
+                assert new_seq == 3, "No new seq./CRISPR before {} in {}".format(line, ifile_path)
+                if re.match("[0-9]+\s+[ACTG]+\s+[ACTG]+\s+", line):
+                    line_re = re.search(re.compile("^[0-9]+\s+(?P<repeat>[ACTG]+)\s+(?P<spacer>[ACTG]+)\s+.*$"), line)
+                    assert line_re, "Could not extract repeat/spacer from \"{}\" in {}".format(line, ifile_path)
+                    summary[len(summary)-1]["spacers"].add(line_re.group("spacer"))
+            elif re.match("Repeats:", line):
+                new_seq = 1
+                summary[len(summary)-1]["spacers"] = len(summary[len(summary)-1]["spacers"])
+            elif re.match("Time to find repeats", line):
+                assert new_seq == 1 and summary[len(summary)-1]["spacers"] > 0, "No new seq./CRISPR/spacers before {} in {}".format(line, ifile_path)
+                new_seq = 0
+                seq_id = None
+    return summary