Merge branch 'dev' of ssh://git-r3lab-server.uni.lu:8022/shaman.narayanasamy/IMP into dev

a5d13a05 · Shaman Narayanasamy · 73b09875 · 8577a035 · a5d13a05
Commit a5d13a05 authored 9 years ago by Shaman Narayanasamy
--- a/src/genes.to.kronaTable.py
+++ b/src/genes.to.kronaTable.py
@@ -9,52 +9,69 @@

 #Last Modified : Sun 23 Nov 2014 09:47:10 AM CET

-#Created By :  
+#Created By :

 ###################################

 from argparse import ArgumentParser
 parser = ArgumentParser()
-parser.add_argument("-i", "--infile", type=str,
-        help="Tab-delimited file with gene_ids in first column and gene_annotations in second column")
-parser.add_argument("-m", "--mapfile", type=str,
-        help="Tab-delimited file mapping each gene_annotation to the nearest parent hierarchy level (e.g. pathway to enzyme)")
-parser.add_argument("-H", "--hierarchy", type=str,
-        help="Hierarchy file for parent levels")
-parser.add_argument("-l", "--limit", type=str,
-        help="Limit calculations to only this list of parent hierarchies. E.g. a list of predicted pathways only")
-parser.add_argument("-O", "--operation", type=str, default="mean",
-        help="Specify how to do calculations, either 'mean' (default), 'sum' or 'meanhalf' where averages will be calculated from the most abundant half of annotations")
-parser.add_argument("-o", "--outfile", type=str,
-        help="Write results to outfile. Defaults to stdout")
-parser.add_argument("-n", "--name", type=str,
-        help="OPTIONAL: Name to assign to outfile. Defaults to name of infile")
-parser.add_argument("-c", "--coverage", type=str,
-        help="OPTIONAL: Supply a file of coverage for each gene in the infile")
-parser.add_argument("-v", "--verbose", action="store_true",
-        help="Run in verbose mode")
-parser.add_argument("-s", "--singlecol", action="store_true",
-        help="Write only one annotation column for the first parent hierarchy (e.g. pathway)")
-parser.add_argument("-d", "--sdev", type=str,
-        help="Write the standard deviation of each first parent hierarchy level to this file")
-parser.add_argument("-L", "--lengthnorm", type=str, 
-        help="Provide file with lengths for genes to normalize coverage by")
+parser.add_argument(
+    "-i", "--infile", type=str,
+    help="Tab-delimited file with gene_ids in first column and gene_annotations in second column")
+parser.add_argument(
+    "-m", "--mapfile", type=str,
+    help="Tab-delimited file mapping each gene_annotation to the nearest parent hierarchy level (e.g. pathway to enzyme)")
+parser.add_argument(
+    "-H", "--hierarchy", type=str,
+    help="Hierarchy file for parent levels")
+parser.add_argument(
+    "-l", "--limit", type=str,
+    help="Limit calculations to only this list of parent hierarchies. E.g. a list of predicted pathways only")
+parser.add_argument(
+    "-O", "--operation", type=str, default="mean",
+    help="Specify how to do calculations, either 'mean' (default), 'sum' or 'meanhalf' where averages will be calculated from the most abundant half of annotations")
+parser.add_argument(
+    "-o", "--outfile", type=str,
+    help="Write results to outfile. Defaults to stdout")
+parser.add_argument(
+    "-n", "--name", type=str,
+    help="OPTIONAL: Name to assign to outfile. Defaults to name of infile")
+parser.add_argument(
+    "-c", "--coverage", type=str,
+    help="OPTIONAL: Supply a file of coverage for each gene in the infile")
+parser.add_argument(
+    "-v", "--verbose", action="store_true",
+    help="Run in verbose mode")
+parser.add_argument(
+    "-s", "--singlecol", action="store_true",
+    help="Write only one annotation column for the first parent hierarchy (e.g. pathway)")
+parser.add_argument(
+    "-d", "--sdev", type=str,
+    help="Write the standard deviation of each first parent hierarchy level to this file")
+parser.add_argument(
+    "-L", "--lengthnorm", type=str,
+    help="Provide file with lengths for genes to normalize coverage by")
 args = parser.parse_args()

-import sys,csv, numpy as np
+import sys
+import csv
+import numpy as np
+
+if not args.infile or not args.mapfile or not args.hierarchy:
+    sys.exit(parser.print_help())

-if not args.infile or not args.mapfile or not args.hierarchy: sys.exit(parser.print_help())

 def ReadLengths(f):
    hin = open(f)
    d = {}
    for line in hin:
        line = line.rstrip()
-        [g,l] = line.rsplit()
+        [g, l] = line.rsplit()
        l = float(l)
        d[g] = l
    return d

+
 def ReadLimits(f):
    limit = {}
    hin = open(f)
@@ -64,135 +81,180 @@ def ReadLimits(f):
    hin.close()
    return limit

+
 def ReadMap(f):
    d = {}
    hin = open(f)
-    hincsv = csv.reader(hin, delimiter = '\t')
-    for row in hincsv: 
+    hincsv = csv.reader(hin, delimiter='\t')
+    for row in hincsv:
        ann = row[1]
        parent = row[0]
-        try: d[ann].append(parent)
-        except KeyError: d[ann] = [parent]
+        try:
+            d[ann].append(parent)
+        except KeyError:
+            d[ann] = [parent]
    hin.close()
    return d

+
 def ReadCoverage(f, lengths):
    d = {}
-    try: hin = open(f, 'r')
-    except TypeError: return {}
-    hincsv = csv.reader(hin, delimiter = '\t')
-    for row in hincsv: 
+    try:
+        hin = open(f, 'r')
+    except TypeError:
+        return {}
+    hincsv = csv.reader(hin, delimiter='\t')
+    for row in hincsv:
        g = row[0]
-        try: c = float(row[1])
-        except ValueError: continue
-        if len(lengths.keys())>0:
-            try: l = lengths[g]
-            except KeyError: sys.exit("No length found for gene "+g+"\n")
-        else: l = 1
+        try:
+            c = float(row[1])
+        except ValueError:
+            continue
+        if len(lengths.keys()) > 0:
+            try:
+                l = lengths[g]
+            except KeyError:
+                sys.exit("No length found for gene "+g+"\n")
+        else:
+            l = 1
        c = c/l
        d[g] = c
    hin.close()
    return d

+
 def ReadHierarchy(f):
    d = {}
    hin = open(f)
-    hincsv = csv.reader(hin, delimiter = '\t')
+    hincsv = csv.reader(hin, delimiter='\t')
    l = []
    for row in hincsv:
        id = row[0]
        name = row[1]
-        try: hiers = row[2]
-        except IndexError: hiers = "Unknown"
+        try:
+            hiers = row[2]
+        except IndexError:
+            hiers = "Unknown"
        d[id] = hiers.split("|")+[name]
        l.append(len(d[id]))
    hin.close()
-    return (max(l),d)
+    return (max(l), d)
+

 def Calculate(hier_c, operation):
    hier_sdev = {}
-    if operation == "sum": function = np.sum
-    elif operation == "mean" or operation == "meanhalf": function = np.mean
-    for hier, l in hier_c.iteritems(): 
+    if operation == "sum":
+        function = np.sum
+    elif operation == "mean" or operation == "meanhalf":
+        function = np.mean
+    for hier, l in hier_c.iteritems():
        hier_sdev[hier] = 0.0
        if operation == "meanhalf":
            l.sort()
            l = l[len(l)/2:]
        hier_c[hier] = function(l)
        hier_sdev[hier] = np.std(l)
-    return (hier_sdev,hier_c)
+    return (hier_sdev, hier_c)
+

 def CalcHierarchy(mapping, annotations, coverage, operation, limit):
-    ## Iterate annotations, and sum coverage if available
+    # # Iterate annotations, and sum coverage if available
    ann_c = {}
-    if len(coverage.keys()) > 0: cov = True
-    else: cov = False
+    if len(coverage.keys()) > 0:
+        cov = True
+    else:
+        cov = False
    for annotation, l in annotations.iteritems():
        covlist = []
        if cov:
            for gene in l:
-                try: gene_cov = coverage[gene]
-                except KeyError: sys.exit("ERROR: Could not find coverage for gene "+str(gene)+". Are you sure you have coverage information?\n")
+                try:
+                    gene_cov = coverage[gene]
+                except KeyError:
+                    sys.exit("ERROR: Could not find coverage for gene "+str(gene)+". Are you sure you have coverage information?\n")
                covlist.append(gene_cov)
            ann_c[annotation] = np.mean(covlist)
-        else: ann_c[annotation] = len(l)
-    ## Transfer annotation sums to nearest parent in mapping, if limit is supplied skip parents in limit
+        else:
+            ann_c[annotation] = len(l)
+    # # Transfer annotation sums to nearest parent in mapping, if limit is supplied skip parents in limit
    hier_c = {}
    for annotation, count in ann_c.iteritems():
-        
-        try: parents = mapping[annotation]
-        except KeyError: 
-            if args.verbose: sys.stderr.write("WARNING: Could not find hierarchy parent for "+annotation+"\n")
+
+        try:
+            parents = mapping[annotation]
+        except KeyError:
+            if args.verbose:
+                sys.stderr.write("WARNING: Could not find hierarchy parent for "+annotation+"\n")
            continue
-        for parent in parents: 
-            if limit and not parent in limit: 
-                if args.verbose: sys.stderr.write("Skipping parent "+ parent+"\n")
+        for parent in parents:
+            if limit and parent not in limit:
+                if args.verbose:
+                    sys.stderr.write("Skipping parent "+ parent+"\n")
                continue
-            try: hier_c[parent].append(count)
-            except KeyError: hier_c[parent] = [count]
-    (hier_sdev,hier_c) = Calculate(hier_c, operation)
-    return (hier_sdev,hier_c)
+            try:
+                hier_c[parent].append(count)
+            except KeyError:
+                hier_c[parent] = [count]
+    (hier_sdev, hier_c) = Calculate(hier_c, operation)
+    return (hier_sdev, hier_c)

-if args.limit: limit = ReadLimits(args.limit)
-else: limit = []
+if args.limit:
+    limit = ReadLimits(args.limit)
+else:
+    limit = []

 lengths = {}
-if args.lengthnorm: lengths = ReadLengths(args.lengthnorm)
+if args.lengthnorm:
+    lengths = ReadLengths(args.lengthnorm)

 mapping = ReadMap(args.mapfile)
-annotations = ReadMap(args.infile) ## Read annotations the same way as above, then get length of the list for counts
-coverage = ReadCoverage(args.coverage, lengths) 
+annotations = ReadMap(args.infile)  # # Read annotations the same way as above, then get length of the list for counts
+coverage = ReadCoverage(args.coverage, lengths)
 (max_hier, hierarchy) = ReadHierarchy(args.hierarchy)

 (hier_sdev, hier_counts) = CalcHierarchy(mapping, annotations, coverage, args.operation, limit)

-if args.outfile: hout = open(args.outfile, 'w')
-else: hout = sys.stdout
-houtcsv = csv.writer(hout, delimiter = '\t')
+if args.outfile:
+    hout = open(args.outfile, 'w')
+else:
+    hout = sys.stdout
+houtcsv = csv.writer(hout, delimiter='\t')

-if args.name: name = args.name
-else: name = (args.infile).split("/")[-1]
+if args.name:
+    name = args.name
+else:
+    name = (args.infile).split("/")[-1]

 out = [name]
-if args.singlecol:out.insert(0,"X")
+if args.singlecol:
+    out.insert(0, "X")
 else:
-    for i in range(1,max_hier+1): out.append("Level"+str(i))
+    for i in range(1, max_hier+1):
+        out.append("Level" + str(i))
 houtcsv.writerow(out)

-if args.sdev: 
+if args.sdev:
    sdevout = open(args.sdev, 'w')
    sdevout.write("X.sdev\t"+name+"\n")

-for hier,count in hier_counts.iteritems():
+for hier, count in hier_counts.iteritems():
    out = [count]
-    try: h = hierarchy[hier]
-    except KeyError: h = ["Unknown"]
-    if args.singlecol: out.insert(0,hier)
-    else: out+=hierarchy[hier]
-    try: sdevout.write(hier+"\t"+str(hier_sdev[hier])+"\n")
-    except NameError: pass
+    try:
+        h = hierarchy[hier]
+    except KeyError:
+        h = ["Unknown"]
+    if args.singlecol:
+        out.insert(0, hier)
+    else:
+        out += hierarchy[hier]
+    try:
+        sdevout.write(hier+"\t"+str(hier_sdev[hier])+"\n")
+    except NameError:
+        pass
    houtcsv.writerow(out)
 hout.close()

-try: sdevout.close()
-except NameError: pass
+try:
+    sdevout.close()
+except NameError:
+    pass