diff --git a/src/genes.to.kronaTable.py b/src/genes.to.kronaTable.py index 2412df0bf7ca268de1a3b2637aea24872799010c..147dfe7b7810f49d7ab0c3fa3d1267ce5428dfec 100755 --- a/src/genes.to.kronaTable.py +++ b/src/genes.to.kronaTable.py @@ -9,52 +9,69 @@ #Last Modified : Sun 23 Nov 2014 09:47:10 AM CET -#Created By : +#Created By : ################################### from argparse import ArgumentParser parser = ArgumentParser() -parser.add_argument("-i", "--infile", type=str, - help="Tab-delimited file with gene_ids in first column and gene_annotations in second column") -parser.add_argument("-m", "--mapfile", type=str, - help="Tab-delimited file mapping each gene_annotation to the nearest parent hierarchy level (e.g. pathway to enzyme)") -parser.add_argument("-H", "--hierarchy", type=str, - help="Hierarchy file for parent levels") -parser.add_argument("-l", "--limit", type=str, - help="Limit calculations to only this list of parent hierarchies. E.g. a list of predicted pathways only") -parser.add_argument("-O", "--operation", type=str, default="mean", - help="Specify how to do calculations, either 'mean' (default), 'sum' or 'meanhalf' where averages will be calculated from the most abundant half of annotations") -parser.add_argument("-o", "--outfile", type=str, - help="Write results to outfile. Defaults to stdout") -parser.add_argument("-n", "--name", type=str, - help="OPTIONAL: Name to assign to outfile. Defaults to name of infile") -parser.add_argument("-c", "--coverage", type=str, - help="OPTIONAL: Supply a file of coverage for each gene in the infile") -parser.add_argument("-v", "--verbose", action="store_true", - help="Run in verbose mode") -parser.add_argument("-s", "--singlecol", action="store_true", - help="Write only one annotation column for the first parent hierarchy (e.g. pathway)") -parser.add_argument("-d", "--sdev", type=str, - help="Write the standard deviation of each first parent hierarchy level to this file") -parser.add_argument("-L", "--lengthnorm", type=str, - help="Provide file with lengths for genes to normalize coverage by") +parser.add_argument( + "-i", "--infile", type=str, + help="Tab-delimited file with gene_ids in first column and gene_annotations in second column") +parser.add_argument( + "-m", "--mapfile", type=str, + help="Tab-delimited file mapping each gene_annotation to the nearest parent hierarchy level (e.g. pathway to enzyme)") +parser.add_argument( + "-H", "--hierarchy", type=str, + help="Hierarchy file for parent levels") +parser.add_argument( + "-l", "--limit", type=str, + help="Limit calculations to only this list of parent hierarchies. E.g. a list of predicted pathways only") +parser.add_argument( + "-O", "--operation", type=str, default="mean", + help="Specify how to do calculations, either 'mean' (default), 'sum' or 'meanhalf' where averages will be calculated from the most abundant half of annotations") +parser.add_argument( + "-o", "--outfile", type=str, + help="Write results to outfile. Defaults to stdout") +parser.add_argument( + "-n", "--name", type=str, + help="OPTIONAL: Name to assign to outfile. Defaults to name of infile") +parser.add_argument( + "-c", "--coverage", type=str, + help="OPTIONAL: Supply a file of coverage for each gene in the infile") +parser.add_argument( + "-v", "--verbose", action="store_true", + help="Run in verbose mode") +parser.add_argument( + "-s", "--singlecol", action="store_true", + help="Write only one annotation column for the first parent hierarchy (e.g. pathway)") +parser.add_argument( + "-d", "--sdev", type=str, + help="Write the standard deviation of each first parent hierarchy level to this file") +parser.add_argument( + "-L", "--lengthnorm", type=str, + help="Provide file with lengths for genes to normalize coverage by") args = parser.parse_args() -import sys,csv, numpy as np +import sys +import csv +import numpy as np + +if not args.infile or not args.mapfile or not args.hierarchy: + sys.exit(parser.print_help()) -if not args.infile or not args.mapfile or not args.hierarchy: sys.exit(parser.print_help()) def ReadLengths(f): hin = open(f) d = {} for line in hin: line = line.rstrip() - [g,l] = line.rsplit() + [g, l] = line.rsplit() l = float(l) d[g] = l return d + def ReadLimits(f): limit = {} hin = open(f) @@ -64,135 +81,180 @@ def ReadLimits(f): hin.close() return limit + def ReadMap(f): d = {} hin = open(f) - hincsv = csv.reader(hin, delimiter = '\t') - for row in hincsv: + hincsv = csv.reader(hin, delimiter='\t') + for row in hincsv: ann = row[1] parent = row[0] - try: d[ann].append(parent) - except KeyError: d[ann] = [parent] + try: + d[ann].append(parent) + except KeyError: + d[ann] = [parent] hin.close() return d + def ReadCoverage(f, lengths): d = {} - try: hin = open(f, 'r') - except TypeError: return {} - hincsv = csv.reader(hin, delimiter = '\t') - for row in hincsv: + try: + hin = open(f, 'r') + except TypeError: + return {} + hincsv = csv.reader(hin, delimiter='\t') + for row in hincsv: g = row[0] - try: c = float(row[1]) - except ValueError: continue - if len(lengths.keys())>0: - try: l = lengths[g] - except KeyError: sys.exit("No length found for gene "+g+"\n") - else: l = 1 + try: + c = float(row[1]) + except ValueError: + continue + if len(lengths.keys()) > 0: + try: + l = lengths[g] + except KeyError: + sys.exit("No length found for gene "+g+"\n") + else: + l = 1 c = c/l d[g] = c hin.close() return d + def ReadHierarchy(f): d = {} hin = open(f) - hincsv = csv.reader(hin, delimiter = '\t') + hincsv = csv.reader(hin, delimiter='\t') l = [] for row in hincsv: id = row[0] name = row[1] - try: hiers = row[2] - except IndexError: hiers = "Unknown" + try: + hiers = row[2] + except IndexError: + hiers = "Unknown" d[id] = hiers.split("|")+[name] l.append(len(d[id])) hin.close() - return (max(l),d) + return (max(l), d) + def Calculate(hier_c, operation): hier_sdev = {} - if operation == "sum": function = np.sum - elif operation == "mean" or operation == "meanhalf": function = np.mean - for hier, l in hier_c.iteritems(): + if operation == "sum": + function = np.sum + elif operation == "mean" or operation == "meanhalf": + function = np.mean + for hier, l in hier_c.iteritems(): hier_sdev[hier] = 0.0 if operation == "meanhalf": l.sort() l = l[len(l)/2:] hier_c[hier] = function(l) hier_sdev[hier] = np.std(l) - return (hier_sdev,hier_c) + return (hier_sdev, hier_c) + def CalcHierarchy(mapping, annotations, coverage, operation, limit): - ## Iterate annotations, and sum coverage if available + # # Iterate annotations, and sum coverage if available ann_c = {} - if len(coverage.keys()) > 0: cov = True - else: cov = False + if len(coverage.keys()) > 0: + cov = True + else: + cov = False for annotation, l in annotations.iteritems(): covlist = [] if cov: for gene in l: - try: gene_cov = coverage[gene] - except KeyError: sys.exit("ERROR: Could not find coverage for gene "+str(gene)+". Are you sure you have coverage information?\n") + try: + gene_cov = coverage[gene] + except KeyError: + sys.exit("ERROR: Could not find coverage for gene "+str(gene)+". Are you sure you have coverage information?\n") covlist.append(gene_cov) ann_c[annotation] = np.mean(covlist) - else: ann_c[annotation] = len(l) - ## Transfer annotation sums to nearest parent in mapping, if limit is supplied skip parents in limit + else: + ann_c[annotation] = len(l) + # # Transfer annotation sums to nearest parent in mapping, if limit is supplied skip parents in limit hier_c = {} for annotation, count in ann_c.iteritems(): - - try: parents = mapping[annotation] - except KeyError: - if args.verbose: sys.stderr.write("WARNING: Could not find hierarchy parent for "+annotation+"\n") + + try: + parents = mapping[annotation] + except KeyError: + if args.verbose: + sys.stderr.write("WARNING: Could not find hierarchy parent for "+annotation+"\n") continue - for parent in parents: - if limit and not parent in limit: - if args.verbose: sys.stderr.write("Skipping parent "+ parent+"\n") + for parent in parents: + if limit and parent not in limit: + if args.verbose: + sys.stderr.write("Skipping parent "+ parent+"\n") continue - try: hier_c[parent].append(count) - except KeyError: hier_c[parent] = [count] - (hier_sdev,hier_c) = Calculate(hier_c, operation) - return (hier_sdev,hier_c) + try: + hier_c[parent].append(count) + except KeyError: + hier_c[parent] = [count] + (hier_sdev, hier_c) = Calculate(hier_c, operation) + return (hier_sdev, hier_c) -if args.limit: limit = ReadLimits(args.limit) -else: limit = [] +if args.limit: + limit = ReadLimits(args.limit) +else: + limit = [] lengths = {} -if args.lengthnorm: lengths = ReadLengths(args.lengthnorm) +if args.lengthnorm: + lengths = ReadLengths(args.lengthnorm) mapping = ReadMap(args.mapfile) -annotations = ReadMap(args.infile) ## Read annotations the same way as above, then get length of the list for counts -coverage = ReadCoverage(args.coverage, lengths) +annotations = ReadMap(args.infile) # # Read annotations the same way as above, then get length of the list for counts +coverage = ReadCoverage(args.coverage, lengths) (max_hier, hierarchy) = ReadHierarchy(args.hierarchy) (hier_sdev, hier_counts) = CalcHierarchy(mapping, annotations, coverage, args.operation, limit) -if args.outfile: hout = open(args.outfile, 'w') -else: hout = sys.stdout -houtcsv = csv.writer(hout, delimiter = '\t') +if args.outfile: + hout = open(args.outfile, 'w') +else: + hout = sys.stdout +houtcsv = csv.writer(hout, delimiter='\t') -if args.name: name = args.name -else: name = (args.infile).split("/")[-1] +if args.name: + name = args.name +else: + name = (args.infile).split("/")[-1] out = [name] -if args.singlecol:out.insert(0,"X") +if args.singlecol: + out.insert(0, "X") else: - for i in range(1,max_hier+1): out.append("Level"+str(i)) + for i in range(1, max_hier+1): + out.append("Level" + str(i)) houtcsv.writerow(out) -if args.sdev: +if args.sdev: sdevout = open(args.sdev, 'w') sdevout.write("X.sdev\t"+name+"\n") -for hier,count in hier_counts.iteritems(): +for hier, count in hier_counts.iteritems(): out = [count] - try: h = hierarchy[hier] - except KeyError: h = ["Unknown"] - if args.singlecol: out.insert(0,hier) - else: out+=hierarchy[hier] - try: sdevout.write(hier+"\t"+str(hier_sdev[hier])+"\n") - except NameError: pass + try: + h = hierarchy[hier] + except KeyError: + h = ["Unknown"] + if args.singlecol: + out.insert(0, hier) + else: + out += hierarchy[hier] + try: + sdevout.write(hier+"\t"+str(hier_sdev[hier])+"\n") + except NameError: + pass houtcsv.writerow(out) hout.close() -try: sdevout.close() -except NameError: pass +try: + sdevout.close() +except NameError: + pass