From 5d62d8f6d41bccac77a549bed83ec8deaa3762ac Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Mon, 16 Aug 2010 13:22:16 -0400
Subject: [PATCH] First commit for unionBedGraphs and multi-column groupBy.

---
 Makefile                                     |  35 +-
 src/genomeCoverageBed/genomeCoverageMain.cpp |   2 +-
 src/groupBy/groupBy.cpp                      | 380 +++++++++++--------
 src/unionBedGraphs/Makefile                  |  48 +++
 src/unionBedGraphs/intervalItem.h            |  62 +++
 src/unionBedGraphs/unionBedGraphs.cpp        | 255 +++++++++++++
 src/unionBedGraphs/unionBedGraphs.h          | 123 ++++++
 src/unionBedGraphs/unionBedGraphsMain.cpp    | 271 +++++++++++++
 src/utils/fileType/fileType.cpp              |  18 +-
 src/utils/fileType/fileType.h                |   4 +
 src/utils/tabFile/tabFile.cpp                |   8 +-
 src/utils/tabFile/tabFile.h                  |   3 +-
 12 files changed, 1029 insertions(+), 180 deletions(-)
 create mode 100755 src/unionBedGraphs/Makefile
 create mode 100644 src/unionBedGraphs/intervalItem.h
 create mode 100644 src/unionBedGraphs/unionBedGraphs.cpp
 create mode 100644 src/unionBedGraphs/unionBedGraphs.h
 create mode 100644 src/unionBedGraphs/unionBedGraphsMain.cpp

diff --git a/Makefile b/Makefile
index c9629072..6cb24583 100644
--- a/Makefile
+++ b/Makefile
@@ -11,9 +11,39 @@ export CXX      = g++
 export CXXFLAGS = -Wall -O2
 export LIBS     = -lz
 
-SUBDIRS = $(SRC_DIR)/bamToBed $(SRC_DIR)/bedToBam $(SRC_DIR)/bedToIgv $(SRC_DIR)/bed12ToBed6 $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/groupBy $(SRC_DIR)/intersectBed $(SRC_DIR)/linksBed $(SRC_DIR)/maskFastaFromBed $(SRC_DIR)/mergeBed   $(SRC_DIR)/overlap $(SRC_DIR)/pairToBed $(SRC_DIR)/pairToPair $(SRC_DIR)/shuffleBed $(SRC_DIR)/slopBed $(SRC_DIR)/sortBed $(SRC_DIR)/subtractBed $(SRC_DIR)/windowBed    
+SUBDIRS = $(SRC_DIR)/bamToBed \
+          $(SRC_DIR)/bedToBam \
+          $(SRC_DIR)/bedToIgv \
+          $(SRC_DIR)/bed12ToBed6 \
+          $(SRC_DIR)/closestBed \
+          $(SRC_DIR)/complementBed \
+          $(SRC_DIR)/coverageBed \
+          $(SRC_DIR)/fastaFromBed \
+          $(SRC_DIR)/genomeCoverageBed \
+          $(SRC_DIR)/groupBy \
+          $(SRC_DIR)/intersectBed \
+          $(SRC_DIR)/linksBed \
+          $(SRC_DIR)/maskFastaFromBed \
+          $(SRC_DIR)/mergeBed   \
+          $(SRC_DIR)/overlap \
+          $(SRC_DIR)/pairToBed \
+          $(SRC_DIR)/pairToPair \
+          $(SRC_DIR)/shuffleBed \
+          $(SRC_DIR)/slopBed \
+          $(SRC_DIR)/sortBed \
+          $(SRC_DIR)/subtractBed \
+          $(SRC_DIR)/unionBedGraphs \
+          $(SRC_DIR)/windowBed
 
-UTIL_SUBDIRS =  $(SRC_DIR)/utils/lineFileUtilities $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/tabFile $(SRC_DIR)/utils/genomeFile $(SRC_DIR)/utils/gzstream $(SRC_DIR)/utils/fileType $(SRC_DIR)/utils/bedFilePE $(SRC_DIR)/utils/sequenceUtilities $(SRC_DIR)/utils/BamTools
+UTIL_SUBDIRS =  $(SRC_DIR)/utils/lineFileUtilities \
+                $(SRC_DIR)/utils/bedFile \
+                $(SRC_DIR)/utils/tabFile \
+                $(SRC_DIR)/utils/genomeFile \
+                $(SRC_DIR)/utils/gzstream \
+                $(SRC_DIR)/utils/fileType \
+                $(SRC_DIR)/utils/bedFilePE \
+                $(SRC_DIR)/utils/sequenceUtilities \
+                $(SRC_DIR)/utils/BamTools
 
 all:
 
@@ -33,7 +63,6 @@ all:
 	done
 
 
-
 .PHONY: all
 
 clean:
diff --git a/src/genomeCoverageBed/genomeCoverageMain.cpp b/src/genomeCoverageBed/genomeCoverageMain.cpp
index 40c33931..d1be3bac 100644
--- a/src/genomeCoverageBed/genomeCoverageMain.cpp
+++ b/src/genomeCoverageBed/genomeCoverageMain.cpp
@@ -159,7 +159,7 @@ void ShowHelp(void) {
 	cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl;
 	
 	cerr << "Authors: Aaron Quinlan (aaronquinlan@gmail.com)" << endl;
-	cerr << "         Gordon Assaf, CSHL" << endl << endl;
+	cerr << "         Assaf Gordon, CSHL" << endl << endl;
 
 	cerr << "Summary: Compute the coverage of a feature file among a genome." << endl << endl;
 
diff --git a/src/groupBy/groupBy.cpp b/src/groupBy/groupBy.cpp
index 15c331d5..000e0687 100644
--- a/src/groupBy/groupBy.cpp
+++ b/src/groupBy/groupBy.cpp
@@ -1,5 +1,5 @@
 /*****************************************************************************
-  overlap.cpp
+  groupBy.cpp
 
   (c) 2009 - Aaron Quinlan
   Hall Laboratory
@@ -21,6 +21,8 @@
 #include <math.h>
 #include <limits.h>
 #include <string.h>
+#include <exception>
+#include <stdexcept> // out_of_range exception
 
 #include "version.h"
 #include "lineFileUtilities.h"
@@ -36,11 +38,13 @@ using namespace std;
 
 // function declarations
 void ShowHelp(void);
-void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn, string op);
-void ReportSummary(const vector<string> &group, const vector<string> &data, string op);
+void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops);
+void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops);
+void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum);
 float ToFloat (string element);
 double ToDouble(const string &element);
-void TabPrint (string element);
+void TabPrintPost (string element);
+void TabPrintPre (string element);
 void CommaPrint (string element);
             
 int main(int argc, char* argv[]) {
@@ -48,15 +52,15 @@ int main(int argc, char* argv[]) {
 	// input files
 	string inFile;
 	string groupColumnsString = "1,2,3";
-    string opColumnString;
-    string op = "sum";
+    string opsColumnString;
+    string opsString;
     
 	// our configuration variables
-	bool showHelp         = false;
-	bool haveInFile       = false;
-	bool haveGroupColumns = false;
-	bool haveOpColumn     = false;	
-	bool haveOp           = true;	
+	bool showHelp          = false;
+	bool haveInFile        = false;
+	bool haveGroupColumns  = false;
+	bool haveOpColumns     = false;	
+	bool haveOps           = true;	
 	
 	// check to see if we should print out some help
 	if(argc <= 1) showHelp = true;
@@ -89,14 +93,14 @@ int main(int argc, char* argv[]) {
 			groupColumnsString     = argv[i + 1];
 			i++;			
 		}
-		else if(PARAMETER_CHECK("-opCol", 6, parameterLength)) {
-			haveOpColumn       = true;
-			opColumnString     = argv[i + 1];
+		else if(PARAMETER_CHECK("-opCols", 7, parameterLength)) {
+			haveOpColumns       = true;
+			opsColumnString     = argv[i + 1];
 			i++;			
 		}
-		else if(PARAMETER_CHECK("-op", 3, parameterLength)) {
-			haveOp = true;
-			op     = argv[i + 1];
+		else if(PARAMETER_CHECK("-ops", 4, parameterLength)) {
+			haveOps    = true;
+			opsString  = argv[i + 1];
 			i++;			
 		}
 		else {
@@ -110,16 +114,21 @@ int main(int argc, char* argv[]) {
 		cerr << endl << "*****" << endl << "*****ERROR: Need -i file. " << endl << "*****" << endl;
 		showHelp = true;
 	}
-	if (!haveOpColumn) {
+	if (!haveOpColumns) {
 		cerr << endl << "*****" << endl << "*****ERROR: Need -opCol." << endl << "*****" << endl;
 		showHelp = true;
 	}
-	if ((op != "sum")  && (op != "max")    && (op != "min") && (op != "mean") &&
-	    (op != "mode") && (op != "median") && (op != "antimode") && (op != "stdev") &&
-	    (op != "count") && (op != "collapse")) {
-		    cerr << endl << "*****" << endl << "*****ERROR: Invalid op selection. " << endl << "*****" << endl;	
-		    showHelp = true;		                    
-	}
+	// split the opsString into discrete operations and make sure they are all valid.
+	vector<string> ops;
+    Tokenize(opsString, ops, ",");
+    for( size_t i = 0; i < ops.size(); i++ ) {
+        if ((ops[i] != "sum")  && (ops[i] != "max")    && (ops[i] != "min") && (ops[i] != "mean") &&
+            (ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") &&
+            (ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse")) {
+            cerr << endl << "*****" << endl << "*****ERROR: Invalid op selection \"" << ops[i] << endl << "*****" << endl;	
+            showHelp = true;		                    
+        }
+    }
 	if (!showHelp) {
 		
 		// Split the column string sent by the user into discrete column numbers
@@ -127,25 +136,33 @@ int main(int argc, char* argv[]) {
 		vector<int> groupColumnsInt;
 		Tokenize(groupColumnsString, groupColumnsInt, ",");
 		
-		// sanity check the group columns
-        vector<int>::const_iterator gIt  = groupColumnsInt.begin();
-        vector<int>::const_iterator gEnd = groupColumnsInt.end();
-        for (; gIt != gEnd; ++gIt) {
-            if (*gIt < 1) {
-    		    cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl;	
-    		    ShowHelp();                
+		vector<int> opColumnsInt;
+        Tokenize(opsColumnString, opColumnsInt, ",");
+        
+        // sanity check the group columns        
+        for(size_t i = 0; i < groupColumnsInt.size(); ++i) {
+            int groupColumnInt = groupColumnsInt[i];
+            if (groupColumnInt < 1) {
+                cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl;	
+                ShowHelp();                
             }
         }
         
         // sanity check the op columns        
-        int opColumnInt = atoi(opColumnString.c_str());
-        if (opColumnInt < 1) {
-		    cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl;	
-		    ShowHelp();                
+        for(size_t i = 0; i < opColumnsInt.size(); ++i) {
+            int opColumnInt = opColumnsInt[i];
+            if (opColumnInt < 1) {
+                cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl;	
+                ShowHelp();                
+            }
         }
-
-		//DetermineInput(inFile, groupColumnsInt, opColumnInt, op);
-		GroupBy(inFile, groupColumnsInt, opColumnInt, op);
+        
+        // sanity check that there are equal number of opColumns and ops
+        if (ops.size() != opColumnsInt.size()) {
+            cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. " << endl << "*****" << endl;	
+            ShowHelp();                
+        }
+		GroupBy(inFile, groupColumnsInt, opColumnsInt, ops);
 	}	
 	else {
 		ShowHelp();
@@ -201,7 +218,7 @@ void ShowHelp(void) {
 }
 
 
-void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn, string op) {
+void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops) {
 	
 	// current line number
 	int lineNum = 0;
@@ -216,9 +233,11 @@ void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn
     vector<string>  prevGroup(0);
     vector<string>  currGroup(0);
     
-    // vector of the opColumn values for the current group
-    vector<string>  values;
-    values.reserve(100000);
+    // vector (one per column) of vector (one per value/column) of the opColumn values for the current group
+    vector< vector<string> >  values;
+    for( size_t i = 0; i < opColumns.size(); i++ ) {
+        values.push_back( vector<string>() );
+    }
     
     // check the status of the current line
     TabLineStatus tabLineStatus;
@@ -230,154 +249,177 @@ void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn
     _tab->Open();
 	while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) {
 		if (tabLineStatus == TAB_VALID) {
-		    
-		    // grab the number of columns in the line.
-            int lineWidth = inFields.size();
-		    
-		    // sanity check the op column
-		    if (opColumn > lineWidth) {
-		        cerr << endl << "*****" << endl << "*****ERROR: op column exceeds the number of columns in file at line " 
-		             << lineNum << ". Exiting." << endl << "*****" << endl;	
-    		    exit(1);
-		    }
-		    
 		    // build the group vector for the current line
 		    currGroup.clear();
             vector<int>::const_iterator gIt  = groupColumns.begin();
             vector<int>::const_iterator gEnd = groupColumns.end();
-            for (; gIt != gEnd; ++gIt) {
-                if (*gIt > lineWidth) {
-    		        cerr << endl << "*****" << endl << "*****ERROR: group column exceeds the number of columns in file at line " 
-    		             << lineNum << ". Exiting." << endl << "*****" << endl;	
-        		    exit(1);
-    		    }
-                currGroup.push_back(inFields[*gIt-1]);
-            }
+            for (; gIt != gEnd; ++gIt)
+                addValue(inFields, currGroup, (*gIt-1), lineNum);
 
             // there has been a group change
             if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
                 // Summarize this group
-                ReportSummary(prevGroup, values, op);
+                ReportSummary(prevGroup, values, ops);
+                // reset and add the first value for the next group.
                 values.clear();
-                values.push_back(inFields[opColumn-1].c_str());
+                for( size_t i = 0; i < opColumns.size(); i++ ) {
+                    values.push_back( vector<string>() );
+        		    addValue(inFields, values[i], opColumns[i]-1, lineNum);
+                }
             }
             // we're still dealing with the same group
-            else
-                values.push_back(inFields[opColumn-1].c_str());
-
+            else {
+                for( size_t i = 0; i < opColumns.size(); i++ )
+        		    addValue(inFields, values[i], opColumns[i]-1, lineNum);
+	        }
             // reset for the next line
             prevGroup = currGroup;
-            inFields.clear();
 	    }
+	    inFields.clear();
     }
     // report the last group
-    ReportSummary(currGroup, values, op);
+    ReportSummary(currGroup, values, ops);
     _tab->Close();
 }
 
 
-void ReportSummary(const vector<string> &group, const vector<string> &data, string op) {
+void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops) {
     
-    vector<double> dataF;
-    // are we doing a numeric conversion?  if so, convert the strings to doubles.
-    if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") ||
-	    (op == "median") || (op == "stdev")) {
-            transform(data.begin(), data.end(), back_inserter(dataF), ToDouble);
-    }
+    vector<string> result;
+    for( size_t i = 0; i < data.size(); i++ ) {
+        
+        string op = ops[i];
+        std::stringstream buffer;
+        vector<double> dataF;
+        // are we doing a numeric conversion?  if so, convert the strings to doubles.
+        if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") ||
+    	    (op == "median") || (op == "stdev") || (op == "sstdev")) 
+    	{
+            transform(data[i].begin(), data[i].end(), back_inserter(dataF), ToDouble);
+        }
     
-    if (op == "sum") {        
-        // sum them up
-        double total = accumulate(dataF.begin(), dataF.end(), 0.0);
-        for_each(group.begin(), group.end(), TabPrint);
-        cout << setprecision (7) << total << endl; 
-    }
-    if (op == "collapse") {        
-        for_each(group.begin(), group.end(), TabPrint);
-        for_each(data.begin(), data.end(), CommaPrint);
-        cout << endl;
-    }
-    else if (op == "min") {
-        for_each(group.begin(), group.end(), TabPrint);
-        cout << setprecision (7) << *min_element( dataF.begin(), dataF.end() ) << endl;
-    }
-    else if (op == "max") {
-        for_each(group.begin(), group.end(), TabPrint);
-        cout << setprecision (7) << *max_element( dataF.begin(), dataF.end() ) << endl;
-    }
-    else if (op == "mean") {
-        double total = accumulate(dataF.begin(), dataF.end(), 0.0);
-        double mean = total / dataF.size();
-        for_each(group.begin(), group.end(), TabPrint);
-        cout << setprecision (7) << mean << endl;
-    }
-    else if (op == "median") {
-        double median = 0.0;
-		sort(dataF.begin(), dataF.end());
-        int totalLines = dataF.size();
-		if ((totalLines % 2) > 0) {
-			long mid;
-			mid = totalLines / 2;
-			median = dataF[mid];
-		}
-		else {
-			long midLow, midHigh;
-			midLow = (totalLines / 2) - 1;
-			midHigh = (totalLines / 2);
-			median = (dataF[midLow] + dataF[midHigh]) / 2.0;
-		}
-		for_each(group.begin(), group.end(), TabPrint);
-        cout << setprecision (7) << median << endl;
-    }
-    else if (op == "count") {
-        for_each(group.begin(), group.end(), TabPrint);
-        cout << setprecision (7) << data.size() << endl;
-    }
-    else if ((op == "mode") || (op == "antimode")) {
-        // compute the frequency of each unique value
-        map<string, int> freqs;
-        vector<string>::const_iterator dIt  = data.begin();
-        vector<string>::const_iterator dEnd = data.end();
-        for (; dIt != dEnd; ++dIt) {
-            freqs[*dIt]++;
+        if (op == "sum") {        
+            // sum them up
+            double total = accumulate(dataF.begin(), dataF.end(), 0.0);
+            buffer << setprecision (7) << total;
+            result.push_back(buffer.str());
         }
-        
-        // grab the mode and the anti mode
-        string mode, antiMode;
-        int    count = 0;
-        int minCount = INT_MAX;
-        for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
-			if (iter->second > count) {
-				mode = iter->first;
-				count = iter->second;
-			}
-			if (iter->second < minCount) {
-				antiMode = iter->first;
-				minCount = iter->second;
-			}
-		}
-		// report
-        for_each(group.begin(), group.end(), TabPrint);
-        if (op == "mode")
-            cout << setprecision (7) << mode << endl;
-        else if (op == "antimode")
-            cout << setprecision (7) << antiMode << endl;
-    }
-    else if (op == "stdev") {
-        // get the mean
-        double total = accumulate(dataF.begin(), dataF.end(), 0.0);
-        double mean = total / dataF.size();
-        // get the variance
-        double totalVariance = 0.0;
-        vector<double>::const_iterator dIt  = dataF.begin();
-        vector<double>::const_iterator dEnd = dataF.end();
-        for (; dIt != dEnd; ++dIt) {
-        	totalVariance += pow((*dIt - mean),2);
+        if (op == "collapse") {        
+            string collapse;
+            for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter
+                collapse.append(data[i][j]);
+                collapse.append(",");
+            }
+            result.push_back(collapse);
+        }
+        else if (op == "min") {
+            buffer << setprecision (7) << *min_element( dataF.begin(), dataF.end() );
+            result.push_back(buffer.str());
+        }
+        else if (op == "max") {
+            buffer << setprecision (7) << *max_element( dataF.begin(), dataF.end() );
+            result.push_back(buffer.str());
+        }
+        else if (op == "mean") {
+            double total = accumulate(dataF.begin(), dataF.end(), 0.0);
+            double mean = total / dataF.size();
+            buffer << setprecision (7) << mean;
+            result.push_back(buffer.str());
+        }
+        else if (op == "median") {
+            double median = 0.0;
+            sort(dataF.begin(), dataF.end());
+            int totalLines = dataF.size();
+            if ((totalLines % 2) > 0) {
+                long mid;
+                mid = totalLines / 2;
+                median = dataF[mid];
+            }
+            else {
+                long midLow, midHigh;
+                midLow = (totalLines / 2) - 1;
+                midHigh = (totalLines / 2);
+                median = (dataF[midLow] + dataF[midHigh]) / 2.0;
+            }
+            buffer << setprecision (7) << median;
+            result.push_back(buffer.str());
+        }
+        else if (op == "count") {
+            buffer << setprecision (7) << data[i].size();
+            result.push_back(buffer.str());
+        }
+        else if ((op == "mode") || (op == "antimode")) {
+            // compute the frequency of each unique value
+            map<string, int> freqs;
+            vector<string>::const_iterator dIt  = data[i].begin();
+            vector<string>::const_iterator dEnd = data[i].end();
+            for (; dIt != dEnd; ++dIt) {
+                freqs[*dIt]++;
+            }
+
+            // grab the mode and the anti mode
+            string mode, antiMode;
+            int    count = 0;
+            int minCount = INT_MAX;
+            for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
+                if (iter->second > count) {
+                    mode = iter->first;
+                    count = iter->second;
+                }
+                if (iter->second < minCount) {
+                    antiMode = iter->first;
+                    minCount = iter->second;
+                }
+            }
+            // report
+            if (op == "mode") {
+                buffer << setprecision (7) << mode;
+                result.push_back(buffer.str());
+            }
+            else if (op == "antimode") {
+                buffer << setprecision (7) << antiMode;
+                result.push_back(buffer.str());
+            }        
+        }
+        else if (op == "stdev" || op == "sstdev") {
+            // get the mean
+            double total = accumulate(dataF.begin(), dataF.end(), 0.0);
+            double mean = total / dataF.size();
+            // get the variance
+            double totalVariance = 0.0;
+            vector<double>::const_iterator dIt  = dataF.begin();
+            vector<double>::const_iterator dEnd = dataF.end();
+            for (; dIt != dEnd; ++dIt) {
+                totalVariance += pow((*dIt - mean),2);
+            }
+            double variance = 0.0;
+            if (op == "stdev") {
+                variance = totalVariance / dataF.size();
+            }
+            else if (op == "sstdev" && dataF.size() > 1) {
+                variance = totalVariance / (dataF.size() - 1);
+            }
+            double stddev = sqrt(variance);
+            // report
+            buffer << setprecision (7) << stddev;
+            result.push_back(buffer.str());
         }
-        double variance = totalVariance / dataF.size();
-        double stddev = sqrt(variance);
-        // report
-        for_each(group.begin(), group.end(), TabPrint);
-        cout << setprecision (7) << stddev << endl;
+    }
+    for_each(group.begin(), group.end(), TabPrintPost);
+    cout << *result.begin();
+    for_each(++result.begin(), result.end(), TabPrintPre);
+    cout << endl; //Gets rid of extraneous tab
+}
+
+
+void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum) {
+    try {
+        toList.push_back(fromList.at(index));
+    }
+    catch(std::out_of_range& e) {
+        cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line " 
+             << lineNum << ". Exiting." << endl << "*****" << endl;	
+	    exit(1);
     }
 }
 
@@ -386,10 +428,14 @@ float ToFloat (string element) {
     return atof(element.c_str());
 }
 
-void TabPrint (string element) {
+void TabPrintPost (string element) {
     cout << element << "\t";
 }
 
+void TabPrintPre (string element) {
+    cout << "\t" << element;
+}
+
 void CommaPrint (string element) {
     cout << element << ",";
 }
diff --git a/src/unionBedGraphs/Makefile b/src/unionBedGraphs/Makefile
new file mode 100755
index 00000000..00b488a6
--- /dev/null
+++ b/src/unionBedGraphs/Makefile
@@ -0,0 +1,48 @@
+UTILITIES_DIR = ../utils/
+OBJ_DIR = ../../obj/
+BIN_DIR = ../../bin/
+
+# -------------------
+# define our includes
+# -------------------
+INCLUDES = -I$(UTILITIES_DIR)/bedGraphFile/ \
+	   -I$(UTILITIES_DIR)/lineFileUtilities/ \
+	   -I$(UTILITIES_DIR)/genomeFile/ \
+	   -I$(UTILITIES_DIR)/version/ \
+	   -I$(UTILITIES_DIR)/gzstream/ \
+	   -I$(UTILITIES_DIR)/fileType/
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES= unionBedGraphs.cpp unionBedGraphsMain.cpp
+OBJECTS= $(SOURCES:.cpp=.o)
+_EXT_OBJECTS=bedGraphFile.o genomeFile.o lineFileUtilities.o gzstream.o fileType.o
+EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+PROGRAM= unionBedGraphs
+
+all: $(PROGRAM)
+
+.PHONY: all
+
+$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS)
+	@echo "  * linking $(PROGRAM)"
+	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS)
+
+$(BUILT_OBJECTS): $(SOURCES)
+	@echo "  * compiling" $(*F).cpp
+	@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
+$(EXT_OBJECTS):
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedGraphFile/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/genomeFile/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/	
+
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
+
+.PHONY: clean
diff --git a/src/unionBedGraphs/intervalItem.h b/src/unionBedGraphs/intervalItem.h
new file mode 100644
index 00000000..f356ab83
--- /dev/null
+++ b/src/unionBedGraphs/intervalItem.h
@@ -0,0 +1,62 @@
+/*****************************************************************************
+  intervalItem.h
+
+  (c) 2010 - Assaf Gordon
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0+ license.
+******************************************************************************/
+#ifndef INTERVALITEM_H
+#define INTERVALITEM_H
+
+#include <string>
+#include <queue>
+
+enum COORDINATE_TYPE {
+	START,
+	END
+};
+
+/*
+   An interval item in the priority queue.
+
+   An IntervalItem can mark either a START position or an END position.
+ */
+class IntervalItem
+{
+private:
+	IntervalItem();
+
+public:
+	int source_index;           // which source BedGraph file this came from
+	COORDINATE_TYPE coord_type; // is this the start or the end position?
+	CHRPOS coord;
+	std::string depth;
+
+	IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord, std::string _depth) :
+		source_index(_index),
+		coord_type(_type),
+		coord(_coord),
+		depth(_depth)
+	{}
+
+	IntervalItem(const IntervalItem &other) :
+		source_index(other.source_index),
+		coord_type(other.coord_type),
+		coord(other.coord),
+		depth(other.depth)
+	{}
+
+	bool operator< ( const IntervalItem& other ) const
+	{
+		return this->coord > other.coord;
+	}
+};
+
+// our priority queue
+typedef std::priority_queue<IntervalItem> INTERVALS_PRIORITY_QUEUE;
+
+#endif
diff --git a/src/unionBedGraphs/unionBedGraphs.cpp b/src/unionBedGraphs/unionBedGraphs.cpp
new file mode 100644
index 00000000..5e129045
--- /dev/null
+++ b/src/unionBedGraphs/unionBedGraphs.cpp
@@ -0,0 +1,255 @@
+/*****************************************************************************
+  unionBedGraphs.cpp
+
+  (c) 2010 - Assaf Gordon, CSHL
+           - Aaron Quinlan, UVA
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0+ license.
+******************************************************************************/
+#include <cassert>
+#include <cstring>
+#include <cstdlib>
+#include <iostream>
+#include <algorithm>
+
+#include "bedGraphFile.h"
+#include "unionBedGraphs.h"
+
+using namespace std;
+
+
+UnionBedGraphs::UnionBedGraphs(std::ostream& _output,
+			                const vector<string>& _filenames,
+			                const vector<string>& _titles,
+			                bool _print_empty_regions,
+			                const std::string& _genome_size_filename,
+			                const std::string& _no_coverage_value	) :
+	filenames(_filenames),
+	titles(_titles),
+	output(_output),
+	current_non_zero_inputs(0),
+	print_empty_regions(_print_empty_regions),
+	genome_sizes(NULL),
+	no_coverage_value(_no_coverage_value)
+{
+	if (print_empty_regions) {
+		assert(!_genome_size_filename.empty());
+
+		genome_sizes = new GenomeFile(_genome_size_filename);
+	}
+}
+
+
+UnionBedGraphs::~UnionBedGraphs() {
+	CloseBedgraphFiles();
+	if (genome_sizes) {
+		delete genome_sizes;
+		genome_sizes = NULL ;
+	}
+}
+
+
+void UnionBedGraphs::Union() {
+	OpenBedgraphFiles();
+
+	// Add the first interval from each file
+	for(size_t i=0;i<bedgraph_files.size();++i)
+		LoadNextBedgraphItem(i);
+
+	// Chromosome loop - once per chromosome
+	do {
+		// Find the first chromosome to use
+		current_chrom = DetermineNextChrom();
+
+		// Populate the queue with initial values from all files
+		// (if they belong to the correct chromosome)
+		for(size_t i=0;i<bedgraph_files.size();++i)
+			AddInterval(i);
+
+		CHRPOS current_start = ConsumeNextCoordinate();
+
+		// User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage
+		if (print_empty_regions && current_start > 0)
+			PrintEmptyCoverage(0,current_start);
+
+		// Intervals loop - until all intervals (of current chromosome) from all files are used.
+		do {
+			CHRPOS current_end = queue.top().coord;
+			PrintCoverage(current_start, current_end);
+			current_start = ConsumeNextCoordinate();
+		} while (!queue.empty());
+
+		// User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome
+	        // print a dummy empty coverage
+		if (print_empty_regions) {
+			CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
+			if (current_start < chrom_size)
+				PrintEmptyCoverage(current_start, chrom_size);
+		}
+
+	} while (!AllFilesDone());
+}
+
+
+CHRPOS UnionBedGraphs::ConsumeNextCoordinate() {
+	assert(!queue.empty());
+
+	CHRPOS new_position = queue.top().coord;
+	do {
+		IntervalItem item = queue.top();
+		UpdateInformation(item);
+		queue.pop();
+	} while (!queue.empty() && queue.top().coord == new_position);
+
+	return new_position;
+}
+
+
+void UnionBedGraphs::UpdateInformation(const IntervalItem &item) {
+	// Update the depth coverage for this file
+
+	// Which coordinate is it - start or end?
+	switch (item.coord_type)
+	{
+	case START:
+		current_depth[item.source_index] = item.depth;
+		current_non_zero_inputs++;
+		break;
+	case END:
+		//Read the next interval from this file
+		AddInterval(item.source_index);
+		current_depth[item.source_index] = no_coverage_value;
+		current_non_zero_inputs--;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+
+void UnionBedGraphs::PrintHeader() {
+	output << "chrom\tstart\tend" ;
+	for (size_t i=0;i<titles.size();++i)
+		output << "\t" <<titles[i];
+	output << endl;
+}
+
+
+void UnionBedGraphs::PrintCoverage(CHRPOS start, CHRPOS end) {
+	if ( current_non_zero_inputs == 0 && ! print_empty_regions )
+		return ;
+
+	output << current_chrom << "\t"
+		<< start << "\t"
+		<< end;
+
+	for (size_t i=0;i<current_depth.size();++i)
+		output << "\t" << current_depth[i] ;
+
+	output << endl;
+}
+
+
+void UnionBedGraphs::PrintEmptyCoverage(CHRPOS start, CHRPOS end) {
+	output << current_chrom << "\t"
+		<< start << "\t"
+		<< end;
+
+	for (size_t i=0;i<current_depth.size();++i)
+		output << "\t" << no_coverage_value ;
+
+	output << endl;
+}
+
+
+void UnionBedGraphs::LoadNextBedgraphItem(int index) {
+	assert(static_cast<unsigned int>(index) < bedgraph_files.size());
+
+	current_bedgraph_item[index].chrom="";
+
+	BedGraphFile *file = bedgraph_files[index];
+	BEDGRAPH_STR bg;
+	int lineNum = 0;
+	BedGraphLineStatus status;
+
+	while ( (status = file->GetNextBedGraph(bg, lineNum)) != BEDGRAPH_INVALID )  {
+		if (status != BEDGRAPH_VALID)
+			continue;
+
+		current_bedgraph_item[index] = bg ;
+		break;
+	}
+}
+
+
+bool UnionBedGraphs::AllFilesDone() {
+	for (size_t i=0;i<current_bedgraph_item.size();++i)
+		if (!current_bedgraph_item[i].chrom.empty())
+			return false;
+	return true;
+}
+
+
+string UnionBedGraphs::DetermineNextChrom() {
+	string next_chrom;
+	for (size_t i=0;i<current_bedgraph_item.size();++i) {
+		if (current_bedgraph_item[i].chrom.empty())
+			continue;
+
+		if (next_chrom.empty())
+			next_chrom = current_bedgraph_item[i].chrom;
+		else
+			if (current_bedgraph_item[i].chrom < next_chrom)
+				next_chrom = current_bedgraph_item[i].chrom ;
+	}
+	return next_chrom;
+}
+
+
+void UnionBedGraphs::AddInterval(int index) {
+	assert(static_cast<unsigned int>(index) < bedgraph_files.size());
+
+	//This file has no more intervals
+	if (current_bedgraph_item[index].chrom.empty())
+		return ;
+
+	//If the next interval belongs to a different chrom, don't add it
+	if (current_bedgraph_item[index].chrom!=current_chrom)
+		return ;
+
+	const BEDGRAPH_STR &bg(current_bedgraph_item[index]);
+
+	IntervalItem start_item(index, START, bg.start, bg.depth);
+	IntervalItem end_item(index, END, bg.end, bg.depth);
+
+	queue.push(start_item);
+	queue.push(end_item);
+
+	LoadNextBedgraphItem(index);
+}
+
+
+void UnionBedGraphs::OpenBedgraphFiles() {
+	for (size_t i=0;i<filenames.size();++i) {
+		BedGraphFile *file = new BedGraphFile(filenames[i]);
+		file->Open();
+		bedgraph_files.push_back(file);
+
+		current_depth.push_back(no_coverage_value);
+	}
+	current_bedgraph_item.resize(filenames.size());
+}
+
+
+void UnionBedGraphs::CloseBedgraphFiles() {
+	for (size_t i=0;i<bedgraph_files.size();++i) {
+		BedGraphFile *file = bedgraph_files[i];
+		delete file;
+		bedgraph_files[i] = NULL ;
+	}
+	bedgraph_files.clear();
+}
diff --git a/src/unionBedGraphs/unionBedGraphs.h b/src/unionBedGraphs/unionBedGraphs.h
new file mode 100644
index 00000000..765a229c
--- /dev/null
+++ b/src/unionBedGraphs/unionBedGraphs.h
@@ -0,0 +1,123 @@
+/*****************************************************************************
+  unionBedGraphs.h
+
+  (c) 2010 - Assaf Gordon, CSHL
+           - Aaron Quinlan, UVA
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0+ license.
+******************************************************************************/
+#ifndef UNIONBEDGRAPHS_H
+#define UNIONBEDGRAPHS_H
+
+#include <vector>
+#include <string>
+#include "bedGraphFile.h"
+#include "genomeFile.h"
+#include "intervalItem.h"
+
+class UnionBedGraphs
+{
+private:
+	typedef BEDGRAPH_STR BEDGRAPH_TYPE;
+
+	vector<string>	filenames;
+	vector<string>	titles;
+
+	vector<BedGraphFile*>               bedgraph_files;
+	vector<BEDGRAPH_TYPE::DEPTH_TYPE>   current_depth;
+	vector<BEDGRAPH_TYPE>               current_bedgraph_item;
+
+	std::ostream	&output;
+
+	INTERVALS_PRIORITY_QUEUE queue;
+	std::string	             current_chrom;
+	int		                 current_non_zero_inputs;
+	bool		             print_empty_regions;
+
+	GenomeFile*	genome_sizes;
+
+	std::string	no_coverage_value;
+
+public:
+	UnionBedGraphs(std::ostream& _output,
+			const vector<string>& _filenames,
+			const vector<string>& _titles,
+			bool _print_empty_regions,
+			const std::string& _genomeFileName,
+			const std::string& _no_coverage_value);
+
+	virtual ~UnionBedGraphs();
+
+	// Combines all bedgraph files
+	void Union();
+
+	// Print the header line: chrom/start/end + name of each bedgraph file.
+	void PrintHeader();
+
+
+private:
+	
+	// Open all BedGraph files, initialize "current_XXX" vectors
+	void OpenBedgraphFiles();
+
+    // Close the BedGraph files.
+	void CloseBedgraphFiles();
+
+	/*
+	   Add an interval from BedGraph file 'index' into the queue.
+	   will only be added if it belongs to the current chromosome.
+
+	   If the interval was added (=consumed), the next interval will be read from the file
+	   using 'LoadNextBedgraphItem'
+	 */
+	void AddInterval(int index);
+
+	/*
+	   Loads the next interval from BedGraph file 'index'.
+	   Stores it in 'current_bedgraph_item' vector.
+	 */
+	void LoadNextBedgraphItem(int index);
+
+	/*
+	   Scans the 'current_bedgraph_item' vector,
+	   find the 'first' chromosome to use (different BedGraph files can start with different chromosomes).
+	 */
+	std::string DetermineNextChrom();
+
+	/*
+	   Returns 'true' if ALL intervals from ALL BedGraph files were used
+	*/
+	bool	    AllFilesDone();
+
+	/*
+	   Extract the next coordinate from the queue, and updates the current coverage information.
+	   If multiple interval share the same coordinate values, all of them are handled.
+	   If an END coordinate is consumed, the next interval (from the corresponding file) is read.
+	 */
+	CHRPOS ConsumeNextCoordinate();
+
+	/*
+	   Updates the coverage information based on the given item.
+	   Item can be a START coordinate or an END coordiante.
+	 */
+	void UpdateInformation(const IntervalItem &item);
+
+	/*
+	   prints chrom/start/end and the current depth coverage values of all the files.
+	 */
+	void PrintCoverage(CHRPOS start, CHRPOS end);
+
+	/*
+	   prints chrom/start/end and the ZERO depth coverage values of all the files.
+	 */
+	void PrintEmptyCoverage(CHRPOS start, CHRPOS end);
+
+	void DebugPrintQueue();
+};
+
+
+#endif
diff --git a/src/unionBedGraphs/unionBedGraphsMain.cpp b/src/unionBedGraphs/unionBedGraphsMain.cpp
new file mode 100644
index 00000000..15f80709
--- /dev/null
+++ b/src/unionBedGraphs/unionBedGraphsMain.cpp
@@ -0,0 +1,271 @@
+/*****************************************************************************
+  unionBedGraphsMain.cpp
+
+  (c) 2010 - Assaf Gordon, CSHL
+           - Aaron Quinlan, UVA
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0+ license.
+******************************************************************************/
+#include <climits>
+#include <cstring>
+#include <cstdlib>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <getopt.h>
+#include <libgen.h> //for basename()
+#include "version.h"
+
+#include "genomeFile.h"
+#include "unionBedGraphs.h"
+
+using namespace std;
+
+// define our program name
+#define PROGRAM_NAME "unionBedGraphs"
+
+// define our parameter checking macro
+#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
+
+//STLized version of basename()
+// (because POSIX basename() modifies the input string pointer)
+// Additionally: removes any extension the basename might have.
+std::string stl_basename(const std::string& path);
+
+// function declarations
+void ShowHelp(void);
+void ShowExamples(void);
+
+
+int main(int argc, char* argv[])
+{
+	bool haveFiles         = false;
+	bool haveTitles        = false;
+    bool haveGenome        = false;
+    bool haveFiller        = true;
+	bool printHeader       = false;
+	bool printEmptyRegions = false;
+    bool showHelp          = false;
+	string genomeFile;
+	string basePath;
+	string noCoverageValue = "0";
+	vector<string> inputFiles;
+	vector<string> inputTitles;
+
+	//Parse command line options
+	if(argc <= 1)
+		ShowHelp();
+
+	for(int i = 1; i < argc; i++) {
+		int parameterLength = (int)strlen(argv[i]);
+
+		if((PARAMETER_CHECK("-h", 2, parameterLength)) || 
+		(PARAMETER_CHECK("--help", 5, parameterLength))) {
+			showHelp = true;
+		}
+	}
+
+	if(showHelp == true) {
+	    ShowHelp();
+        exit(1);
+    }
+
+	// do some parsing (all of these parameters require 2 strings)
+	for(int i = 1; i < argc; i++) {
+
+		int parameterLength = (int)strlen(argv[i]);
+
+ 		if(PARAMETER_CHECK("-i", 2, parameterLength)) {
+			if ((i+1) < argc) {
+				haveFiles = true;
+                i = i+1;
+                string file = argv[i];
+				while (file[0] != '-' && i < argc) {
+                    inputFiles.push_back(file);
+                    i++;
+                    if (i < argc)
+                        file = argv[i];
+				}
+                i--;
+			}
+		}
+		else if(PARAMETER_CHECK("-names", 6, parameterLength)) {
+			if ((i+1) < argc) {
+				haveTitles = true;
+                i = i+1;
+                string title = argv[i];
+				while (title[0] != '-' && i < argc) {
+                    inputTitles.push_back(title);
+                    i++;
+                    if (i < argc)
+                        title = argv[i];
+				}
+                i--;
+			}
+		}
+		else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
+			if ((i+1) < argc) {
+				haveGenome = true;
+				genomeFile = argv[i + 1];
+				i++;
+			}
+		}
+		else if(PARAMETER_CHECK("-filler", 7, parameterLength)) {
+			if ((i+1) < argc) {
+				haveFiller      = true;
+				noCoverageValue = argv[i + 1];
+				i++;
+			}
+		}
+		else if(PARAMETER_CHECK("-header", 7, parameterLength)) {
+			printHeader = true;
+		}
+		else if(PARAMETER_CHECK("-examples", 9, parameterLength)) {
+            ShowHelp();
+            ShowExamples();
+            exit(1);
+		}			
+	}
+
+	UnionBedGraphs mbg(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue);
+	if (printHeader)
+		mbg.PrintHeader();
+	mbg.Union();
+}
+
+void ShowHelp(void) {
+	
+	cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl;
+	
+	cerr << "Authors: Assaf Gordon, CSHL" << endl;
+	cerr << "         Aaron Quinlan (aaronquinlan@gmail.com)" << endl << endl;
+
+    cerr << "Summary: Combines multiple BedGraph files into a single file," << endl;
+    cerr << "\t allowing coverage comparisons between them." << endl << endl;
+
+    cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -i FILE1 FILE2 .. FILEn" << endl << endl;
+
+	cerr << "Options: " << endl;
+	
+	cerr << "\t-header\t\t"		<< "Print a header line." << endl;
+	cerr                        << "\t\t\t(chrom/start/end + names of each file)." << endl << endl;	
+	
+	cerr << "\t-names\t\t"	    << "A list of names (one / file) to describe each file in -i." << endl;
+	cerr 						<< "\t\t\tThese names will be printed in the header line." << endl << endl;
+
+	cerr << "\t-g\t\t"	     	<< "Use genome file to calculate empty regions." << endl;
+	cerr 						<< "\t\t\t- STRING." << endl << endl;
+	
+	cerr << "\t-empty\t\t"		<< "Report empty regions (i.e., start/end intervals w/o" << endl;
+	cerr 						<< "\t\t\tvalues in all files)." << endl;
+	cerr 						<< "\t\t\t- Requires the '-g FILE' parameter.\n" << endl;
+
+	cerr << "\t-filler TEXT\t"  << "Use TEXT when representing intervals having no value." << endl;
+    cerr			            << "\t\t\t- Default is '0', but you can use 'N/A' or any other text." << endl << endl;
+    
+	cerr << "\t-examples\t"     << "Show detailed usage examples." << endl << endl;
+}
+
+
+
+void ShowExamples()
+{
+	cerr << "Example usage:\n\n"  \
+"== Input files: ==\n" \
+"\n" \
+" $ cat 1.bg\n" \
+" chr1	1000	1500	10\n" \
+" chr1	2000	2100	20\n" \
+"\n" \
+" $ cat 2.bg\n" \
+" chr1	900	1600	60\n" \
+" chr1	1700	2050	50\n" \
+"\n" \
+" $ cat 3.bg\n" \
+" chr1	1980	2070	80\n" \
+" chr1	2090	2100	20\n" \
+"\n" \
+" $ cat sizes.txt\n" \
+" chr1	5000\n" \
+"\n" \
+"== Union/combine the files: ==\n" \
+"\n" \
+" $ unionBedGraphs -i 1.bg 2.bg 3.bg\n" \
+" chr1	900	1000	0	60	0\n" \
+" chr1	1000	1500	10	60	0\n" \
+" chr1	1500	1600	0	60	0\n" \
+" chr1	1700	1980	0	50	0\n" \
+" chr1	1980	2000	0	50	80\n" \
+" chr1	2000	2050	20	50	80\n" \
+" chr1	2050	2070	20	0	80\n" \
+" chr1	2070	2090	20	0	0\n" \
+" chr1	2090	2100	20	0	20\n" \
+"\n" \
+"== Union/combine the files, with a header line (titles are the file names): ==\n" \
+"\n" \
+" $ unionBedGraphs -header -i 1.bg 2.bg 3.bg\n" \
+" chrom	start	end	1	2	3\n" \
+" chr1	900	1000	0	60	0\n" \
+" chr1	1000	1500	10	60	0\n" \
+" chr1	1500	1600	0	60	0\n" \
+" chr1	1700	1980	0	50	0\n" \
+" chr1	1980	2000	0	50	80\n" \
+" chr1	2000	2050	20	50	80\n" \
+" chr1	2050	2070	20	0	80\n" \
+" chr1	2070	2090	20	0	0\n" \
+" chr1	2090	2100	20	0	20\n" \
+"\n" \
+"== Union/combine the files, with a header line and custom names: ==\n" \
+"\n" \
+" $ unionBedGraphs -header -i 1.bg 2.bg 3.bg -names WT-1 WT-2 KO-1\n" \
+" chrom	start	end	WT-1	WT-2	KO-1\n" \
+" chr1	900	1000	0	60	0\n" \
+" chr1	1000	1500	10	60	0\n" \
+" chr1	1500	1600	0	60	0\n" \
+" chr1	1700	1980	0	50	0\n" \
+" chr1	1980	2000	0	50	80\n" \
+" chr1	2000	2050	20	50	80\n" \
+" chr1	2050	2070	20	0	80\n" \
+" chr1	2070	2090	20	0	0\n" \
+" chr1	2090	2100	20	0	20\n" \
+"\n" \
+"== Union/combine, showing empty regions (note, requires -g): ==\n" \
+"\n" \
+" $ unionBedGraphs -header -empty -g sizes.TXT -i 1.bg 2.bg 3.bg\n" \
+" chrom	start	end	1	2	3\n" \
+" chr1	0	900	0	0	0\n" \
+" chr1	900	1000	0	60	0\n" \
+" chr1	1000	1500	10	60	0\n" \
+" chr1	1500	1600	0	60	0\n" \
+" chr1	1600	1700	0	0	0\n" \
+" chr1	1700	1980	0	50	0\n" \
+" chr1	1980	2000	0	50	80\n" \
+" chr1	2000	2050	20	50	80\n" \
+" chr1	2050	2070	20	0	80\n" \
+" chr1	2070	2090	20	0	0\n" \
+" chr1	2090	2100	20	0	20\n" \
+" chr1	2100	5000	0	0	0\n" \
+"\n" \
+;
+}
+
+std::string stl_basename(const std::string& path)
+{
+	string result;
+
+	char* path_dup = strdup(path.c_str());
+	char* basename_part = basename(path_dup);
+	result = basename_part;
+	free(path_dup);
+
+	size_t pos = result.find_last_of('.');
+	if (pos != string::npos )
+		result = result.substr(0,pos);
+
+	return result;
+}
+
diff --git a/src/utils/fileType/fileType.cpp b/src/utils/fileType/fileType.cpp
index 3d04675f..a0abd3e7 100644
--- a/src/utils/fileType/fileType.cpp
+++ b/src/utils/fileType/fileType.cpp
@@ -12,8 +12,14 @@
 
 #include "fileType.h"
 
-bool isRegularFile(const string& filename)
-{
+
+/*
+   returns TRUE if the file is a regular file:
+     not a pipe/device.
+
+   This implies that the file can be opened/closed/seek'd multiple times without losing information
+ */
+bool isRegularFile(const string& filename) {
        struct stat buf ;
        int i;
 
@@ -28,8 +34,12 @@ bool isRegularFile(const string& filename)
        return false;
 }
 
-bool isGzipFile(const string& filename)
-{
+
+/*
+   returns TRUE if the file has a GZIP header.
+   Should only be run on regular files.
+ */
+bool isGzipFile(const string& filename) {
        //see http://www.gzip.org/zlib/rfc-gzip.html#file-format
        struct  {
                unsigned char id1;
diff --git a/src/utils/fileType/fileType.h b/src/utils/fileType/fileType.h
index cd9d1457..0a5f818f 100644
--- a/src/utils/fileType/fileType.h
+++ b/src/utils/fileType/fileType.h
@@ -19,6 +19,9 @@
 #include <string.h>
 #include <errno.h>
 #include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sstream>
 
 using namespace std;
 
@@ -28,6 +31,7 @@ using namespace std;
   
   Kindly contributed by Assaf Gordon.
 ******************************************************************************/
+string string_error(int errnum);
 bool isRegularFile(const string& filename);
 bool isGzipFile(const string& filename);
 
diff --git a/src/utils/tabFile/tabFile.cpp b/src/utils/tabFile/tabFile.cpp
index 323e8516..6da08fee 100644
--- a/src/utils/tabFile/tabFile.cpp
+++ b/src/utils/tabFile/tabFile.cpp
@@ -88,10 +88,10 @@ TabLineStatus TabFile::GetNextTabLine(TAB_FIELDS &tabFields, int &lineNum) {
 		lineNum++;
 
 		// split into a string vector.
-		Tokenize(tabLine,tabFields);
-		
-        // parse the line and validate it.
-		return parseTabLine(tabFields, lineNum);
+		Tokenize(tabLine, tabFields);
+
+        // parse the line and validate it
+		return parseTabLine(tabFields, lineNum);		
 	}
 	
 	// default if file is closed or EOF
diff --git a/src/utils/tabFile/tabFile.h b/src/utils/tabFile/tabFile.h
index 00c70063..9dc3bb55 100644
--- a/src/utils/tabFile/tabFile.h
+++ b/src/utils/tabFile/tabFile.h
@@ -64,8 +64,9 @@ private:
     	if (lineVector.size() == 0) 
     		return TAB_BLANK;
         // real line with data
-    	if (lineVector[0].find("#") == string::npos)
+    	if (lineVector[0][0] != '#') {
             return TAB_VALID;
+        }
         // comment or header line
     	else {
     		lineNum--;
-- 
GitLab