diff --git a/Makefile b/Makefile index c962907278a1b1dad7b8c0c2c17c3f4622244ee9..6cb245838fe8cb48a179f5dbb98c6b9503d1ad08 100644 --- a/Makefile +++ b/Makefile @@ -11,9 +11,39 @@ export CXX = g++ export CXXFLAGS = -Wall -O2 export LIBS = -lz -SUBDIRS = $(SRC_DIR)/bamToBed $(SRC_DIR)/bedToBam $(SRC_DIR)/bedToIgv $(SRC_DIR)/bed12ToBed6 $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/groupBy $(SRC_DIR)/intersectBed $(SRC_DIR)/linksBed $(SRC_DIR)/maskFastaFromBed $(SRC_DIR)/mergeBed $(SRC_DIR)/overlap $(SRC_DIR)/pairToBed $(SRC_DIR)/pairToPair $(SRC_DIR)/shuffleBed $(SRC_DIR)/slopBed $(SRC_DIR)/sortBed $(SRC_DIR)/subtractBed $(SRC_DIR)/windowBed +SUBDIRS = $(SRC_DIR)/bamToBed \ + $(SRC_DIR)/bedToBam \ + $(SRC_DIR)/bedToIgv \ + $(SRC_DIR)/bed12ToBed6 \ + $(SRC_DIR)/closestBed \ + $(SRC_DIR)/complementBed \ + $(SRC_DIR)/coverageBed \ + $(SRC_DIR)/fastaFromBed \ + $(SRC_DIR)/genomeCoverageBed \ + $(SRC_DIR)/groupBy \ + $(SRC_DIR)/intersectBed \ + $(SRC_DIR)/linksBed \ + $(SRC_DIR)/maskFastaFromBed \ + $(SRC_DIR)/mergeBed \ + $(SRC_DIR)/overlap \ + $(SRC_DIR)/pairToBed \ + $(SRC_DIR)/pairToPair \ + $(SRC_DIR)/shuffleBed \ + $(SRC_DIR)/slopBed \ + $(SRC_DIR)/sortBed \ + $(SRC_DIR)/subtractBed \ + $(SRC_DIR)/unionBedGraphs \ + $(SRC_DIR)/windowBed -UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/tabFile $(SRC_DIR)/utils/genomeFile $(SRC_DIR)/utils/gzstream $(SRC_DIR)/utils/fileType $(SRC_DIR)/utils/bedFilePE $(SRC_DIR)/utils/sequenceUtilities $(SRC_DIR)/utils/BamTools +UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities \ + $(SRC_DIR)/utils/bedFile \ + $(SRC_DIR)/utils/tabFile \ + $(SRC_DIR)/utils/genomeFile \ + $(SRC_DIR)/utils/gzstream \ + $(SRC_DIR)/utils/fileType \ + $(SRC_DIR)/utils/bedFilePE \ + $(SRC_DIR)/utils/sequenceUtilities \ + $(SRC_DIR)/utils/BamTools all: @@ -33,7 +63,6 @@ all: done - .PHONY: all clean: diff --git a/src/genomeCoverageBed/genomeCoverageMain.cpp b/src/genomeCoverageBed/genomeCoverageMain.cpp index 40c33931088a59e43dbb4f7452a90925531a7530..d1be3bacd81f9fdfec2df97f602c963e1dd869b3 100644 --- a/src/genomeCoverageBed/genomeCoverageMain.cpp +++ b/src/genomeCoverageBed/genomeCoverageMain.cpp @@ -159,7 +159,7 @@ void ShowHelp(void) { cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; cerr << "Authors: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << " Gordon Assaf, CSHL" << endl << endl; + cerr << " Assaf Gordon, CSHL" << endl << endl; cerr << "Summary: Compute the coverage of a feature file among a genome." << endl << endl; diff --git a/src/groupBy/groupBy.cpp b/src/groupBy/groupBy.cpp index 15c331d57748a15cf7ef52232cb21e6284d6e840..000e068718789265019e0bf53bb1e54b8973b7cb 100644 --- a/src/groupBy/groupBy.cpp +++ b/src/groupBy/groupBy.cpp @@ -1,5 +1,5 @@ /***************************************************************************** - overlap.cpp + groupBy.cpp (c) 2009 - Aaron Quinlan Hall Laboratory @@ -21,6 +21,8 @@ #include <math.h> #include <limits.h> #include <string.h> +#include <exception> +#include <stdexcept> // out_of_range exception #include "version.h" #include "lineFileUtilities.h" @@ -36,11 +38,13 @@ using namespace std; // function declarations void ShowHelp(void); -void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn, string op); -void ReportSummary(const vector<string> &group, const vector<string> &data, string op); +void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops); +void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops); +void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum); float ToFloat (string element); double ToDouble(const string &element); -void TabPrint (string element); +void TabPrintPost (string element); +void TabPrintPre (string element); void CommaPrint (string element); int main(int argc, char* argv[]) { @@ -48,15 +52,15 @@ int main(int argc, char* argv[]) { // input files string inFile; string groupColumnsString = "1,2,3"; - string opColumnString; - string op = "sum"; + string opsColumnString; + string opsString; // our configuration variables - bool showHelp = false; - bool haveInFile = false; - bool haveGroupColumns = false; - bool haveOpColumn = false; - bool haveOp = true; + bool showHelp = false; + bool haveInFile = false; + bool haveGroupColumns = false; + bool haveOpColumns = false; + bool haveOps = true; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -89,14 +93,14 @@ int main(int argc, char* argv[]) { groupColumnsString = argv[i + 1]; i++; } - else if(PARAMETER_CHECK("-opCol", 6, parameterLength)) { - haveOpColumn = true; - opColumnString = argv[i + 1]; + else if(PARAMETER_CHECK("-opCols", 7, parameterLength)) { + haveOpColumns = true; + opsColumnString = argv[i + 1]; i++; } - else if(PARAMETER_CHECK("-op", 3, parameterLength)) { - haveOp = true; - op = argv[i + 1]; + else if(PARAMETER_CHECK("-ops", 4, parameterLength)) { + haveOps = true; + opsString = argv[i + 1]; i++; } else { @@ -110,16 +114,21 @@ int main(int argc, char* argv[]) { cerr << endl << "*****" << endl << "*****ERROR: Need -i file. " << endl << "*****" << endl; showHelp = true; } - if (!haveOpColumn) { + if (!haveOpColumns) { cerr << endl << "*****" << endl << "*****ERROR: Need -opCol." << endl << "*****" << endl; showHelp = true; } - if ((op != "sum") && (op != "max") && (op != "min") && (op != "mean") && - (op != "mode") && (op != "median") && (op != "antimode") && (op != "stdev") && - (op != "count") && (op != "collapse")) { - cerr << endl << "*****" << endl << "*****ERROR: Invalid op selection. " << endl << "*****" << endl; - showHelp = true; - } + // split the opsString into discrete operations and make sure they are all valid. + vector<string> ops; + Tokenize(opsString, ops, ","); + for( size_t i = 0; i < ops.size(); i++ ) { + if ((ops[i] != "sum") && (ops[i] != "max") && (ops[i] != "min") && (ops[i] != "mean") && + (ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") && + (ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse")) { + cerr << endl << "*****" << endl << "*****ERROR: Invalid op selection \"" << ops[i] << endl << "*****" << endl; + showHelp = true; + } + } if (!showHelp) { // Split the column string sent by the user into discrete column numbers @@ -127,25 +136,33 @@ int main(int argc, char* argv[]) { vector<int> groupColumnsInt; Tokenize(groupColumnsString, groupColumnsInt, ","); - // sanity check the group columns - vector<int>::const_iterator gIt = groupColumnsInt.begin(); - vector<int>::const_iterator gEnd = groupColumnsInt.end(); - for (; gIt != gEnd; ++gIt) { - if (*gIt < 1) { - cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl; - ShowHelp(); + vector<int> opColumnsInt; + Tokenize(opsColumnString, opColumnsInt, ","); + + // sanity check the group columns + for(size_t i = 0; i < groupColumnsInt.size(); ++i) { + int groupColumnInt = groupColumnsInt[i]; + if (groupColumnInt < 1) { + cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl; + ShowHelp(); } } // sanity check the op columns - int opColumnInt = atoi(opColumnString.c_str()); - if (opColumnInt < 1) { - cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl; - ShowHelp(); + for(size_t i = 0; i < opColumnsInt.size(); ++i) { + int opColumnInt = opColumnsInt[i]; + if (opColumnInt < 1) { + cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl; + ShowHelp(); + } } - - //DetermineInput(inFile, groupColumnsInt, opColumnInt, op); - GroupBy(inFile, groupColumnsInt, opColumnInt, op); + + // sanity check that there are equal number of opColumns and ops + if (ops.size() != opColumnsInt.size()) { + cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. " << endl << "*****" << endl; + ShowHelp(); + } + GroupBy(inFile, groupColumnsInt, opColumnsInt, ops); } else { ShowHelp(); @@ -201,7 +218,7 @@ void ShowHelp(void) { } -void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn, string op) { +void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops) { // current line number int lineNum = 0; @@ -216,9 +233,11 @@ void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn vector<string> prevGroup(0); vector<string> currGroup(0); - // vector of the opColumn values for the current group - vector<string> values; - values.reserve(100000); + // vector (one per column) of vector (one per value/column) of the opColumn values for the current group + vector< vector<string> > values; + for( size_t i = 0; i < opColumns.size(); i++ ) { + values.push_back( vector<string>() ); + } // check the status of the current line TabLineStatus tabLineStatus; @@ -230,154 +249,177 @@ void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn _tab->Open(); while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) { if (tabLineStatus == TAB_VALID) { - - // grab the number of columns in the line. - int lineWidth = inFields.size(); - - // sanity check the op column - if (opColumn > lineWidth) { - cerr << endl << "*****" << endl << "*****ERROR: op column exceeds the number of columns in file at line " - << lineNum << ". Exiting." << endl << "*****" << endl; - exit(1); - } - // build the group vector for the current line currGroup.clear(); vector<int>::const_iterator gIt = groupColumns.begin(); vector<int>::const_iterator gEnd = groupColumns.end(); - for (; gIt != gEnd; ++gIt) { - if (*gIt > lineWidth) { - cerr << endl << "*****" << endl << "*****ERROR: group column exceeds the number of columns in file at line " - << lineNum << ". Exiting." << endl << "*****" << endl; - exit(1); - } - currGroup.push_back(inFields[*gIt-1]); - } + for (; gIt != gEnd; ++gIt) + addValue(inFields, currGroup, (*gIt-1), lineNum); // there has been a group change if ((currGroup != prevGroup) && (prevGroup.size() > 0)) { // Summarize this group - ReportSummary(prevGroup, values, op); + ReportSummary(prevGroup, values, ops); + // reset and add the first value for the next group. values.clear(); - values.push_back(inFields[opColumn-1].c_str()); + for( size_t i = 0; i < opColumns.size(); i++ ) { + values.push_back( vector<string>() ); + addValue(inFields, values[i], opColumns[i]-1, lineNum); + } } // we're still dealing with the same group - else - values.push_back(inFields[opColumn-1].c_str()); - + else { + for( size_t i = 0; i < opColumns.size(); i++ ) + addValue(inFields, values[i], opColumns[i]-1, lineNum); + } // reset for the next line prevGroup = currGroup; - inFields.clear(); } + inFields.clear(); } // report the last group - ReportSummary(currGroup, values, op); + ReportSummary(currGroup, values, ops); _tab->Close(); } -void ReportSummary(const vector<string> &group, const vector<string> &data, string op) { +void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops) { - vector<double> dataF; - // are we doing a numeric conversion? if so, convert the strings to doubles. - if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") || - (op == "median") || (op == "stdev")) { - transform(data.begin(), data.end(), back_inserter(dataF), ToDouble); - } + vector<string> result; + for( size_t i = 0; i < data.size(); i++ ) { + + string op = ops[i]; + std::stringstream buffer; + vector<double> dataF; + // are we doing a numeric conversion? if so, convert the strings to doubles. + if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") || + (op == "median") || (op == "stdev") || (op == "sstdev")) + { + transform(data[i].begin(), data[i].end(), back_inserter(dataF), ToDouble); + } - if (op == "sum") { - // sum them up - double total = accumulate(dataF.begin(), dataF.end(), 0.0); - for_each(group.begin(), group.end(), TabPrint); - cout << setprecision (7) << total << endl; - } - if (op == "collapse") { - for_each(group.begin(), group.end(), TabPrint); - for_each(data.begin(), data.end(), CommaPrint); - cout << endl; - } - else if (op == "min") { - for_each(group.begin(), group.end(), TabPrint); - cout << setprecision (7) << *min_element( dataF.begin(), dataF.end() ) << endl; - } - else if (op == "max") { - for_each(group.begin(), group.end(), TabPrint); - cout << setprecision (7) << *max_element( dataF.begin(), dataF.end() ) << endl; - } - else if (op == "mean") { - double total = accumulate(dataF.begin(), dataF.end(), 0.0); - double mean = total / dataF.size(); - for_each(group.begin(), group.end(), TabPrint); - cout << setprecision (7) << mean << endl; - } - else if (op == "median") { - double median = 0.0; - sort(dataF.begin(), dataF.end()); - int totalLines = dataF.size(); - if ((totalLines % 2) > 0) { - long mid; - mid = totalLines / 2; - median = dataF[mid]; - } - else { - long midLow, midHigh; - midLow = (totalLines / 2) - 1; - midHigh = (totalLines / 2); - median = (dataF[midLow] + dataF[midHigh]) / 2.0; - } - for_each(group.begin(), group.end(), TabPrint); - cout << setprecision (7) << median << endl; - } - else if (op == "count") { - for_each(group.begin(), group.end(), TabPrint); - cout << setprecision (7) << data.size() << endl; - } - else if ((op == "mode") || (op == "antimode")) { - // compute the frequency of each unique value - map<string, int> freqs; - vector<string>::const_iterator dIt = data.begin(); - vector<string>::const_iterator dEnd = data.end(); - for (; dIt != dEnd; ++dIt) { - freqs[*dIt]++; + if (op == "sum") { + // sum them up + double total = accumulate(dataF.begin(), dataF.end(), 0.0); + buffer << setprecision (7) << total; + result.push_back(buffer.str()); } - - // grab the mode and the anti mode - string mode, antiMode; - int count = 0; - int minCount = INT_MAX; - for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) { - if (iter->second > count) { - mode = iter->first; - count = iter->second; - } - if (iter->second < minCount) { - antiMode = iter->first; - minCount = iter->second; - } - } - // report - for_each(group.begin(), group.end(), TabPrint); - if (op == "mode") - cout << setprecision (7) << mode << endl; - else if (op == "antimode") - cout << setprecision (7) << antiMode << endl; - } - else if (op == "stdev") { - // get the mean - double total = accumulate(dataF.begin(), dataF.end(), 0.0); - double mean = total / dataF.size(); - // get the variance - double totalVariance = 0.0; - vector<double>::const_iterator dIt = dataF.begin(); - vector<double>::const_iterator dEnd = dataF.end(); - for (; dIt != dEnd; ++dIt) { - totalVariance += pow((*dIt - mean),2); + if (op == "collapse") { + string collapse; + for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter + collapse.append(data[i][j]); + collapse.append(","); + } + result.push_back(collapse); + } + else if (op == "min") { + buffer << setprecision (7) << *min_element( dataF.begin(), dataF.end() ); + result.push_back(buffer.str()); + } + else if (op == "max") { + buffer << setprecision (7) << *max_element( dataF.begin(), dataF.end() ); + result.push_back(buffer.str()); + } + else if (op == "mean") { + double total = accumulate(dataF.begin(), dataF.end(), 0.0); + double mean = total / dataF.size(); + buffer << setprecision (7) << mean; + result.push_back(buffer.str()); + } + else if (op == "median") { + double median = 0.0; + sort(dataF.begin(), dataF.end()); + int totalLines = dataF.size(); + if ((totalLines % 2) > 0) { + long mid; + mid = totalLines / 2; + median = dataF[mid]; + } + else { + long midLow, midHigh; + midLow = (totalLines / 2) - 1; + midHigh = (totalLines / 2); + median = (dataF[midLow] + dataF[midHigh]) / 2.0; + } + buffer << setprecision (7) << median; + result.push_back(buffer.str()); + } + else if (op == "count") { + buffer << setprecision (7) << data[i].size(); + result.push_back(buffer.str()); + } + else if ((op == "mode") || (op == "antimode")) { + // compute the frequency of each unique value + map<string, int> freqs; + vector<string>::const_iterator dIt = data[i].begin(); + vector<string>::const_iterator dEnd = data[i].end(); + for (; dIt != dEnd; ++dIt) { + freqs[*dIt]++; + } + + // grab the mode and the anti mode + string mode, antiMode; + int count = 0; + int minCount = INT_MAX; + for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) { + if (iter->second > count) { + mode = iter->first; + count = iter->second; + } + if (iter->second < minCount) { + antiMode = iter->first; + minCount = iter->second; + } + } + // report + if (op == "mode") { + buffer << setprecision (7) << mode; + result.push_back(buffer.str()); + } + else if (op == "antimode") { + buffer << setprecision (7) << antiMode; + result.push_back(buffer.str()); + } + } + else if (op == "stdev" || op == "sstdev") { + // get the mean + double total = accumulate(dataF.begin(), dataF.end(), 0.0); + double mean = total / dataF.size(); + // get the variance + double totalVariance = 0.0; + vector<double>::const_iterator dIt = dataF.begin(); + vector<double>::const_iterator dEnd = dataF.end(); + for (; dIt != dEnd; ++dIt) { + totalVariance += pow((*dIt - mean),2); + } + double variance = 0.0; + if (op == "stdev") { + variance = totalVariance / dataF.size(); + } + else if (op == "sstdev" && dataF.size() > 1) { + variance = totalVariance / (dataF.size() - 1); + } + double stddev = sqrt(variance); + // report + buffer << setprecision (7) << stddev; + result.push_back(buffer.str()); } - double variance = totalVariance / dataF.size(); - double stddev = sqrt(variance); - // report - for_each(group.begin(), group.end(), TabPrint); - cout << setprecision (7) << stddev << endl; + } + for_each(group.begin(), group.end(), TabPrintPost); + cout << *result.begin(); + for_each(++result.begin(), result.end(), TabPrintPre); + cout << endl; //Gets rid of extraneous tab +} + + +void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum) { + try { + toList.push_back(fromList.at(index)); + } + catch(std::out_of_range& e) { + cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line " + << lineNum << ". Exiting." << endl << "*****" << endl; + exit(1); } } @@ -386,10 +428,14 @@ float ToFloat (string element) { return atof(element.c_str()); } -void TabPrint (string element) { +void TabPrintPost (string element) { cout << element << "\t"; } +void TabPrintPre (string element) { + cout << "\t" << element; +} + void CommaPrint (string element) { cout << element << ","; } diff --git a/src/unionBedGraphs/Makefile b/src/unionBedGraphs/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..00b488a6d3de3eb48ca097e055d47ae555edd0db --- /dev/null +++ b/src/unionBedGraphs/Makefile @@ -0,0 +1,48 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedGraphFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= unionBedGraphs.cpp unionBedGraphsMain.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedGraphFile.o genomeFile.o lineFileUtilities.o gzstream.o fileType.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= unionBedGraphs + +all: $(PROGRAM) + +.PHONY: all + +$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) + @echo " * linking $(PROGRAM)" + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedGraphFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/genomeFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/unionBedGraphs/intervalItem.h b/src/unionBedGraphs/intervalItem.h new file mode 100644 index 0000000000000000000000000000000000000000..f356ab831027a43bdfc78f607cd29188a2d0b6af --- /dev/null +++ b/src/unionBedGraphs/intervalItem.h @@ -0,0 +1,62 @@ +/***************************************************************************** + intervalItem.h + + (c) 2010 - Assaf Gordon + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ +#ifndef INTERVALITEM_H +#define INTERVALITEM_H + +#include <string> +#include <queue> + +enum COORDINATE_TYPE { + START, + END +}; + +/* + An interval item in the priority queue. + + An IntervalItem can mark either a START position or an END position. + */ +class IntervalItem +{ +private: + IntervalItem(); + +public: + int source_index; // which source BedGraph file this came from + COORDINATE_TYPE coord_type; // is this the start or the end position? + CHRPOS coord; + std::string depth; + + IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord, std::string _depth) : + source_index(_index), + coord_type(_type), + coord(_coord), + depth(_depth) + {} + + IntervalItem(const IntervalItem &other) : + source_index(other.source_index), + coord_type(other.coord_type), + coord(other.coord), + depth(other.depth) + {} + + bool operator< ( const IntervalItem& other ) const + { + return this->coord > other.coord; + } +}; + +// our priority queue +typedef std::priority_queue<IntervalItem> INTERVALS_PRIORITY_QUEUE; + +#endif diff --git a/src/unionBedGraphs/unionBedGraphs.cpp b/src/unionBedGraphs/unionBedGraphs.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5e1290459f06a87ab04a1b0ee76998d2088d4ab6 --- /dev/null +++ b/src/unionBedGraphs/unionBedGraphs.cpp @@ -0,0 +1,255 @@ +/***************************************************************************** + unionBedGraphs.cpp + + (c) 2010 - Assaf Gordon, CSHL + - Aaron Quinlan, UVA + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ +#include <cassert> +#include <cstring> +#include <cstdlib> +#include <iostream> +#include <algorithm> + +#include "bedGraphFile.h" +#include "unionBedGraphs.h" + +using namespace std; + + +UnionBedGraphs::UnionBedGraphs(std::ostream& _output, + const vector<string>& _filenames, + const vector<string>& _titles, + bool _print_empty_regions, + const std::string& _genome_size_filename, + const std::string& _no_coverage_value ) : + filenames(_filenames), + titles(_titles), + output(_output), + current_non_zero_inputs(0), + print_empty_regions(_print_empty_regions), + genome_sizes(NULL), + no_coverage_value(_no_coverage_value) +{ + if (print_empty_regions) { + assert(!_genome_size_filename.empty()); + + genome_sizes = new GenomeFile(_genome_size_filename); + } +} + + +UnionBedGraphs::~UnionBedGraphs() { + CloseBedgraphFiles(); + if (genome_sizes) { + delete genome_sizes; + genome_sizes = NULL ; + } +} + + +void UnionBedGraphs::Union() { + OpenBedgraphFiles(); + + // Add the first interval from each file + for(size_t i=0;i<bedgraph_files.size();++i) + LoadNextBedgraphItem(i); + + // Chromosome loop - once per chromosome + do { + // Find the first chromosome to use + current_chrom = DetermineNextChrom(); + + // Populate the queue with initial values from all files + // (if they belong to the correct chromosome) + for(size_t i=0;i<bedgraph_files.size();++i) + AddInterval(i); + + CHRPOS current_start = ConsumeNextCoordinate(); + + // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage + if (print_empty_regions && current_start > 0) + PrintEmptyCoverage(0,current_start); + + // Intervals loop - until all intervals (of current chromosome) from all files are used. + do { + CHRPOS current_end = queue.top().coord; + PrintCoverage(current_start, current_end); + current_start = ConsumeNextCoordinate(); + } while (!queue.empty()); + + // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome + // print a dummy empty coverage + if (print_empty_regions) { + CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); + if (current_start < chrom_size) + PrintEmptyCoverage(current_start, chrom_size); + } + + } while (!AllFilesDone()); +} + + +CHRPOS UnionBedGraphs::ConsumeNextCoordinate() { + assert(!queue.empty()); + + CHRPOS new_position = queue.top().coord; + do { + IntervalItem item = queue.top(); + UpdateInformation(item); + queue.pop(); + } while (!queue.empty() && queue.top().coord == new_position); + + return new_position; +} + + +void UnionBedGraphs::UpdateInformation(const IntervalItem &item) { + // Update the depth coverage for this file + + // Which coordinate is it - start or end? + switch (item.coord_type) + { + case START: + current_depth[item.source_index] = item.depth; + current_non_zero_inputs++; + break; + case END: + //Read the next interval from this file + AddInterval(item.source_index); + current_depth[item.source_index] = no_coverage_value; + current_non_zero_inputs--; + break; + default: + assert(0); + } +} + + +void UnionBedGraphs::PrintHeader() { + output << "chrom\tstart\tend" ; + for (size_t i=0;i<titles.size();++i) + output << "\t" <<titles[i]; + output << endl; +} + + +void UnionBedGraphs::PrintCoverage(CHRPOS start, CHRPOS end) { + if ( current_non_zero_inputs == 0 && ! print_empty_regions ) + return ; + + output << current_chrom << "\t" + << start << "\t" + << end; + + for (size_t i=0;i<current_depth.size();++i) + output << "\t" << current_depth[i] ; + + output << endl; +} + + +void UnionBedGraphs::PrintEmptyCoverage(CHRPOS start, CHRPOS end) { + output << current_chrom << "\t" + << start << "\t" + << end; + + for (size_t i=0;i<current_depth.size();++i) + output << "\t" << no_coverage_value ; + + output << endl; +} + + +void UnionBedGraphs::LoadNextBedgraphItem(int index) { + assert(static_cast<unsigned int>(index) < bedgraph_files.size()); + + current_bedgraph_item[index].chrom=""; + + BedGraphFile *file = bedgraph_files[index]; + BEDGRAPH_STR bg; + int lineNum = 0; + BedGraphLineStatus status; + + while ( (status = file->GetNextBedGraph(bg, lineNum)) != BEDGRAPH_INVALID ) { + if (status != BEDGRAPH_VALID) + continue; + + current_bedgraph_item[index] = bg ; + break; + } +} + + +bool UnionBedGraphs::AllFilesDone() { + for (size_t i=0;i<current_bedgraph_item.size();++i) + if (!current_bedgraph_item[i].chrom.empty()) + return false; + return true; +} + + +string UnionBedGraphs::DetermineNextChrom() { + string next_chrom; + for (size_t i=0;i<current_bedgraph_item.size();++i) { + if (current_bedgraph_item[i].chrom.empty()) + continue; + + if (next_chrom.empty()) + next_chrom = current_bedgraph_item[i].chrom; + else + if (current_bedgraph_item[i].chrom < next_chrom) + next_chrom = current_bedgraph_item[i].chrom ; + } + return next_chrom; +} + + +void UnionBedGraphs::AddInterval(int index) { + assert(static_cast<unsigned int>(index) < bedgraph_files.size()); + + //This file has no more intervals + if (current_bedgraph_item[index].chrom.empty()) + return ; + + //If the next interval belongs to a different chrom, don't add it + if (current_bedgraph_item[index].chrom!=current_chrom) + return ; + + const BEDGRAPH_STR &bg(current_bedgraph_item[index]); + + IntervalItem start_item(index, START, bg.start, bg.depth); + IntervalItem end_item(index, END, bg.end, bg.depth); + + queue.push(start_item); + queue.push(end_item); + + LoadNextBedgraphItem(index); +} + + +void UnionBedGraphs::OpenBedgraphFiles() { + for (size_t i=0;i<filenames.size();++i) { + BedGraphFile *file = new BedGraphFile(filenames[i]); + file->Open(); + bedgraph_files.push_back(file); + + current_depth.push_back(no_coverage_value); + } + current_bedgraph_item.resize(filenames.size()); +} + + +void UnionBedGraphs::CloseBedgraphFiles() { + for (size_t i=0;i<bedgraph_files.size();++i) { + BedGraphFile *file = bedgraph_files[i]; + delete file; + bedgraph_files[i] = NULL ; + } + bedgraph_files.clear(); +} diff --git a/src/unionBedGraphs/unionBedGraphs.h b/src/unionBedGraphs/unionBedGraphs.h new file mode 100644 index 0000000000000000000000000000000000000000..765a229cf62dafb83064856bdf7917af392c4b4d --- /dev/null +++ b/src/unionBedGraphs/unionBedGraphs.h @@ -0,0 +1,123 @@ +/***************************************************************************** + unionBedGraphs.h + + (c) 2010 - Assaf Gordon, CSHL + - Aaron Quinlan, UVA + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ +#ifndef UNIONBEDGRAPHS_H +#define UNIONBEDGRAPHS_H + +#include <vector> +#include <string> +#include "bedGraphFile.h" +#include "genomeFile.h" +#include "intervalItem.h" + +class UnionBedGraphs +{ +private: + typedef BEDGRAPH_STR BEDGRAPH_TYPE; + + vector<string> filenames; + vector<string> titles; + + vector<BedGraphFile*> bedgraph_files; + vector<BEDGRAPH_TYPE::DEPTH_TYPE> current_depth; + vector<BEDGRAPH_TYPE> current_bedgraph_item; + + std::ostream &output; + + INTERVALS_PRIORITY_QUEUE queue; + std::string current_chrom; + int current_non_zero_inputs; + bool print_empty_regions; + + GenomeFile* genome_sizes; + + std::string no_coverage_value; + +public: + UnionBedGraphs(std::ostream& _output, + const vector<string>& _filenames, + const vector<string>& _titles, + bool _print_empty_regions, + const std::string& _genomeFileName, + const std::string& _no_coverage_value); + + virtual ~UnionBedGraphs(); + + // Combines all bedgraph files + void Union(); + + // Print the header line: chrom/start/end + name of each bedgraph file. + void PrintHeader(); + + +private: + + // Open all BedGraph files, initialize "current_XXX" vectors + void OpenBedgraphFiles(); + + // Close the BedGraph files. + void CloseBedgraphFiles(); + + /* + Add an interval from BedGraph file 'index' into the queue. + will only be added if it belongs to the current chromosome. + + If the interval was added (=consumed), the next interval will be read from the file + using 'LoadNextBedgraphItem' + */ + void AddInterval(int index); + + /* + Loads the next interval from BedGraph file 'index'. + Stores it in 'current_bedgraph_item' vector. + */ + void LoadNextBedgraphItem(int index); + + /* + Scans the 'current_bedgraph_item' vector, + find the 'first' chromosome to use (different BedGraph files can start with different chromosomes). + */ + std::string DetermineNextChrom(); + + /* + Returns 'true' if ALL intervals from ALL BedGraph files were used + */ + bool AllFilesDone(); + + /* + Extract the next coordinate from the queue, and updates the current coverage information. + If multiple interval share the same coordinate values, all of them are handled. + If an END coordinate is consumed, the next interval (from the corresponding file) is read. + */ + CHRPOS ConsumeNextCoordinate(); + + /* + Updates the coverage information based on the given item. + Item can be a START coordinate or an END coordiante. + */ + void UpdateInformation(const IntervalItem &item); + + /* + prints chrom/start/end and the current depth coverage values of all the files. + */ + void PrintCoverage(CHRPOS start, CHRPOS end); + + /* + prints chrom/start/end and the ZERO depth coverage values of all the files. + */ + void PrintEmptyCoverage(CHRPOS start, CHRPOS end); + + void DebugPrintQueue(); +}; + + +#endif diff --git a/src/unionBedGraphs/unionBedGraphsMain.cpp b/src/unionBedGraphs/unionBedGraphsMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..15f80709ec45fc31b2c37832e1044d2ac5cec557 --- /dev/null +++ b/src/unionBedGraphs/unionBedGraphsMain.cpp @@ -0,0 +1,271 @@ +/***************************************************************************** + unionBedGraphsMain.cpp + + (c) 2010 - Assaf Gordon, CSHL + - Aaron Quinlan, UVA + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ +#include <climits> +#include <cstring> +#include <cstdlib> +#include <vector> +#include <string> +#include <iostream> +#include <getopt.h> +#include <libgen.h> //for basename() +#include "version.h" + +#include "genomeFile.h" +#include "unionBedGraphs.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "unionBedGraphs" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +//STLized version of basename() +// (because POSIX basename() modifies the input string pointer) +// Additionally: removes any extension the basename might have. +std::string stl_basename(const std::string& path); + +// function declarations +void ShowHelp(void); +void ShowExamples(void); + + +int main(int argc, char* argv[]) +{ + bool haveFiles = false; + bool haveTitles = false; + bool haveGenome = false; + bool haveFiller = true; + bool printHeader = false; + bool printEmptyRegions = false; + bool showHelp = false; + string genomeFile; + string basePath; + string noCoverageValue = "0"; + vector<string> inputFiles; + vector<string> inputTitles; + + //Parse command line options + if(argc <= 1) + ShowHelp(); + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp == true) { + ShowHelp(); + exit(1); + } + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + haveFiles = true; + i = i+1; + string file = argv[i]; + while (file[0] != '-' && i < argc) { + inputFiles.push_back(file); + i++; + if (i < argc) + file = argv[i]; + } + i--; + } + } + else if(PARAMETER_CHECK("-names", 6, parameterLength)) { + if ((i+1) < argc) { + haveTitles = true; + i = i+1; + string title = argv[i]; + while (title[0] != '-' && i < argc) { + inputTitles.push_back(title); + i++; + if (i < argc) + title = argv[i]; + } + i--; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-filler", 7, parameterLength)) { + if ((i+1) < argc) { + haveFiller = true; + noCoverageValue = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-header", 7, parameterLength)) { + printHeader = true; + } + else if(PARAMETER_CHECK("-examples", 9, parameterLength)) { + ShowHelp(); + ShowExamples(); + exit(1); + } + } + + UnionBedGraphs mbg(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue); + if (printHeader) + mbg.PrintHeader(); + mbg.Union(); +} + +void ShowHelp(void) { + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Authors: Assaf Gordon, CSHL" << endl; + cerr << " Aaron Quinlan (aaronquinlan@gmail.com)" << endl << endl; + + cerr << "Summary: Combines multiple BedGraph files into a single file," << endl; + cerr << "\t allowing coverage comparisons between them." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i FILE1 FILE2 .. FILEn" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-header\t\t" << "Print a header line." << endl; + cerr << "\t\t\t(chrom/start/end + names of each file)." << endl << endl; + + cerr << "\t-names\t\t" << "A list of names (one / file) to describe each file in -i." << endl; + cerr << "\t\t\tThese names will be printed in the header line." << endl << endl; + + cerr << "\t-g\t\t" << "Use genome file to calculate empty regions." << endl; + cerr << "\t\t\t- STRING." << endl << endl; + + cerr << "\t-empty\t\t" << "Report empty regions (i.e., start/end intervals w/o" << endl; + cerr << "\t\t\tvalues in all files)." << endl; + cerr << "\t\t\t- Requires the '-g FILE' parameter.\n" << endl; + + cerr << "\t-filler TEXT\t" << "Use TEXT when representing intervals having no value." << endl; + cerr << "\t\t\t- Default is '0', but you can use 'N/A' or any other text." << endl << endl; + + cerr << "\t-examples\t" << "Show detailed usage examples." << endl << endl; +} + + + +void ShowExamples() +{ + cerr << "Example usage:\n\n" \ +"== Input files: ==\n" \ +"\n" \ +" $ cat 1.bg\n" \ +" chr1 1000 1500 10\n" \ +" chr1 2000 2100 20\n" \ +"\n" \ +" $ cat 2.bg\n" \ +" chr1 900 1600 60\n" \ +" chr1 1700 2050 50\n" \ +"\n" \ +" $ cat 3.bg\n" \ +" chr1 1980 2070 80\n" \ +" chr1 2090 2100 20\n" \ +"\n" \ +" $ cat sizes.txt\n" \ +" chr1 5000\n" \ +"\n" \ +"== Union/combine the files: ==\n" \ +"\n" \ +" $ unionBedGraphs -i 1.bg 2.bg 3.bg\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ +"\n" \ +"== Union/combine the files, with a header line (titles are the file names): ==\n" \ +"\n" \ +" $ unionBedGraphs -header -i 1.bg 2.bg 3.bg\n" \ +" chrom start end 1 2 3\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ +"\n" \ +"== Union/combine the files, with a header line and custom names: ==\n" \ +"\n" \ +" $ unionBedGraphs -header -i 1.bg 2.bg 3.bg -names WT-1 WT-2 KO-1\n" \ +" chrom start end WT-1 WT-2 KO-1\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ +"\n" \ +"== Union/combine, showing empty regions (note, requires -g): ==\n" \ +"\n" \ +" $ unionBedGraphs -header -empty -g sizes.TXT -i 1.bg 2.bg 3.bg\n" \ +" chrom start end 1 2 3\n" \ +" chr1 0 900 0 0 0\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1600 1700 0 0 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ +" chr1 2100 5000 0 0 0\n" \ +"\n" \ +; +} + +std::string stl_basename(const std::string& path) +{ + string result; + + char* path_dup = strdup(path.c_str()); + char* basename_part = basename(path_dup); + result = basename_part; + free(path_dup); + + size_t pos = result.find_last_of('.'); + if (pos != string::npos ) + result = result.substr(0,pos); + + return result; +} + diff --git a/src/utils/fileType/fileType.cpp b/src/utils/fileType/fileType.cpp index 3d04675f37700aa32c55485368fcc395484e5ad5..a0abd3e7b4111e3dc6e1d83f42d0dc56280656cf 100644 --- a/src/utils/fileType/fileType.cpp +++ b/src/utils/fileType/fileType.cpp @@ -12,8 +12,14 @@ #include "fileType.h" -bool isRegularFile(const string& filename) -{ + +/* + returns TRUE if the file is a regular file: + not a pipe/device. + + This implies that the file can be opened/closed/seek'd multiple times without losing information + */ +bool isRegularFile(const string& filename) { struct stat buf ; int i; @@ -28,8 +34,12 @@ bool isRegularFile(const string& filename) return false; } -bool isGzipFile(const string& filename) -{ + +/* + returns TRUE if the file has a GZIP header. + Should only be run on regular files. + */ +bool isGzipFile(const string& filename) { //see http://www.gzip.org/zlib/rfc-gzip.html#file-format struct { unsigned char id1; diff --git a/src/utils/fileType/fileType.h b/src/utils/fileType/fileType.h index cd9d14571027a18e4f9c2db3f4bb0ce9fc2890f1..0a5f818f2f09fac64742aaf719319c9073f24627 100644 --- a/src/utils/fileType/fileType.h +++ b/src/utils/fileType/fileType.h @@ -19,6 +19,9 @@ #include <string.h> #include <errno.h> #include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <sstream> using namespace std; @@ -28,6 +31,7 @@ using namespace std; Kindly contributed by Assaf Gordon. ******************************************************************************/ +string string_error(int errnum); bool isRegularFile(const string& filename); bool isGzipFile(const string& filename); diff --git a/src/utils/tabFile/tabFile.cpp b/src/utils/tabFile/tabFile.cpp index 323e8516cf2c0c82c9b800770006af0dded57559..6da08fee5ae1e360aaed59274402bcb8b9061eb0 100644 --- a/src/utils/tabFile/tabFile.cpp +++ b/src/utils/tabFile/tabFile.cpp @@ -88,10 +88,10 @@ TabLineStatus TabFile::GetNextTabLine(TAB_FIELDS &tabFields, int &lineNum) { lineNum++; // split into a string vector. - Tokenize(tabLine,tabFields); - - // parse the line and validate it. - return parseTabLine(tabFields, lineNum); + Tokenize(tabLine, tabFields); + + // parse the line and validate it + return parseTabLine(tabFields, lineNum); } // default if file is closed or EOF diff --git a/src/utils/tabFile/tabFile.h b/src/utils/tabFile/tabFile.h index 00c700634dfc2cbe0afd7ecc169b7312ab1eee43..9dc3bb55c9efa32146e989439a997678b0f5284b 100644 --- a/src/utils/tabFile/tabFile.h +++ b/src/utils/tabFile/tabFile.h @@ -64,8 +64,9 @@ private: if (lineVector.size() == 0) return TAB_BLANK; // real line with data - if (lineVector[0].find("#") == string::npos) + if (lineVector[0][0] != '#') { return TAB_VALID; + } // comment or header line else { lineNum--;