Skip to content
Snippets Groups Projects
Commit e23d18d9 authored by Aaron's avatar Aaron
Browse files

Removed groupBy from the BEDTools distribution.

It now lives in the "filo" package.
parent 4a64280e
No related branches found
No related tags found
No related merge requests found
...@@ -22,8 +22,8 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ ...@@ -22,8 +22,8 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/complementBed \ $(SRC_DIR)/complementBed \
$(SRC_DIR)/coverageBed \ $(SRC_DIR)/coverageBed \
$(SRC_DIR)/fastaFromBed \ $(SRC_DIR)/fastaFromBed \
$(SRC_DIR)/fjoin \
$(SRC_DIR)/genomeCoverageBed \ $(SRC_DIR)/genomeCoverageBed \
$(SRC_DIR)/groupBy \
$(SRC_DIR)/intersectBed \ $(SRC_DIR)/intersectBed \
$(SRC_DIR)/linksBed \ $(SRC_DIR)/linksBed \
$(SRC_DIR)/maskFastaFromBed \ $(SRC_DIR)/maskFastaFromBed \
......
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/tabFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= groupBy.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=tabFile.o lineFileUtilities.o gzstream.o fileType.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= groupBy
all: $(PROGRAM)
.PHONY: all
$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS)
@echo " * linking $(PROGRAM)"
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS)
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/tabFile/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
.PHONY: clean
/*****************************************************************************
groupBy.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include <vector>
#include <map>
#include <numeric>
#include <iterator>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <stdlib.h>
#include <math.h>
#include <limits.h>
#include <string.h>
#include <exception>
#include <stdexcept> // out_of_range exception
#include "version.h"
#include "lineFileUtilities.h"
#include "tabFile.h"
using namespace std;
const int PRECISION = 21;
// define our program name
#define PROGRAM_NAME "groupBy"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
#define LOOKS_LIKE_A_PARAM(string) (strlen(string)>0 && string[0]=='-')
struct ValueGreaterThan
{
bool operator()( const vector< pair<int, string> >::value_type& lhs,
const vector< pair<int, string> >::value_type& rhs ) const
{
return lhs.first > rhs.first;
}
};
struct ValueLessThan
{
bool operator()( const vector< pair<int, string> >::value_type& lhs,
const vector< pair<int, string> >::value_type& rhs ) const
{
return lhs.first < rhs.first;
}
};
// function declarations
void ShowHelp(void);
void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops);
void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops);
void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum);
float ToFloat (string element);
double ToDouble(const string &element);
void TabPrintPost (string element);
void TabPrintPre (string element);
void CommaPrint (string element);
int main(int argc, char* argv[]) {
// input files
string inFile = "stdin";
string groupColumnsString = "1,2,3";
string opsColumnString;
string opsString;
// our configuration variables
bool showHelp = false;
bool haveOpColumns = false;
bool haveOps = true;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) ShowHelp();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -i parameter requires a value." << endl << endl;
ShowHelp();
break;
}
else {
inFile = argv[i + 1];
i++;
}
}
else if (PARAMETER_CHECK("-grp", 4, parameterLength) || PARAMETER_CHECK("-g", 2, parameterLength)) {
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -grp parameter requires a value." << endl << endl;
ShowHelp();
break;
}
else {
groupColumnsString = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-opCols", 7, parameterLength) || PARAMETER_CHECK("-c", 2, parameterLength)) {
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -opCols parameter requires a value." << endl << endl;
ShowHelp();
break;
}
else {
haveOpColumns = true;
opsColumnString = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-ops", 4, parameterLength) || PARAMETER_CHECK("-o", 2, parameterLength)) {
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -ops parameter requires a value." << endl << endl;
ShowHelp();
break;
}
else {
haveOps = true;
opsString = argv[i + 1];
i++;
}
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
if (!haveOpColumns) {
cerr << endl << "*****" << endl << "*****ERROR: Need -opCols." << endl << "*****" << endl;
showHelp = true;
}
// split the opsString into discrete operations and make sure they are all valid.
vector<string> ops;
Tokenize(opsString, ops, ",");
for( size_t i = 0; i < ops.size(); i++ ) {
if ((ops[i] != "sum") && (ops[i] != "max") && (ops[i] != "min") && (ops[i] != "mean") &&
(ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") &&
(ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse") && (ops[i] != "freqdesc") &&
(ops[i] != "freqasc")) {
cerr << endl << "*****" << endl << "*****ERROR: Invalid operation selection \"" << ops[i] << endl << "*****" << endl;
showHelp = true;
}
}
if (!showHelp) {
// Split the column string sent by the user into discrete column numbers
// A comma separated string is expected.
vector<int> groupColumnsInt;
Tokenize(groupColumnsString, groupColumnsInt, ",");
vector<int> opColumnsInt;
Tokenize(opsColumnString, opColumnsInt, ",");
// sanity check the group columns
for(size_t i = 0; i < groupColumnsInt.size(); ++i) {
int groupColumnInt = groupColumnsInt[i];
if (groupColumnInt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl;
ShowHelp();
}
}
// sanity check the op columns
for(size_t i = 0; i < opColumnsInt.size(); ++i) {
int opColumnInt = opColumnsInt[i];
if (opColumnInt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl;
ShowHelp();
}
}
// sanity check that there are equal number of opColumns and ops
if (ops.size() != opColumnsInt.size()) {
cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. " << endl << "*****" << endl;
ShowHelp();
}
GroupBy(inFile, groupColumnsInt, opColumnsInt, ops);
}
else {
ShowHelp();
}
}
void ShowHelp(void) {
cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl;
cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl;
cerr << "Summary: Summarizes a dataset column based upon" << endl;
cerr << "\t common column groupings. Akin to the SQL \"group by\" command." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " -i <input> -g <group_column(s)> -c <op_column(s)> -o <ops> " << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-i\t\t" << "Input file. Use \"stdin\" for pipes." << endl << endl;
cerr << "\t-g -grp\t\t" << "Specify the columns (1-based) for the grouping." << endl;
cerr << "\t\t\tThe columns must be comma separated." << endl;
cerr << "\t\t\t- Default: 1,2,3" << endl << endl;
cerr << "\t-c -opCols\t" << "Specify the column (1-based) that should be summarized." << endl;
cerr << "\t\t\t- Required." << endl << endl;
cerr << "\t-o -ops\t\t" << "Specify the operation that should be applied to opCol." << endl;
cerr << "\t\t\tValid operations:" << endl;
cerr << "\t\t\t sum, count, min, max," << endl;
cerr << "\t\t\t mean, median, mode, antimode," << endl;
cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl;
cerr << "\t\t\t collapse (i.e., print a comma separated list), " << endl;
cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl;
cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl;
cerr << "\t\t\t- Default: sum" << endl << endl;
cerr << "Examples: " << endl;
cerr << "\t$ cat ex1.out" << endl;
cerr << "\tchr1 10 20 A chr1 15 25 B.1 1000" << endl;
cerr << "\tchr1 10 20 A chr1 25 35 B.2 10000" << endl << endl;
cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum" << endl;
cerr << "\tchr1 10 20 A 11000" << endl << endl;
cerr << "\t$ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max" << endl;
cerr << "\tchr1 10 20 A 11000 10000" << endl << endl;
cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl;
cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The input file/stream should be sorted/grouped by the -grp. columns" << endl << endl;
cerr << "\t(2) If -i is unspecified, input is assumed to come from stdin." << endl << endl;
// end the program here
exit(1);
}
void GroupBy (const string &inFile,
const vector<int> &groupColumns,
const vector<int> &opColumns,
const vector<string> &ops) {
// current line number
int lineNum = 0;
// string representing current line
string inLine;
// vector of strings holding the tokenized current line
vector<string> inFields;
inFields.reserve(20);
// keys for the current and previous group
vector<string> prevGroup(0);
vector<string> currGroup(0);
// vector (one per column) of vector (one per value/column) of the opColumn values for the current group
vector< vector<string> > values;
for( size_t i = 0; i < opColumns.size(); i++ ) {
values.push_back( vector<string>() );
}
// check the status of the current line
TabLineStatus tabLineStatus;
// open a new tab file, loop through it line by line
// and summarize the data for a given group when the group
// fields change
TabFile *_tab = new TabFile(inFile);
_tab->Open();
while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) {
if (tabLineStatus == TAB_VALID) {
// build the group vector for the current line
currGroup.clear();
vector<int>::const_iterator gIt = groupColumns.begin();
vector<int>::const_iterator gEnd = groupColumns.end();
for (; gIt != gEnd; ++gIt)
addValue(inFields, currGroup, (*gIt-1), lineNum);
// there has been a group change
if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
// Summarize this group
ReportSummary(prevGroup, values, ops);
// reset and add the first value for the next group.
values.clear();
for( size_t i = 0; i < opColumns.size(); i++ ) {
values.push_back( vector<string>() );
addValue(inFields, values[i], opColumns[i]-1, lineNum);
}
}
// we're still dealing with the same group
else {
for( size_t i = 0; i < opColumns.size(); i++ )
addValue(inFields, values[i], opColumns[i]-1, lineNum);
}
// reset for the next line
prevGroup = currGroup;
}
inFields.clear();
}
// report the last group
ReportSummary(currGroup, values, ops);
_tab->Close();
}
void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops) {
vector<string> result;
for( size_t i = 0; i < data.size(); i++ ) {
string op = ops[i];
std::stringstream buffer;
vector<double> dataF;
// are we doing a numeric conversion? if so, convert the strings to doubles.
if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") ||
(op == "median") || (op == "stdev") || (op == "sstdev"))
{
transform(data[i].begin(), data[i].end(), back_inserter(dataF), ToDouble);
}
if (op == "sum") {
// sum them up
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
buffer << setprecision (PRECISION) << total;
result.push_back(buffer.str());
}
else if (op == "collapse") {
string collapse;
for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter
collapse.append(data[i][j]);
collapse.append(",");
}
result.push_back(collapse);
}
else if (op == "min") {
buffer << setprecision (PRECISION) << *min_element( dataF.begin(), dataF.end() );
result.push_back(buffer.str());
}
else if (op == "max") {
buffer << setprecision (PRECISION) << *max_element( dataF.begin(), dataF.end() );
result.push_back(buffer.str());
}
else if (op == "mean") {
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
buffer << setprecision (PRECISION) << mean;
result.push_back(buffer.str());
}
else if (op == "median") {
double median = 0.0;
sort(dataF.begin(), dataF.end());
int totalLines = dataF.size();
if ((totalLines % 2) > 0) {
long mid;
mid = totalLines / 2;
median = dataF[mid];
}
else {
long midLow, midHigh;
midLow = (totalLines / 2) - 1;
midHigh = (totalLines / 2);
median = (dataF[midLow] + dataF[midHigh]) / 2.0;
}
buffer << setprecision (PRECISION) << median;
result.push_back(buffer.str());
}
else if (op == "count") {
buffer << setprecision (PRECISION) << data[i].size();
result.push_back(buffer.str());
}
else if ((op == "mode") || (op == "antimode") ||
(op == "freqdesc") || (op == "freqasc")) {
// compute the frequency of each unique value
map<string, int> freqs;
vector<string>::const_iterator dIt = data[i].begin();
vector<string>::const_iterator dEnd = data[i].end();
for (; dIt != dEnd; ++dIt) {
freqs[*dIt]++;
}
// grab the mode and the anti mode
string mode, antiMode;
int count = 0;
int minCount = INT_MAX;
for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
if (iter->second > count) {
mode = iter->first;
count = iter->second;
}
if (iter->second < minCount) {
antiMode = iter->first;
minCount = iter->second;
}
}
// report
if (op == "mode") {
buffer << setprecision (PRECISION) << mode;
result.push_back(buffer.str());
}
else if (op == "antimode") {
buffer << setprecision (PRECISION) << antiMode;
result.push_back(buffer.str());
}
else if (op == "freqdesc" || op == "freqasc") {
// pair for the num times a values was
// observed (1) and the value itself (2)
pair<int, string> freqPair;
vector< pair<int, string> > freqList;
// create a list of pairs of all the observed values (second)
// and their occurences (first)
map<string,int>::const_iterator mapIter = freqs.begin();
map<string,int>::const_iterator mapEnd = freqs.end();
for(; mapIter != mapEnd; ++mapIter)
freqList.push_back( make_pair(mapIter->second, mapIter->first) );
// sort the list of pairs in the requested order by the frequency
// this will make the value that was observed least/most bubble to the top
if (op == "freqdesc")
sort(freqList.begin(), freqList.end(), ValueGreaterThan());
else if (op == "freqasc")
sort(freqList.begin(), freqList.end(), ValueLessThan());
// record all of the values and their frequencies.
vector< pair<int, string> >::const_iterator iter = freqList.begin();
vector< pair<int, string> >::const_iterator iterEnd = freqList.end();
for (; iter != iterEnd; ++iter)
buffer << iter->second << ":" << iter->first << ",";
result.push_back(buffer.str());
}
}
else if (op == "stdev" || op == "sstdev") {
// get the mean
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
// get the variance
double totalVariance = 0.0;
vector<double>::const_iterator dIt = dataF.begin();
vector<double>::const_iterator dEnd = dataF.end();
for (; dIt != dEnd; ++dIt) {
totalVariance += pow((*dIt - mean),2);
}
double variance = 0.0;
if (op == "stdev") {
variance = totalVariance / dataF.size();
}
else if (op == "sstdev" && dataF.size() > 1) {
variance = totalVariance / (dataF.size() - 1);
}
double stddev = sqrt(variance);
// report
buffer << setprecision (PRECISION) << stddev;
result.push_back(buffer.str());
}
}
for_each(group.begin(), group.end(), TabPrintPost);
cout << *result.begin();
for_each(++result.begin(), result.end(), TabPrintPre);
cout << endl; //Gets rid of extraneous tab
}
void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum) {
try {
toList.push_back(fromList.at(index));
}
catch(std::out_of_range& e) {
cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line "
<< lineNum << ". Exiting." << endl << "*****" << endl;
exit(1);
}
}
float ToFloat (string element) {
return atof(element.c_str());
}
void TabPrintPost (string element) {
cout << element << "\t";
}
void TabPrintPre (string element) {
cout << "\t" << element;
}
void CommaPrint (string element) {
cout << element << ",";
}
double ToDouble(const string &element) {
std::istringstream i(element);
double x;
if (!(i >> x)) {
cerr << "Error: Could not properly convert string to numeric (\"" + element + "\")" << endl;
exit(1);
}
return x;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment