From 761d4a58c738e61bac1ce2662cc52684755e94cb Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Fri, 30 Dec 2011 12:48:53 -0500 Subject: [PATCH] added cluster tool. both cluster and merge require sorted files for efficiency. --- Makefile | 1 + scripts/makeBashScripts.py | 3 +- src/bedtools.cpp | 3 + src/clusterBed/Makefile | 43 ++++++++++++ src/clusterBed/clusterBed.cpp | 115 +++++++++++++++++++++++++++++++ src/clusterBed/clusterBed.h | 46 +++++++++++++ src/clusterBed/clusterMain.cpp | 121 +++++++++++++++++++++++++++++++++ src/mergeBed/mergeBed.cpp | 83 +++++++++++----------- src/utils/bedFile/bedFile.cpp | 6 +- 9 files changed, 374 insertions(+), 47 deletions(-) create mode 100644 src/clusterBed/Makefile create mode 100644 src/clusterBed/clusterBed.cpp create mode 100644 src/clusterBed/clusterBed.h create mode 100644 src/clusterBed/clusterMain.cpp diff --git a/Makefile b/Makefile index 167fc1e8..c1cdc30a 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/bedToIgv \ $(SRC_DIR)/bed12ToBed6 \ $(SRC_DIR)/closestBed \ + $(SRC_DIR)/clusterBed \ $(SRC_DIR)/complementBed \ $(SRC_DIR)/coverageBed \ $(SRC_DIR)/fastaFromBed \ diff --git a/scripts/makeBashScripts.py b/scripts/makeBashScripts.py index 410f5ffb..3e9c4128 100644 --- a/scripts/makeBashScripts.py +++ b/scripts/makeBashScripts.py @@ -14,7 +14,8 @@ def main(): 'bed12tobed6': 'bed12ToBed6', 'bedpetobam': 'bedpeToBam', 'bedtobam': 'bedToBam', - 'closest': 'closestBed', + 'closest': 'closestBed', + 'cluster': 'clusterBed', 'complement': 'complementBed', 'coverage': 'coverageBed', 'flank': 'flankBed', diff --git a/src/bedtools.cpp b/src/bedtools.cpp index 0bcc5c2a..5e294a9c 100644 --- a/src/bedtools.cpp +++ b/src/bedtools.cpp @@ -41,6 +41,7 @@ int bedtobam_main(int argc, char* argv[]);// int bedtoigv_main(int argc, char* argv[]);// int bedpetobam_main(int argc, char* argv[]);// int closest_main(int argc, char* argv[]); // +int cluster_main(int argc, char* argv[]); // int complement_main(int argc, char* argv[]);// int coverage_main(int argc, char* argv[]); // int fastafrombed_main(int argc, char* argv[]);// @@ -82,6 +83,7 @@ int main(int argc, char *argv[]) else if (sub_cmd == "coverage") return coverage_main(argc-1, argv+1); else if (sub_cmd == "genomecov") return genomecoverage_main(argc-1, argv+1); else if (sub_cmd == "merge") return merge_main(argc-1, argv+1); + else if (sub_cmd == "cluster") return cluster_main(argc-1, argv+1); else if (sub_cmd == "complement") return complement_main(argc-1, argv+1); else if (sub_cmd == "subtract") return subtract_main(argc-1, argv+1); else if (sub_cmd == "slop") return slop_main(argc-1, argv+1); @@ -171,6 +173,7 @@ int bedtools_help(void) cout << " coverage " << "Compute the coverage over defined intervals.\n"; cout << " genomecov " << "Compute the coverage over an entire genome.\n"; cout << " merge " << "Combine overlapping/nearby intervals into a single interval.\n"; + cout << " cluster " << "Cluster (but don't merge) overlapping/nearby intervals.\n"; cout << " complement " << "Extract intervals _not_ represented by an interval file.\n"; cout << " subtract " << "Remove intervals based on overlaps b/w two files.\n"; cout << " slop " << "Adjust the size of intervals.\n"; diff --git a/src/clusterBed/Makefile b/src/clusterBed/Makefile new file mode 100644 index 00000000..59832db5 --- /dev/null +++ b/src/clusterBed/Makefile @@ -0,0 +1,43 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= clusterMain.cpp clusterBed.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean \ No newline at end of file diff --git a/src/clusterBed/clusterBed.cpp b/src/clusterBed/clusterBed.cpp new file mode 100644 index 00000000..94b8a809 --- /dev/null +++ b/src/clusterBed/clusterBed.cpp @@ -0,0 +1,115 @@ +/***************************************************************************** + clusterBed.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "clusterBed.h" + +// = Constructor = +BedCluster::BedCluster(string &bedFile, + int maxDistance, + bool forceStrand) + : + _bedFile(bedFile), + _forceStrand(forceStrand), + _maxDistance(maxDistance) +{ + _bed = new BedFile(bedFile); + + if (_forceStrand == false) + ClusterBed(); + else + ClusterBedStranded(); +} + + +// = Destructor = +BedCluster::~BedCluster(void) +{} + + +// = Cluster overlapping or nearby BED entries = +void BedCluster::ClusterBed() { + + uint32_t cluster_id = 0; + BED prev, curr; + int end = -1; + + _bed->Open(); + while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals + if (_bed->_status != BED_VALID) + continue; + // new cluster, no overlap + if ( (((int) curr.start - end) > _maxDistance) || + (curr.chrom != prev.chrom) + ) + { + cluster_id++; + end = curr.end; + } + prev = curr; + _bed->reportBedTab(curr); + printf("%d\n", cluster_id); + } +} + + +// = Cluster overlapping BED entries, accounting for strandedness = +void BedCluster::ClusterBedStranded() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); + + // loop through each chromosome and merge their BED entries + masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); + masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); + for (; m != mEnd; ++m) { + + // bedList is already sorted by start position. + string chrom = m->first; + vector<BED> bedList = m->second; + + // make a list of the two strands to merge separately. + vector<string> strands(2); + strands[0] = "+"; + strands[1] = "-"; + uint32_t cluster_id = 0; + // do two passes, one for each strand. + for (unsigned int s = 0; s < strands.size(); s++) { + // cluster overlapping features for this chromosome. + int end = -1; + BED prev; + vector<BED>::const_iterator bedItr = bedList.begin(); + vector<BED>::const_iterator bedEnd = bedList.end(); + for (; bedItr != bedEnd; ++bedItr) { + // if forcing strandedness, move on if the hit + // is not on the current strand. + if (bedItr->strand != strands[s]) + continue; + + // new cluster, no overlap + if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) + { + cluster_id++; + end = bedItr->end; + } + // same cluster, overlaps + else { + if ((int) bedItr->end > end) + end = bedItr->end; + } + prev = *bedItr; + _bed->reportBedTab(prev); + printf("%d\n", cluster_id); + } + } + } +} diff --git a/src/clusterBed/clusterBed.h b/src/clusterBed/clusterBed.h new file mode 100644 index 00000000..18eab172 --- /dev/null +++ b/src/clusterBed/clusterBed.h @@ -0,0 +1,46 @@ +/***************************************************************************** + clusterBed.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "bedFile.h" +#include <vector> +#include <algorithm> +#include <numeric> +#include <iostream> +#include <fstream> +#include <limits.h> +#include <stdlib.h> + +using namespace std; + + +//************************************************ +// Class methods and elements +//************************************************ +class BedCluster { + +public: + + // constructor + BedCluster(string &bedFile, int maxDistance, bool forceStrand); + // destructor + ~BedCluster(void); + // find clusters + void ClusterBed(); + // find clusters based on strand + void ClusterBedStranded(); + +private: + string _bedFile; + bool _forceStrand; + int _maxDistance; + // instance of a bed file class. + BedFile *_bed; +}; diff --git a/src/clusterBed/clusterMain.cpp b/src/clusterBed/clusterMain.cpp new file mode 100644 index 00000000..243be297 --- /dev/null +++ b/src/clusterBed/clusterMain.cpp @@ -0,0 +1,121 @@ +/***************************************************************************** + clusterMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "clusterBed.h" +#include "version.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "bedtools cluster" + + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void cluster_help(void); + +int cluster_main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + int maxDistance = 0; + + // input arguments + bool haveBed = true; + bool haveMaxDistance = false; + bool forceStrand = false; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) cluster_help(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-d", 2, parameterLength)) { + if ((i+1) < argc) { + haveMaxDistance = true; + maxDistance = atoi(argv[i + 1]); + i++; + } + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedCluster *bc = new BedCluster(bedFile, maxDistance, forceStrand); + delete bc; + } + else { + cluster_help(); + } + return 0; +} + +void cluster_help(void) { + + cerr << "\nTool: bedtools cluster" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Clusters overlapping/nearby BED/GFF/VCF intervals." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl; + cerr << "\t\tthat are the same strand." << endl; + cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl; + + cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl; + cerr << "\t\tto be merged." << endl; + cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl; + cerr << "\t will in BED format with zero-based starts" << endl << endl; + + + // end the program here + exit(1); + +} diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp index 393eefea..6521e4a7 100644 --- a/src/mergeBed/mergeBed.cpp +++ b/src/mergeBed/mergeBed.cpp @@ -241,54 +241,47 @@ void BedMerge::ReportStranded(string chrom, int start, int end, // = Merge overlapping BED entries into a single entry = // ===================================================== void BedMerge::MergeBed() { - - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - // loop through each chromosome and merge their BED entries - masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); - masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); - for (; m != mEnd; ++m) { - - // bedList is already sorted by start position. - string chrom = m->first; - vector<BED> bedList = m->second; - int mergeCount = 1; - vector<string> names; - vector<string> scores; - - // merge overlapping features for this chromosome. - int start = -1; - int end = -1; - vector<BED>::const_iterator bedItr = bedList.begin(); - vector<BED>::const_iterator bedEnd = bedList.end(); - for (; bedItr != bedEnd; ++bedItr) { - // new block, no overlap - if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { - if (start >= 0) { - Report(chrom, start, end, names, scores, mergeCount); - // reset - mergeCount = 1; - names.clear(); - scores.clear(); - } - start = bedItr->start; - end = bedItr->end; - if (!bedItr->name.empty()) names.push_back(bedItr->name); - if (!bedItr->score.empty()) scores.push_back(bedItr->score); - } - // same block, overlaps - else { - if ((int) bedItr-> end > end) end = bedItr->end; - mergeCount++; - if (!bedItr->name.empty()) names.push_back(bedItr->name); - if (!bedItr->score.empty()) scores.push_back(bedItr->score); + int mergeCount = 1; + vector<string> names; + vector<string> scores; + int start = -1; + int end = -1; + BED prev, curr; + + _bed->Open(); + while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals + if (_bed->_status != BED_VALID) + continue; + // new block, no overlap + if ( (((int) curr.start - end) > _maxDistance) || (curr.chrom != prev.chrom)) { + if (start >= 0) { + Report(prev.chrom, start, end, names, scores, mergeCount); + // reset + mergeCount = 1; + names.clear(); + scores.clear(); } + start = curr.start; + end = curr.end; + if (!curr.name.empty()) + names.push_back(curr.name); + if (!curr.score.empty()) + scores.push_back(curr.score); } - if (start >= 0) { - Report(chrom, start, end, names, scores, mergeCount); + // same block, overlaps + else { + if ((int) curr.end > end) + end = curr.end; + if (!curr.name.empty()) + names.push_back(curr.name); + if (!curr.score.empty()) + scores.push_back(curr.score); + mergeCount++; } + prev = curr; + } + if (start >= 0) { + Report(prev.chrom, start, end, names, scores, mergeCount); } } diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 221bb220..d3eaa988 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -242,7 +242,11 @@ bool BedFile::GetNextBed(BED &bed, bool forceSorted) { _prev_start = bed.start; } else if (forceSorted) { - cerr << "ERROR: input file: (" << bedFile << ") is not sorted by chrom then start" << endl; + cerr << "ERROR: input file: (" << bedFile + << ") is not sorted by chrom then start." << endl + << " The start coordinate at line " << _lineNum + << " is less than the start at line " << _lineNum-1 + << endl; exit(1); } } -- GitLab