Skip to content
Snippets Groups Projects
Commit 761d4a58 authored by Aaron's avatar Aaron
Browse files

added cluster tool. both cluster and merge require sorted files for efficiency.

parent c045c254
No related branches found
No related tags found
No related merge requests found
......@@ -23,6 +23,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/bedToIgv \
$(SRC_DIR)/bed12ToBed6 \
$(SRC_DIR)/closestBed \
$(SRC_DIR)/clusterBed \
$(SRC_DIR)/complementBed \
$(SRC_DIR)/coverageBed \
$(SRC_DIR)/fastaFromBed \
......
......@@ -14,7 +14,8 @@ def main():
'bed12tobed6': 'bed12ToBed6',
'bedpetobam': 'bedpeToBam',
'bedtobam': 'bedToBam',
'closest': 'closestBed',
'closest': 'closestBed',
'cluster': 'clusterBed',
'complement': 'complementBed',
'coverage': 'coverageBed',
'flank': 'flankBed',
......
......@@ -41,6 +41,7 @@ int bedtobam_main(int argc, char* argv[]);//
int bedtoigv_main(int argc, char* argv[]);//
int bedpetobam_main(int argc, char* argv[]);//
int closest_main(int argc, char* argv[]); //
int cluster_main(int argc, char* argv[]); //
int complement_main(int argc, char* argv[]);//
int coverage_main(int argc, char* argv[]); //
int fastafrombed_main(int argc, char* argv[]);//
......@@ -82,6 +83,7 @@ int main(int argc, char *argv[])
else if (sub_cmd == "coverage") return coverage_main(argc-1, argv+1);
else if (sub_cmd == "genomecov") return genomecoverage_main(argc-1, argv+1);
else if (sub_cmd == "merge") return merge_main(argc-1, argv+1);
else if (sub_cmd == "cluster") return cluster_main(argc-1, argv+1);
else if (sub_cmd == "complement") return complement_main(argc-1, argv+1);
else if (sub_cmd == "subtract") return subtract_main(argc-1, argv+1);
else if (sub_cmd == "slop") return slop_main(argc-1, argv+1);
......@@ -171,6 +173,7 @@ int bedtools_help(void)
cout << " coverage " << "Compute the coverage over defined intervals.\n";
cout << " genomecov " << "Compute the coverage over an entire genome.\n";
cout << " merge " << "Combine overlapping/nearby intervals into a single interval.\n";
cout << " cluster " << "Cluster (but don't merge) overlapping/nearby intervals.\n";
cout << " complement " << "Extract intervals _not_ represented by an interval file.\n";
cout << " subtract " << "Remove intervals based on overlaps b/w two files.\n";
cout << " slop " << "Adjust the size of intervals.\n";
......
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= clusterMain.cpp clusterBed.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
.PHONY: clean
\ No newline at end of file
/*****************************************************************************
clusterBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "clusterBed.h"
// = Constructor =
BedCluster::BedCluster(string &bedFile,
int maxDistance,
bool forceStrand)
:
_bedFile(bedFile),
_forceStrand(forceStrand),
_maxDistance(maxDistance)
{
_bed = new BedFile(bedFile);
if (_forceStrand == false)
ClusterBed();
else
ClusterBedStranded();
}
// = Destructor =
BedCluster::~BedCluster(void)
{}
// = Cluster overlapping or nearby BED entries =
void BedCluster::ClusterBed() {
uint32_t cluster_id = 0;
BED prev, curr;
int end = -1;
_bed->Open();
while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals
if (_bed->_status != BED_VALID)
continue;
// new cluster, no overlap
if ( (((int) curr.start - end) > _maxDistance) ||
(curr.chrom != prev.chrom)
)
{
cluster_id++;
end = curr.end;
}
prev = curr;
_bed->reportBedTab(curr);
printf("%d\n", cluster_id);
}
}
// = Cluster overlapping BED entries, accounting for strandedness =
void BedCluster::ClusterBedStranded() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second;
// make a list of the two strands to merge separately.
vector<string> strands(2);
strands[0] = "+";
strands[1] = "-";
uint32_t cluster_id = 0;
// do two passes, one for each strand.
for (unsigned int s = 0; s < strands.size(); s++) {
// cluster overlapping features for this chromosome.
int end = -1;
BED prev;
vector<BED>::const_iterator bedItr = bedList.begin();
vector<BED>::const_iterator bedEnd = bedList.end();
for (; bedItr != bedEnd; ++bedItr) {
// if forcing strandedness, move on if the hit
// is not on the current strand.
if (bedItr->strand != strands[s])
continue;
// new cluster, no overlap
if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0))
{
cluster_id++;
end = bedItr->end;
}
// same cluster, overlaps
else {
if ((int) bedItr->end > end)
end = bedItr->end;
}
prev = *bedItr;
_bed->reportBedTab(prev);
printf("%d\n", cluster_id);
}
}
}
}
/*****************************************************************************
clusterBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "bedFile.h"
#include <vector>
#include <algorithm>
#include <numeric>
#include <iostream>
#include <fstream>
#include <limits.h>
#include <stdlib.h>
using namespace std;
//************************************************
// Class methods and elements
//************************************************
class BedCluster {
public:
// constructor
BedCluster(string &bedFile, int maxDistance, bool forceStrand);
// destructor
~BedCluster(void);
// find clusters
void ClusterBed();
// find clusters based on strand
void ClusterBedStranded();
private:
string _bedFile;
bool _forceStrand;
int _maxDistance;
// instance of a bed file class.
BedFile *_bed;
};
/*****************************************************************************
clusterMain.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "clusterBed.h"
#include "version.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "bedtools cluster"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void cluster_help(void);
int cluster_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedFile = "stdin";
int maxDistance = 0;
// input arguments
bool haveBed = true;
bool haveMaxDistance = false;
bool forceStrand = false;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) cluster_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
if ((i+1) < argc) {
bedFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-d", 2, parameterLength)) {
if ((i+1) < argc) {
haveMaxDistance = true;
maxDistance = atoi(argv[i + 1]);
i++;
}
}
else if (PARAMETER_CHECK("-s", 2, parameterLength)) {
forceStrand = true;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBed) {
cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedCluster *bc = new BedCluster(bedFile, maxDistance, forceStrand);
delete bc;
}
else {
cluster_help();
}
return 0;
}
void cluster_help(void) {
cerr << "\nTool: bedtools cluster" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Clusters overlapping/nearby BED/GFF/VCF intervals." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl;
cerr << "\t\tthat are the same strand." << endl;
cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl;
cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl;
cerr << "\t\tto be merged." << endl;
cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl;
cerr << "\t will in BED format with zero-based starts" << endl << endl;
// end the program here
exit(1);
}
......@@ -241,54 +241,47 @@ void BedMerge::ReportStranded(string chrom, int start, int end,
// = Merge overlapping BED entries into a single entry =
// =====================================================
void BedMerge::MergeBed() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second;
int mergeCount = 1;
vector<string> names;
vector<string> scores;
// merge overlapping features for this chromosome.
int start = -1;
int end = -1;
vector<BED>::const_iterator bedItr = bedList.begin();
vector<BED>::const_iterator bedEnd = bedList.end();
for (; bedItr != bedEnd; ++bedItr) {
// new block, no overlap
if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
if (start >= 0) {
Report(chrom, start, end, names, scores, mergeCount);
// reset
mergeCount = 1;
names.clear();
scores.clear();
}
start = bedItr->start;
end = bedItr->end;
if (!bedItr->name.empty()) names.push_back(bedItr->name);
if (!bedItr->score.empty()) scores.push_back(bedItr->score);
}
// same block, overlaps
else {
if ((int) bedItr-> end > end) end = bedItr->end;
mergeCount++;
if (!bedItr->name.empty()) names.push_back(bedItr->name);
if (!bedItr->score.empty()) scores.push_back(bedItr->score);
int mergeCount = 1;
vector<string> names;
vector<string> scores;
int start = -1;
int end = -1;
BED prev, curr;
_bed->Open();
while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals
if (_bed->_status != BED_VALID)
continue;
// new block, no overlap
if ( (((int) curr.start - end) > _maxDistance) || (curr.chrom != prev.chrom)) {
if (start >= 0) {
Report(prev.chrom, start, end, names, scores, mergeCount);
// reset
mergeCount = 1;
names.clear();
scores.clear();
}
start = curr.start;
end = curr.end;
if (!curr.name.empty())
names.push_back(curr.name);
if (!curr.score.empty())
scores.push_back(curr.score);
}
if (start >= 0) {
Report(chrom, start, end, names, scores, mergeCount);
// same block, overlaps
else {
if ((int) curr.end > end)
end = curr.end;
if (!curr.name.empty())
names.push_back(curr.name);
if (!curr.score.empty())
scores.push_back(curr.score);
mergeCount++;
}
prev = curr;
}
if (start >= 0) {
Report(prev.chrom, start, end, names, scores, mergeCount);
}
}
......
......@@ -242,7 +242,11 @@ bool BedFile::GetNextBed(BED &bed, bool forceSorted) {
_prev_start = bed.start;
}
else if (forceSorted) {
cerr << "ERROR: input file: (" << bedFile << ") is not sorted by chrom then start" << endl;
cerr << "ERROR: input file: (" << bedFile
<< ") is not sorted by chrom then start." << endl
<< " The start coordinate at line " << _lineNum
<< " is less than the start at line " << _lineNum-1
<< endl;
exit(1);
}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment