From 761d4a58c738e61bac1ce2662cc52684755e94cb Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Fri, 30 Dec 2011 12:48:53 -0500
Subject: [PATCH] added cluster tool.  both cluster and merge require sorted
 files for efficiency.

---
 Makefile                       |   1 +
 scripts/makeBashScripts.py     |   3 +-
 src/bedtools.cpp               |   3 +
 src/clusterBed/Makefile        |  43 ++++++++++++
 src/clusterBed/clusterBed.cpp  | 115 +++++++++++++++++++++++++++++++
 src/clusterBed/clusterBed.h    |  46 +++++++++++++
 src/clusterBed/clusterMain.cpp | 121 +++++++++++++++++++++++++++++++++
 src/mergeBed/mergeBed.cpp      |  83 +++++++++++-----------
 src/utils/bedFile/bedFile.cpp  |   6 +-
 9 files changed, 374 insertions(+), 47 deletions(-)
 create mode 100644 src/clusterBed/Makefile
 create mode 100644 src/clusterBed/clusterBed.cpp
 create mode 100644 src/clusterBed/clusterBed.h
 create mode 100644 src/clusterBed/clusterMain.cpp

diff --git a/Makefile b/Makefile
index 167fc1e8..c1cdc30a 100644
--- a/Makefile
+++ b/Makefile
@@ -23,6 +23,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
 		  $(SRC_DIR)/bedToIgv \
 		  $(SRC_DIR)/bed12ToBed6 \
 		  $(SRC_DIR)/closestBed \
+		  $(SRC_DIR)/clusterBed \
 		  $(SRC_DIR)/complementBed \
 		  $(SRC_DIR)/coverageBed \
 		  $(SRC_DIR)/fastaFromBed \
diff --git a/scripts/makeBashScripts.py b/scripts/makeBashScripts.py
index 410f5ffb..3e9c4128 100644
--- a/scripts/makeBashScripts.py
+++ b/scripts/makeBashScripts.py
@@ -14,7 +14,8 @@ def main():
                  'bed12tobed6': 'bed12ToBed6', 
                  'bedpetobam': 'bedpeToBam', 
                  'bedtobam': 'bedToBam', 
-                 'closest': 'closestBed', 
+                 'closest': 'closestBed',
+                 'cluster': 'clusterBed',                 
                  'complement': 'complementBed',
                  'coverage': 'coverageBed', 
                  'flank': 'flankBed', 
diff --git a/src/bedtools.cpp b/src/bedtools.cpp
index 0bcc5c2a..5e294a9c 100644
--- a/src/bedtools.cpp
+++ b/src/bedtools.cpp
@@ -41,6 +41,7 @@ int bedtobam_main(int argc, char* argv[]);//
 int bedtoigv_main(int argc, char* argv[]);//
 int bedpetobam_main(int argc, char* argv[]);//
 int closest_main(int argc, char* argv[]); //
+int cluster_main(int argc, char* argv[]); //
 int complement_main(int argc, char* argv[]);//
 int coverage_main(int argc, char* argv[]); //
 int fastafrombed_main(int argc, char* argv[]);//
@@ -82,6 +83,7 @@ int main(int argc, char *argv[])
     else if (sub_cmd == "coverage")    return coverage_main(argc-1, argv+1);
     else if (sub_cmd == "genomecov")   return genomecoverage_main(argc-1, argv+1);
     else if (sub_cmd == "merge")       return merge_main(argc-1, argv+1);
+    else if (sub_cmd == "cluster")     return cluster_main(argc-1, argv+1);    
     else if (sub_cmd == "complement")  return complement_main(argc-1, argv+1);
     else if (sub_cmd == "subtract")    return subtract_main(argc-1, argv+1);
     else if (sub_cmd == "slop")        return slop_main(argc-1, argv+1);
@@ -171,6 +173,7 @@ int bedtools_help(void)
     cout  << "    coverage      "  << "Compute the coverage over defined intervals.\n";
     cout  << "    genomecov     "  << "Compute the coverage over an entire genome.\n";
     cout  << "    merge         "  << "Combine overlapping/nearby intervals into a single interval.\n";
+    cout  << "    cluster       "  << "Cluster (but don't merge) overlapping/nearby intervals.\n";
     cout  << "    complement    "  << "Extract intervals _not_ represented by an interval file.\n";
     cout  << "    subtract      "  << "Remove intervals based on overlaps b/w two files.\n";
     cout  << "    slop          "  << "Adjust the size of intervals.\n";
diff --git a/src/clusterBed/Makefile b/src/clusterBed/Makefile
new file mode 100644
index 00000000..59832db5
--- /dev/null
+++ b/src/clusterBed/Makefile
@@ -0,0 +1,43 @@
+UTILITIES_DIR = ../utils/
+OBJ_DIR = ../../obj/
+BIN_DIR = ../../bin/
+
+# -------------------
+# define our includes
+# -------------------
+INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
+           -I$(UTILITIES_DIR)/lineFileUtilities/ \
+           -I$(UTILITIES_DIR)/gzstream/ \
+           -I$(UTILITIES_DIR)/fileType/ \
+           -I$(UTILITIES_DIR)/version/
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES= clusterMain.cpp clusterBed.cpp
+OBJECTS= $(SOURCES:.cpp=.o)
+_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o
+EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+
+
+all: $(BUILT_OBJECTS)
+
+.PHONY: all
+
+$(BUILT_OBJECTS): $(SOURCES)
+	@echo "  * compiling" $(*F).cpp
+	@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
+$(EXT_OBJECTS):
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
+			
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
+
+.PHONY: clean
\ No newline at end of file
diff --git a/src/clusterBed/clusterBed.cpp b/src/clusterBed/clusterBed.cpp
new file mode 100644
index 00000000..94b8a809
--- /dev/null
+++ b/src/clusterBed/clusterBed.cpp
@@ -0,0 +1,115 @@
+/*****************************************************************************
+  clusterBed.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "clusterBed.h"
+
+// = Constructor =
+BedCluster::BedCluster(string &bedFile, 
+                   int  maxDistance, 
+                   bool forceStrand) 
+    :
+    _bedFile(bedFile),
+    _forceStrand(forceStrand),
+    _maxDistance(maxDistance)
+{
+    _bed = new BedFile(bedFile);
+
+    if (_forceStrand == false)
+        ClusterBed();
+    else
+        ClusterBedStranded();
+}
+
+
+// = Destructor =
+BedCluster::~BedCluster(void) 
+{}
+
+
+// = Cluster overlapping or nearby BED entries =
+void BedCluster::ClusterBed() {
+
+    uint32_t cluster_id = 0;
+    BED prev, curr;
+    int end   = -1;
+    
+    _bed->Open();
+    while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals
+        if (_bed->_status != BED_VALID)
+            continue;            
+        // new cluster, no overlap
+        if ( (((int) curr.start - end) > _maxDistance) || 
+             (curr.chrom != prev.chrom)
+           ) 
+        {
+            cluster_id++;
+            end   = curr.end;
+        }
+        prev = curr;
+        _bed->reportBedTab(curr);
+        printf("%d\n", cluster_id);
+    }
+}
+
+
+// = Cluster overlapping BED entries, accounting for strandedness =
+void BedCluster::ClusterBedStranded() {
+
+    // load the "B" bed file into a map so
+    // that we can easily compare "A" to it for overlaps
+    _bed->loadBedFileIntoMapNoBin();
+    
+    // loop through each chromosome and merge their BED entries
+    masterBedMapNoBin::const_iterator m    = _bed->bedMapNoBin.begin();
+    masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
+    for (; m != mEnd; ++m) {
+        
+        // bedList is already sorted by start position.
+        string chrom        = m->first;
+        vector<BED> bedList = m->second;
+    
+        // make a list of the two strands to merge separately.
+        vector<string> strands(2);
+        strands[0] = "+";
+        strands[1] = "-";
+        uint32_t cluster_id = 0;
+        // do two passes, one for each strand.
+        for (unsigned int s = 0; s < strands.size(); s++) {
+            // cluster overlapping features for this chromosome.
+            int end   = -1;
+            BED prev;    
+            vector<BED>::const_iterator bedItr = bedList.begin();
+            vector<BED>::const_iterator bedEnd = bedList.end();
+            for (; bedItr != bedEnd; ++bedItr) {
+                // if forcing strandedness, move on if the hit
+                // is not on the current strand.
+                if (bedItr->strand != strands[s])
+                    continue;
+                
+                // new cluster, no overlap
+                if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) 
+                {
+                    cluster_id++;
+                    end   = bedItr->end;
+                }
+                // same cluster, overlaps
+                else {
+                    if ((int) bedItr->end > end) 
+                        end = bedItr->end;
+                }
+                prev = *bedItr;
+                _bed->reportBedTab(prev);
+                printf("%d\n", cluster_id);
+            }
+        }
+    }
+}
diff --git a/src/clusterBed/clusterBed.h b/src/clusterBed/clusterBed.h
new file mode 100644
index 00000000..18eab172
--- /dev/null
+++ b/src/clusterBed/clusterBed.h
@@ -0,0 +1,46 @@
+/*****************************************************************************
+  clusterBed.h
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "bedFile.h"
+#include <vector>
+#include <algorithm>
+#include <numeric>
+#include <iostream>
+#include <fstream>
+#include <limits.h>
+#include <stdlib.h>
+
+using namespace std;
+
+
+//************************************************
+// Class methods and elements
+//************************************************
+class BedCluster {
+
+public:
+
+  // constructor
+  BedCluster(string &bedFile, int maxDistance, bool forceStrand);
+  // destructor
+  ~BedCluster(void);
+  // find clusters
+  void ClusterBed();
+  // find clusters based on strand
+  void ClusterBedStranded();
+
+private:
+    string _bedFile;
+    bool   _forceStrand;
+    int    _maxDistance;
+    // instance of a bed file class.
+    BedFile *_bed;    
+};
diff --git a/src/clusterBed/clusterMain.cpp b/src/clusterBed/clusterMain.cpp
new file mode 100644
index 00000000..243be297
--- /dev/null
+++ b/src/clusterBed/clusterMain.cpp
@@ -0,0 +1,121 @@
+/*****************************************************************************
+  clusterMain.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "clusterBed.h"
+#include "version.h"
+
+using namespace std;
+
+// define our program name
+#define PROGRAM_NAME "bedtools cluster"
+
+
+// define our parameter checking macro
+#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
+
+// function declarations
+void cluster_help(void);
+
+int cluster_main(int argc, char* argv[]) {
+
+    // our configuration variables
+    bool showHelp = false;
+
+    // input files
+    string bedFile  = "stdin";
+    int maxDistance = 0;
+
+    // input arguments
+    bool haveBed         = true;
+    bool haveMaxDistance = false;
+    bool forceStrand     = false;
+
+    for(int i = 1; i < argc; i++) {
+        int parameterLength = (int)strlen(argv[i]);
+
+        if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
+        (PARAMETER_CHECK("--help", 5, parameterLength))) {
+            showHelp = true;
+        }
+    }
+
+    if(showHelp) cluster_help();
+
+    // do some parsing (all of these parameters require 2 strings)
+    for(int i = 1; i < argc; i++) {
+
+        int parameterLength = (int)strlen(argv[i]);
+
+        if(PARAMETER_CHECK("-i", 2, parameterLength)) {
+            if ((i+1) < argc) {
+                bedFile = argv[i + 1];
+                i++;
+            }
+        }
+        else if(PARAMETER_CHECK("-d", 2, parameterLength)) {
+            if ((i+1) < argc) {
+                haveMaxDistance = true;
+                maxDistance = atoi(argv[i + 1]);
+                i++;
+            }
+        }
+        else if (PARAMETER_CHECK("-s", 2, parameterLength)) {
+            forceStrand = true;
+        }
+        else {
+            cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
+            showHelp = true;
+        }
+    }
+
+    // make sure we have both input files
+    if (!haveBed) {
+        cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl;
+        showHelp = true;
+    }
+
+    if (!showHelp) {
+        BedCluster *bc = new BedCluster(bedFile, maxDistance, forceStrand);
+        delete bc;
+    }
+    else {
+        cluster_help();
+    }
+    return 0;
+}
+
+void cluster_help(void) {
+    
+    cerr << "\nTool:    bedtools cluster" << endl;
+    cerr << "Version: " << VERSION << "\n";        
+    cerr << "Summary: Clusters overlapping/nearby BED/GFF/VCF intervals." << endl << endl;
+
+    cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl;
+
+    cerr << "Options: " << endl;
+    cerr << "\t-s\t"                     << "Force strandedness.  That is, only merge features" << endl;
+    cerr                                 << "\t\tthat are the same strand." << endl;
+    cerr                                 << "\t\t- By default, merging is done without respect to strand." << endl << endl;
+
+    cerr << "\t-d\t"                     << "Maximum distance between features allowed for features" << endl;
+    cerr                                 << "\t\tto be merged." << endl;
+    cerr                                 << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl;
+    cerr                                 << "\t\t- (INTEGER)" << endl << endl;
+    
+    cerr << "Notes: " << endl;
+    cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl;
+    cerr << "\t    will in BED format with zero-based starts" << endl << endl;
+
+
+    // end the program here
+    exit(1);
+
+}
diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp
index 393eefea..6521e4a7 100644
--- a/src/mergeBed/mergeBed.cpp
+++ b/src/mergeBed/mergeBed.cpp
@@ -241,54 +241,47 @@ void BedMerge::ReportStranded(string chrom, int start, int end,
 // = Merge overlapping BED entries into a single entry =
 // =====================================================
 void BedMerge::MergeBed() {
-
-    // load the "B" bed file into a map so
-    // that we can easily compare "A" to it for overlaps
-    _bed->loadBedFileIntoMapNoBin();
-
-    // loop through each chromosome and merge their BED entries
-    masterBedMapNoBin::const_iterator m    = _bed->bedMapNoBin.begin();
-    masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
-    for (; m != mEnd; ++m) {
-
-        // bedList is already sorted by start position.
-        string chrom        = m->first;
-        vector<BED> bedList = m->second;
-        int mergeCount = 1;
-        vector<string> names;
-        vector<string> scores;
-        
-        // merge overlapping features for this chromosome.
-        int start = -1;
-        int end   = -1;
-        vector<BED>::const_iterator bedItr = bedList.begin();
-        vector<BED>::const_iterator bedEnd = bedList.end();
-        for (; bedItr != bedEnd; ++bedItr) {
-            // new block, no overlap
-            if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
-                if (start >= 0) {
-                    Report(chrom, start, end, names, scores, mergeCount);
-                    // reset
-                    mergeCount = 1;
-                    names.clear();
-                    scores.clear();
-                }
-                start = bedItr->start;
-                end   = bedItr->end;
-                if (!bedItr->name.empty())  names.push_back(bedItr->name);
-                if (!bedItr->score.empty()) scores.push_back(bedItr->score);
-            }
-            // same block, overlaps
-            else {
-                if ((int) bedItr-> end > end) end = bedItr->end;
-                mergeCount++;
-                if (!bedItr->name.empty())  names.push_back(bedItr->name);
-                if (!bedItr->score.empty()) scores.push_back(bedItr->score);
+    int mergeCount = 1;
+    vector<string> names;
+    vector<string> scores;
+    int start = -1;
+    int end   = -1;
+    BED prev, curr;
+    
+    _bed->Open();
+    while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals
+        if (_bed->_status != BED_VALID)
+            continue;            
+        // new block, no overlap
+        if ( (((int) curr.start - end) > _maxDistance) || (curr.chrom != prev.chrom)) {
+            if (start >= 0) {
+                Report(prev.chrom, start, end, names, scores, mergeCount);
+                // reset
+                mergeCount = 1;
+                names.clear();
+                scores.clear();
             }
+            start = curr.start;
+            end   = curr.end;
+            if (!curr.name.empty())
+                names.push_back(curr.name);
+            if (!curr.score.empty())
+            scores.push_back(curr.score);
         }
-        if (start >= 0) {
-            Report(chrom, start, end, names, scores, mergeCount);
+        // same block, overlaps
+        else {
+            if ((int) curr.end > end) 
+                end = curr.end;
+            if (!curr.name.empty())
+                names.push_back(curr.name);
+            if (!curr.score.empty())
+                scores.push_back(curr.score);
+            mergeCount++;
         }
+        prev = curr;
+    }
+    if (start >= 0) {
+        Report(prev.chrom, start, end, names, scores, mergeCount);
     }
 }
 
diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp
index 221bb220..d3eaa988 100644
--- a/src/utils/bedFile/bedFile.cpp
+++ b/src/utils/bedFile/bedFile.cpp
@@ -242,7 +242,11 @@ bool BedFile::GetNextBed(BED &bed, bool forceSorted) {
                     _prev_start = bed.start;
                 }
                 else if (forceSorted) {
-                    cerr << "ERROR: input file: (" << bedFile << ") is not sorted by chrom then start" << endl;
+                    cerr << "ERROR: input file: (" << bedFile 
+                         << ") is not sorted by chrom then start." << endl
+                         << "       The start coordinate at line " << _lineNum 
+                         << " is less than the start at line " << _lineNum-1 
+                         << endl;
                     exit(1);
                 }
             }
-- 
GitLab