From 017f6e61d5e692ceee68026201726b5338142357 Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Wed, 2 Nov 2011 13:01:50 -0400
Subject: [PATCH] Added new multiIntersectBed tool.  New release to follow.

---
 Makefile                                      |   1 +
 src/multiIntersectBed/Makefile                |  49 +++
 src/multiIntersectBed/intervalItem.h          |  64 ++++
 src/multiIntersectBed/multiIntersectBed.cpp   | 256 +++++++++++++++
 src/multiIntersectBed/multiIntersectBed.h     | 123 ++++++++
 .../multiIntersectBedMain.cpp                 | 294 ++++++++++++++++++
 src/utils/bedFile/bedFile.cpp                 |  52 +++-
 src/utils/bedFile/bedFile.h                   |   7 +
 8 files changed, 845 insertions(+), 1 deletion(-)
 create mode 100644 src/multiIntersectBed/Makefile
 create mode 100644 src/multiIntersectBed/intervalItem.h
 create mode 100644 src/multiIntersectBed/multiIntersectBed.cpp
 create mode 100644 src/multiIntersectBed/multiIntersectBed.h
 create mode 100644 src/multiIntersectBed/multiIntersectBedMain.cpp

diff --git a/Makefile b/Makefile
index 2209af58..4e1b2af9 100644
--- a/Makefile
+++ b/Makefile
@@ -29,6 +29,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
 		  $(SRC_DIR)/maskFastaFromBed \
 		  $(SRC_DIR)/mergeBed \
 		  $(SRC_DIR)/multiBamCov \
+		  $(SRC_DIR)/multiIntersectBed \
 		  $(SRC_DIR)/nucBed \
 		  $(SRC_DIR)/overlap \
 		  $(SRC_DIR)/pairToBed \
diff --git a/src/multiIntersectBed/Makefile b/src/multiIntersectBed/Makefile
new file mode 100644
index 00000000..a076f3b4
--- /dev/null
+++ b/src/multiIntersectBed/Makefile
@@ -0,0 +1,49 @@
+UTILITIES_DIR = ../utils/
+OBJ_DIR = ../../obj/
+BIN_DIR = ../../bin/
+
+# -------------------
+# define our includes
+# -------------------
+INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
+	   -I$(UTILITIES_DIR)/lineFileUtilities/ \
+	   -I$(UTILITIES_DIR)/genomeFile/ \
+	   -I$(UTILITIES_DIR)/version/ \
+	   -I$(UTILITIES_DIR)/gzstream/ \
+	   -I$(UTILITIES_DIR)/fileType/ \
+	   -I$(UTILITIES_DIR)/BamTools/include
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES= multiIntersectBed.cpp multiIntersectBedMain.cpp
+OBJECTS= $(SOURCES:.cpp=.o)
+_EXT_OBJECTS=bedFile.o genomeFile.o lineFileUtilities.o gzstream.o fileType.o
+EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+PROGRAM= multiIntersectBed
+
+all: $(PROGRAM)
+
+.PHONY: all
+
+$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS)
+	@echo "  * linking $(PROGRAM)"
+	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS)
+
+$(BUILT_OBJECTS): $(SOURCES)
+	@echo "  * compiling" $(*F).cpp
+	@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
+$(EXT_OBJECTS):   # sub-makes for the shared utility objects; this tool links bedFile.o, not bedGraphFile.o
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/genomeFile/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
+
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
+
+.PHONY: clean
diff --git a/src/multiIntersectBed/intervalItem.h b/src/multiIntersectBed/intervalItem.h
new file mode 100644
index 00000000..a63e0ce4
--- /dev/null
+++ b/src/multiIntersectBed/intervalItem.h
@@ -0,0 +1,64 @@
+/*****************************************************************************
+  intervalItem.h
+
+  (c) 2010 - Assaf Gordon
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#ifndef INTERVALITEM_H
+#define INTERVALITEM_H
+
+#include <string>
+#include <queue>
+
+enum COORDINATE_TYPE {
+    START,   // the coordinate is the start position of an interval
+    END      // the coordinate is the end position of an interval
+};
+
+/*
+   One endpoint of an interval, as stored in the priority queue.
+
+   An IntervalItem marks either the START or the END coordinate of an interval.
+ */
+class IntervalItem
+{
+// NOTE(review): CHRPOS is not declared in this header; it has to be visible before
+// this file is included (multiIntersectBed.h includes bedFile.h first) - confirm its origin.
+public:
+    int source_index;           // index of the input file this endpoint came from
+    COORDINATE_TYPE coord_type; // is this the start or the end position?
+    CHRPOS coord;               // the coordinate value the queue orders on
+
+    IntervalItem () :           // placeholder item: no source (-1), START, coordinate 0
+       source_index(-1),
+       coord_type(START),
+       coord(0)
+    {}
+    // Builds an endpoint of kind _type at _coord for input file _index.
+    IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord) :
+        source_index(_index),
+        coord_type(_type),
+        coord(_coord)
+    {}
+    // Member-wise copy.
+    IntervalItem(const IntervalItem &other) :
+        source_index(other.source_index),
+        coord_type(other.coord_type),
+        coord(other.coord)
+    {}
+    // Inverted on purpose so that std::priority_queue::top() yields the SMALLEST coordinate.
+    bool operator< ( const IntervalItem& other ) const
+    {
+        return this->coord > other.coord;
+    }
+};
+
+// our priority queue
+typedef std::priority_queue<IntervalItem> INTERVALS_PRIORITY_QUEUE;
+
+#endif
diff --git a/src/multiIntersectBed/multiIntersectBed.cpp b/src/multiIntersectBed/multiIntersectBed.cpp
new file mode 100644
index 00000000..f024df60
--- /dev/null
+++ b/src/multiIntersectBed/multiIntersectBed.cpp
@@ -0,0 +1,256 @@
+/*****************************************************************************
+  multiIntersectBed.cpp
+
+  (c) 2010 - Assaf Gordon, CSHL
+           - Aaron Quinlan, UVA
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include <cassert>
+#include <cstring>
+#include <cstdlib>
+#include <iostream>
+#include <algorithm>
+
+#include "bedFile.h"
+#include "multiIntersectBed.h"
+
+using namespace std;
+
+
+MultiIntersectBed::MultiIntersectBed(std::ostream& _output,
+                                     const vector<string>& _filenames,
+                                     const vector<string>& _titles,
+                                     bool _print_empty_regions,
+                                     const std::string& _genome_size_filename,
+                                     const std::string& _no_coverage_value)
+    : filenames(_filenames)
+    , titles(_titles)
+    , output(_output)
+    , current_non_zero_inputs(0)
+    , print_empty_regions(_print_empty_regions)
+    , genome_sizes(NULL)
+    , no_coverage_value(_no_coverage_value)
+{
+    // The genome table is only needed (and only loaded) when empty regions are reported.
+    if (!print_empty_regions)
+        return;
+    assert(!_genome_size_filename.empty());   // a genome file name is mandatory in this mode
+    genome_sizes = new GenomeFile(_genome_size_filename);
+}
+
+
+// Releases the input files, then the optional genome-size table.
+MultiIntersectBed::~MultiIntersectBed() {
+    CloseFiles();
+    // deleting a NULL pointer is a no-op, so no guard is required
+    delete genome_sizes;
+    genome_sizes = NULL;
+}
+
+
+// Sweeps all input files chromosome by chromosome and prints one line for each
+// stretch between consecutive interval endpoints (see PrintCoverage).
+// Inputs that contain no intervals at all produce no output; the loop tests
+// AllFilesDone() up front so ConsumeNextCoordinate never sees an empty queue.
+void MultiIntersectBed::MultiIntersect() {
+    OpenFiles();
+    // Add the first interval from each file
+    for(size_t i = 0;i < input_files.size(); ++i)
+        LoadNextItem(i);
+    // Chromosome loop - once per chromosome; skipped entirely when no file has data
+    while (!AllFilesDone()) {
+        // Find the first chromosome to use
+        current_chrom = DetermineNextChrom();
+        // Populate the queue with initial values from all files
+        // (if they belong to the correct chromosome)
+        for(size_t i = 0; i < input_files.size(); ++i)
+            AddInterval(i);
+
+        CHRPOS current_start = ConsumeNextCoordinate();
+
+        // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage
+        if (print_empty_regions && current_start > 0)
+            PrintEmptyCoverage(0,current_start);
+
+        // Intervals loop - until all intervals (of current chromosome) from all files are used.
+        // The queue is tested BEFORE top(): a zero-length interval can leave it empty here.
+        while (!queue.empty()) {
+            CHRPOS current_end = queue.top().coord;
+            PrintCoverage(current_start, current_end);
+            current_start = ConsumeNextCoordinate();
+        }
+        // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome
+        // print a dummy empty coverage
+        if (print_empty_regions) {
+            CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
+            if (current_start < chrom_size)
+                PrintEmptyCoverage(current_start, chrom_size);
+        }
+    }
+}
+
+
+// Applies every queued endpoint at the smallest coordinate and returns that coordinate.
+CHRPOS MultiIntersectBed::ConsumeNextCoordinate() {
+    assert(!queue.empty());
+    const CHRPOS new_position = queue.top().coord;
+    do {
+        const IntervalItem item = queue.top();
+        // pop first: an END item makes UpdateInformation push, which may change top()
+        queue.pop();
+        UpdateInformation(item);
+    } while (!queue.empty() && queue.top().coord == new_position);
+    return new_position;
+}
+
+
+// Applies one endpoint to the per-file coverage flags and the non-zero counter.
+void MultiIntersectBed::UpdateInformation(const IntervalItem &item) {
+    const int file_idx = item.source_index;
+
+    if (item.coord_type == START) {
+        // an interval of this file opens here
+        current_depth[file_idx] = 1;
+        current_non_zero_inputs++;
+        return;
+    }
+    if (item.coord_type == END) {
+        // the interval closes: queue the file's next interval, then clear its flag
+        AddInterval(file_idx);
+        current_depth[file_idx] = 0;
+        current_non_zero_inputs--;
+        return;
+    }
+    // neither START nor END: not expected to happen
+    assert(0);
+}
+
+
+// Queues both endpoints of file 'index''s pending interval when it lies on the
+// current chromosome, then advances that file to its next interval.
+void MultiIntersectBed::AddInterval(int index) {
+    assert(static_cast<unsigned int>(index) < input_files.size());
+
+    const BED &pending = current_item[index];
+
+    // Nothing to do when the file is exhausted (empty chrom) or when its next
+    // interval belongs to a different chromosome.
+    const bool exhausted   = pending.chrom.empty();
+    const bool other_chrom = (pending.chrom != current_chrom);
+    if (exhausted || other_chrom)
+        return;
+
+    // START goes in before END
+    queue.push(IntervalItem(index, START, pending.start));
+    queue.push(IntervalItem(index, END,   pending.end));
+
+    // 'pending' refers into current_item and is overwritten here; do not use it afterwards
+    LoadNextItem(index);
+}
+
+
+void MultiIntersectBed::PrintHeader() {   // header row: chrom/start/end, then a tab + title per entry of 'titles'
+    output << "chrom\tstart\tend";
+    for (vector<string>::const_iterator t = titles.begin(); t != titles.end(); ++t)
+        output << "\t" << *t;
+    output << endl;
+}
+
+
+// Prints chrom/start/end, then per file its depth, or the -filler text when the file has none.
+void MultiIntersectBed::PrintCoverage(CHRPOS start, CHRPOS end) {
+    // nothing covers this region and empty regions were not requested
+    if ( current_non_zero_inputs == 0 && ! print_empty_regions )
+        return ;
+    output << current_chrom << "\t" << start << "\t" << end;
+    for (size_t i=0;i<current_depth.size();++i) {
+        output << "\t";
+        if (current_depth[i] == 0) output << no_coverage_value;   // "0" unless -filler was given
+        else                       output << current_depth[i];
+    }
+    output << endl;
+}
+
+
+// Prints chrom/start/end followed by the -filler text (no_coverage_value) for every file.
+void MultiIntersectBed::PrintEmptyCoverage(CHRPOS start, CHRPOS end) {
+    output << current_chrom << "\t"
+        << start << "\t"
+        << end;
+    // a hard-coded "0" here would silently ignore the -filler option
+    for (size_t i=0;i<current_depth.size();++i)
+        output << "\t" << no_coverage_value;
+    output << endl;
+}
+
+
+// Fetches the next merged interval of file 'index' into current_item[index].
+// An empty chrom left in that slot means the file returned nothing more.
+void MultiIntersectBed::LoadNextItem(int index) {
+    assert(static_cast<unsigned int>(index) < input_files.size());
+
+    //
+    // TO DO: Do the merging on the fly.  How best to do this?
+    //
+    // IDEA: Implement a Merge class with GetNextMerge element.
+    //
+    BED &slot = current_item[index];
+    slot.chrom = "";
+
+    BED next_merged;
+    int line_no = 0;   // restarted from 0 on every call
+    BedFile *source = input_files[index];
+    const bool got_one = source->GetNextMergedBed(next_merged, line_no);
+    if (got_one)
+        slot = next_merged;
+}
+
+
+bool MultiIntersectBed::AllFilesDone() {   // true once every file's pending slot has an empty chrom
+    size_t i = 0;
+    while (i < current_item.size() && current_item[i].chrom.empty())
+        ++i;
+    return i == current_item.size();
+}
+
+
+// Returns the lexicographically smallest chrom among the files' pending
+// intervals, or an empty string when every file is exhausted.
+string MultiIntersectBed::DetermineNextChrom() {
+    string smallest;
+    for (size_t i=0;i<current_item.size();++i) {
+        const string &candidate = current_item[i].chrom;
+        // exhausted files carry an empty chrom and never compete
+        if (candidate.empty())
+            continue;
+        if (smallest.empty() || candidate < smallest)
+            smallest = candidate;
+    }
+    return smallest;
+}
+
+
+void MultiIntersectBed::OpenFiles() {   // one opened BedFile and one depth slot per input name
+    for (vector<string>::iterator name = filenames.begin(); name != filenames.end(); ++name) {
+        BedFile *opened = new BedFile(*name);
+        opened->Open();
+        input_files.push_back(opened);
+        current_depth.push_back(0);
+    }
+    current_item.resize(filenames.size());
+}
+
+
+// Destroys every BedFile held in input_files and empties the list.
+void MultiIntersectBed::CloseFiles() {
+    for (vector<BedFile*>::iterator f = input_files.begin(); f != input_files.end(); ++f) {
+        delete *f;
+        *f = NULL;
+    }
+    input_files.clear();
+}
diff --git a/src/multiIntersectBed/multiIntersectBed.h b/src/multiIntersectBed/multiIntersectBed.h
new file mode 100644
index 00000000..5c3112d5
--- /dev/null
+++ b/src/multiIntersectBed/multiIntersectBed.h
@@ -0,0 +1,123 @@
+/*****************************************************************************
+  multiIntersectBed.h
+
+  (c) 2010 - Aaron Quinlan, UVA
+           - Assaf Gordon, CSHL
+  Quinlan Laboratory
+  Department of Public Health Sciences
+  Center for Public Health Genomics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#ifndef MULTIINTERSECTBED_H
+#define MULTIINTERSECTBED_H
+
+#include <vector>
+#include <string>
+#include "bedFile.h"
+#include "genomeFile.h"
+#include "intervalItem.h"
+
+class MultiIntersectBed
+{
+private:
+    // input file names and column titles, copied from the constructor arguments
+    vector<string>  filenames;
+    vector<string>  titles;
+    // per-file state; all three vectors are indexed by input file
+    vector<BedFile*>   input_files;    // created in OpenFiles(), deleted in CloseFiles()
+    vector<int>        current_depth;  // set to 1 on a START item and back to 0 on an END item
+    vector<BED>        current_item;   // next interval not yet queued; empty chrom = nothing pending
+
+    std::ostream    &output;           // destination of every printed line
+
+    INTERVALS_PRIORITY_QUEUE queue;                    // pending START/END coordinates, smallest on top
+    std::string              current_chrom;            // chromosome currently being processed
+    int                      current_non_zero_inputs;  // raised on each START item, lowered on each END item
+    bool                     print_empty_regions;      // also report regions that no file covers
+
+    GenomeFile* genome_sizes;          // allocated only when print_empty_regions is set, otherwise NULL
+
+    std::string no_coverage_value;     // text intended for "no coverage" cells (constructor argument)
+
+public:
+    MultiIntersectBed(std::ostream& _output,
+            const vector<string>& _filenames,
+            const vector<string>& _titles,
+            bool _print_empty_regions,
+            const std::string& _genomeFileName,
+            const std::string& _no_coverage_value);
+
+    virtual ~MultiIntersectBed();
+
+    // Combines all interval files
+    void MultiIntersect();
+
+    // Print the header line: chrom/start/end + the title of each input file.
+    void PrintHeader();
+
+
+private:
+
+    // Open all input files, initialize the "current_XXX" vectors
+    void OpenFiles();
+
+    // Close the input files.
+    void CloseFiles();
+
+    /*
+       Add the pending interval of input file 'index' into the queue.
+       It will only be added if it belongs to the current chromosome.
+
+       If the interval was added (=consumed), the next interval will be read from the file
+       using 'LoadNextItem'
+     */
+    void AddInterval(int index);
+
+    /*
+       Loads the next (merged) interval from input file 'index'.
+       Stores it in the 'current_item' vector.
+     */
+    void LoadNextItem(int index);
+
+    /*
+       Scans the 'current_item' vector,
+       find the 'first' chromosome to use (different input files can start with different chromosomes).
+     */
+    std::string DetermineNextChrom();
+
+    /*
+       Returns 'true' if ALL intervals from ALL input files were used
+    */
+    bool        AllFilesDone();
+
+    /*
+       Extract the next coordinate from the queue, and updates the current coverage information.
+       If multiple intervals share the same coordinate value, all of them are handled.
+       If an END coordinate is consumed, the next interval (from the corresponding file) is read.
+     */
+    CHRPOS ConsumeNextCoordinate();
+
+    /*
+       Updates the coverage information based on the given item.
+       Item can be a START coordinate or an END coordinate.
+     */
+    void UpdateInformation(const IntervalItem &item);
+
+    /*
+       prints chrom/start/end and the current depth coverage values of all the files.
+     */
+    void PrintCoverage(CHRPOS start, CHRPOS end);
+
+    /*
+       prints chrom/start/end with every file shown as having no coverage.
+     */
+    void PrintEmptyCoverage(CHRPOS start, CHRPOS end);
+
+    void DebugPrintQueue();   // NOTE(review): no definition appears in multiIntersectBed.cpp - confirm before calling
+};
+
+
+#endif
diff --git a/src/multiIntersectBed/multiIntersectBedMain.cpp b/src/multiIntersectBed/multiIntersectBedMain.cpp
new file mode 100644
index 00000000..0970f23d
--- /dev/null
+++ b/src/multiIntersectBed/multiIntersectBedMain.cpp
@@ -0,0 +1,294 @@
+/*****************************************************************************
+  multiIntersectBedMain.cpp
+
+  (c) 2010 - Assaf Gordon, CSHL
+           - Aaron Quinlan, UVA
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include <climits>
+#include <cstring>
+#include <cstdlib>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <getopt.h>
+#include <libgen.h> //for basename()
+#include "version.h"
+
+#include "genomeFile.h"
+#include "multiIntersectBed.h"   // must match the header's file name exactly (case-sensitive filesystems)
+
+using namespace std;
+
+// define our program name
+#define PROGRAM_NAME "multiIntersectBed"
+
+// true when argv[i] equals `param` exactly; expands in place, so `argv` and `i` must be in scope and paramLen must be strlen(param)
+#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
+
+//STLized version of basename()
+// (because POSIX basename() modifies the input string pointer)
+// Additionally: removes any extension the basename might have.
+std::string stl_basename(const std::string& path);
+
+// function declarations
+void ShowHelp(void);
+void ShowExamples(void);
+
+
+// Entry point: parses the command line, validates it, then streams the
+// multi-file intersection to stdout. Any usage problem exits with status 1.
+// Options: -i FILE..., -names NAME..., -g FILE, -filler TEXT, -header,
+//          -empty, -examples, -h / --help.
+int main(int argc, char* argv[])
+{
+    bool haveTitles        = false;
+    bool printHeader       = false;
+    bool printEmptyRegions = false;
+    // no arguments at all: show the help text and stop
+    bool showHelp          = (argc <= 1);
+    string genomeFile;
+    string noCoverageValue = "0";
+    vector<string> inputFiles;
+    vector<string> inputTitles;
+
+    // first pass: was help requested anywhere on the command line?
+    for(int i = 1; i < argc; i++) {
+        int parameterLength = (int)strlen(argv[i]);
+
+        // "--help" is 6 characters long; a declared length of 5 can never match it
+        if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
+        (PARAMETER_CHECK("--help", 6, parameterLength))) {
+            showHelp = true;
+        }
+    }
+
+    if(showHelp == true) {
+        ShowHelp();
+        exit(1);
+    }
+
+    // second pass: collect the options (unrecognised arguments are ignored)
+    for(int i = 1; i < argc; i++) {
+
+        int parameterLength = (int)strlen(argv[i]);
+
+        if(PARAMETER_CHECK("-i", 2, parameterLength)) {
+            // every following argument, up to the next one starting with '-', is an input file
+            while ((i+1) < argc && argv[i+1][0] != '-') {
+                i++;
+                inputFiles.push_back(argv[i]);
+            }
+        }
+        else if(PARAMETER_CHECK("-names", 6, parameterLength)) {
+            if ((i+1) < argc) {
+                haveTitles = true;
+                // titles are collected the same way as the files
+                while ((i+1) < argc && argv[i+1][0] != '-') {
+                    i++;
+                    inputTitles.push_back(argv[i]);
+                }
+            }
+        }
+        else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
+            // genome sizes file
+            if ((i+1) < argc) {
+                genomeFile = argv[i + 1];
+                i++;
+            }
+        }
+        else if(PARAMETER_CHECK("-filler", 7, parameterLength)) {
+            // text printed for files without coverage
+            if ((i+1) < argc) {
+                noCoverageValue = argv[i + 1];
+                i++;
+            }
+        }
+        else if(PARAMETER_CHECK("-header", 7, parameterLength)) {
+            printHeader = true;
+        }
+        else if(PARAMETER_CHECK("-empty", 6, parameterLength)) {
+            printEmptyRegions = true;
+        }
+        else if(PARAMETER_CHECK("-examples", 9, parameterLength)) {
+            ShowHelp();
+            ShowExamples();
+            exit(1);
+        }
+    }
+
+    //Sanity checks
+    if (inputFiles.empty() == true) {
+        cerr << "Error: missing BedGraph file names (-i) to combine." << endl;
+        exit(1);
+    }
+    // a single file leaves nothing to compare against
+    if (inputFiles.size() == 1) {
+        cerr << "Error: Only a single BedGraph file was specified. Nothing to combine, exiting." << endl;
+        exit(1);
+    }
+    // -empty needs the chromosome sizes
+    if (printEmptyRegions && (genomeFile.empty() == true)) {
+        cerr << "Error: when using -empty, the genome sizes file (-g) must be specified using '-g FILE'." << endl;
+        exit(1);
+    }
+    // explicit titles must pair up one-to-one with the files
+    if ((haveTitles == true) && (inputFiles.size() != inputTitles.size())) {
+        cerr << "Error: The number of file titles (-names) does not match the number of files (-i)." << endl;
+        exit(1);
+    }
+
+    // Without -names, title each column with its file's basename so that the
+    // -header line carries as many columns as the data rows.
+    if (haveTitles == false) {
+        for (size_t f = 0; f < inputFiles.size(); ++f)
+            inputTitles.push_back(stl_basename(inputFiles[f]));
+    }
+
+    // run: optional header line first, then the sweep over all files
+    MultiIntersectBed mbi(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue);
+    if (printHeader)
+        mbi.PrintHeader();
+    mbi.MultiIntersect();
+    return 0;
+}
+
+void ShowHelp(void) {
+    // Usage text, written to stderr as a single chained statement.
+    cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl
+
+         << "Authors: Assaf Gordon, CSHL" << endl
+         << "         Aaron Quinlan (aaronquinlan@gmail.com)" << endl << endl
+
+         << "Summary: Combines multiple BedGraph files into a single file," << endl
+         << "\t allowing coverage comparisons between them." << endl << endl
+
+         << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -i FILE1 FILE2 .. FILEn" << endl
+         << "\t Assumes that each BedGraph file is sorted by chrom/start " << endl
+         << "\t and that the intervals in each are non-overlapping." << endl << endl
+
+         << "Options: " << endl
+
+         << "\t-header\t\t"     << "Print a header line." << endl
+                                << "\t\t\t(chrom/start/end + names of each file)." << endl << endl
+
+         << "\t-names\t\t"      << "A list of names (one / file) to describe each file in -i." << endl
+                                << "\t\t\tThese names will be printed in the header line." << endl << endl
+
+         << "\t-g\t\t"          << "Use genome file to calculate empty regions." << endl
+                                << "\t\t\t- STRING." << endl << endl
+
+         << "\t-empty\t\t"      << "Report empty regions (i.e., start/end intervals w/o" << endl
+                                << "\t\t\tvalues in all files)." << endl
+                                << "\t\t\t- Requires the '-g FILE' parameter.\n" << endl
+
+         << "\t-filler TEXT\t"  << "Use TEXT when representing intervals having no value." << endl
+                                << "\t\t\t- Default is '0', but you can use 'N/A' or any other text." << endl << endl
+
+         << "\t-examples\t"     << "Show detailed usage examples." << endl << endl;
+}
+
+
+
+void ShowExamples()   // writes the usage examples to stderr as one concatenated string literal
+{   // NOTE(review): the examples invoke `unionBedGraphs` and show score columns, while PrintCoverage here emits 0/1 flags - confirm and refresh
+    cerr << "Example usage:\n\n"  \
+"== Input files: ==\n" \
+"\n" \
+" $ cat 1.bg\n" \
+" chr1  1000    1500    10\n" \
+" chr1  2000    2100    20\n" \
+"\n" \
+" $ cat 2.bg\n" \
+" chr1  900 1600    60\n" \
+" chr1  1700    2050    50\n" \
+"\n" \
+" $ cat 3.bg\n" \
+" chr1  1980    2070    80\n" \
+" chr1  2090    2100    20\n" \
+"\n" \
+" $ cat sizes.txt\n" \
+" chr1  5000\n" \
+"\n" \
+"== Union/combine the files: ==\n" \
+"\n" \
+" $ unionBedGraphs -i 1.bg 2.bg 3.bg\n" \
+" chr1  900 1000    0   60  0\n" \
+" chr1  1000    1500    10  60  0\n" \
+" chr1  1500    1600    0   60  0\n" \
+" chr1  1700    1980    0   50  0\n" \
+" chr1  1980    2000    0   50  80\n" \
+" chr1  2000    2050    20  50  80\n" \
+" chr1  2050    2070    20  0   80\n" \
+" chr1  2070    2090    20  0   0\n" \
+" chr1  2090    2100    20  0   20\n" \
+"\n" \
+"== Union/combine the files, with a header line (titles are the file names): ==\n" \
+"\n" \
+" $ unionBedGraphs -header -i 1.bg 2.bg 3.bg\n" \
+" chrom start   end 1   2   3\n" \
+" chr1  900 1000    0   60  0\n" \
+" chr1  1000    1500    10  60  0\n" \
+" chr1  1500    1600    0   60  0\n" \
+" chr1  1700    1980    0   50  0\n" \
+" chr1  1980    2000    0   50  80\n" \
+" chr1  2000    2050    20  50  80\n" \
+" chr1  2050    2070    20  0   80\n" \
+" chr1  2070    2090    20  0   0\n" \
+" chr1  2090    2100    20  0   20\n" \
+"\n" \
+"== Union/combine the files, with a header line and custom names: ==\n" \
+"\n" \
+" $ unionBedGraphs -header -i 1.bg 2.bg 3.bg -names WT-1 WT-2 KO-1\n" \
+" chrom start   end WT-1    WT-2    KO-1\n" \
+" chr1  900 1000    0   60  0\n" \
+" chr1  1000    1500    10  60  0\n" \
+" chr1  1500    1600    0   60  0\n" \
+" chr1  1700    1980    0   50  0\n" \
+" chr1  1980    2000    0   50  80\n" \
+" chr1  2000    2050    20  50  80\n" \
+" chr1  2050    2070    20  0   80\n" \
+" chr1  2070    2090    20  0   0\n" \
+" chr1  2090    2100    20  0   20\n" \
+"\n" \
+"== Union/combine, showing empty regions (note, requires -g): ==\n" \
+"\n" \
+" $ unionBedGraphs -header -empty -g sizes.TXT -i 1.bg 2.bg 3.bg\n" \
+" chrom start   end 1   2   3\n" \
+" chr1  0   900 0   0   0\n" \
+" chr1  900 1000    0   60  0\n" \
+" chr1  1000    1500    10  60  0\n" \
+" chr1  1500    1600    0   60  0\n" \
+" chr1  1600    1700    0   0   0\n" \
+" chr1  1700    1980    0   50  0\n" \
+" chr1  1980    2000    0   50  80\n" \
+" chr1  2000    2050    20  50  80\n" \
+" chr1  2050    2070    20  0   80\n" \
+" chr1  2070    2090    20  0   0\n" \
+" chr1  2090    2100    20  0   20\n" \
+" chr1  2100    5000    0   0   0\n" \
+"\n" \
+;
+}
+
+// Returns the final path component of 'path' with everything from its last '.' removed.
+// basename() may modify its argument, so it is handed a private copy.
+std::string stl_basename(const std::string& path)
+{
+    char* scratch = strdup(path.c_str());
+    const string leaf(basename(scratch));
+    free(scratch);
+
+    const size_t dot = leaf.find_last_of('.');
+    if (dot == string::npos)
+        return leaf;
+
+    // keep everything before the last dot
+    return leaf.substr(0, dot);
+}
+
diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp
index e225541b..ec51e398 100644
--- a/src/utils/bedFile/bedFile.cpp
+++ b/src/utils/bedFile/bedFile.cpp
@@ -119,7 +119,10 @@ BedFile::BedFile(string &bedFile)
 : bedFile(bedFile),
   _isGff(false),
   _isVcf(false),
-  _typeIsKnown(false)
+  _typeIsKnown(false),
+  _merged_start(-1),
+  _merged_end(-1),
+  _merged_chrom("")
 {}
 
 // Destructor
@@ -193,6 +196,53 @@ BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum) {
 }
 
 
+// Returns the next merged (non-overlapping) interval; false once nothing is left to report.
+// Assumes records arrive sorted by chrom, then start. Only chrom/start/end are filled in.
+bool BedFile::GetNextMergedBed(BED &merged_bed, int &lineNum) {
+    BED bed;
+    BedLineStatus bedStatus;
+    // The stream state only gates READING: a block can still be pending after the
+    // stream has reached EOF (e.g. when the last line has no trailing newline).
+    while (_bedStream->good() &&
+           (bedStatus = GetNextBed(bed, lineNum)) != BED_INVALID) {
+        if (bedStatus != BED_VALID)
+            continue;   // neither a usable record nor the end of input: skip it
+
+        const bool startsNewBlock = ((int) bed.start - _merged_end > 0) ||
+                                    (_merged_end < 0) ||
+                                    (bed.chrom != _merged_chrom);
+        if (!startsNewBlock) {
+            // overlaps (or abuts) the pending block: extend it when it reaches further
+            if ((int) bed.end > _merged_end)
+                _merged_end = bed.end;
+            continue;
+        }
+
+        const bool havePending = (_merged_start >= 0);
+        if (havePending) {
+            merged_bed.chrom = _merged_chrom;
+            merged_bed.start = _merged_start;
+            merged_bed.end   = _merged_end;
+        }
+        // the current record becomes the new pending block
+        _merged_chrom = bed.chrom;
+        _merged_start = bed.start;
+        _merged_end   = bed.end;
+        if (havePending)
+            return true;
+    }
+    // input exhausted: hand out the last pending block exactly once
+    if (_merged_start >= 0) {
+        merged_bed.chrom = _merged_chrom;
+        merged_bed.start = _merged_start;
+        merged_bed.end   = _merged_end;
+        _merged_start = -1; _merged_end = -1; _merged_chrom.clear();
+        return true;
+    }
+    return false;
+}
+
+
 void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end,
                                  string strand, vector<BED> &hits, bool sameStrand, bool diffStrand) {
 
diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h
index 95deb922..61b4eb00 100644
--- a/src/utils/bedFile/bedFile.h
+++ b/src/utils/bedFile/bedFile.h
@@ -421,6 +421,10 @@ public:
 
     // Get the next BED entry in an opened BED file.
     BedLineStatus GetNextBed (BED &bed, int &lineNum);
+    
+    // Returns the next MERGED (i.e., non-overlapping) interval in an opened BED file
+    // NOTE: assumes input file is sorted by chrom then start
+    bool GetNextMergedBed(BED &merged_bed, int &lineNum);
 
     // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs
     void loadBedFileIntoMap();
@@ -487,6 +491,9 @@ private:
     istream   *_bedStream;
     string _bedLine;
     vector<string> _bedFields;
+    int _merged_start;
+    int _merged_end;
+    string _merged_chrom;
 
     void setZeroBased(bool zeroBased);
     void setGff (bool isGff);
-- 
GitLab