From 017f6e61d5e692ceee68026201726b5338142357 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Wed, 2 Nov 2011 13:01:50 -0400 Subject: [PATCH] Added new multiIntersectBed tool. New relase to follow. --- Makefile | 1 + src/multiIntersectBed/Makefile | 49 +++ src/multiIntersectBed/intervalItem.h | 64 ++++ src/multiIntersectBed/multiIntersectBed.cpp | 256 +++++++++++++++ src/multiIntersectBed/multiIntersectBed.h | 123 ++++++++ .../multiIntersectBedMain.cpp | 294 ++++++++++++++++++ src/utils/bedFile/bedFile.cpp | 52 +++- src/utils/bedFile/bedFile.h | 7 + 8 files changed, 845 insertions(+), 1 deletion(-) create mode 100644 src/multiIntersectBed/Makefile create mode 100644 src/multiIntersectBed/intervalItem.h create mode 100644 src/multiIntersectBed/multiIntersectBed.cpp create mode 100644 src/multiIntersectBed/multiIntersectBed.h create mode 100644 src/multiIntersectBed/multiIntersectBedMain.cpp diff --git a/Makefile b/Makefile index 2209af58..4e1b2af9 100644 --- a/Makefile +++ b/Makefile @@ -29,6 +29,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/maskFastaFromBed \ $(SRC_DIR)/mergeBed \ $(SRC_DIR)/multiBamCov \ + $(SRC_DIR)/multiIntersectBed \ $(SRC_DIR)/nucBed \ $(SRC_DIR)/overlap \ $(SRC_DIR)/pairToBed \ diff --git a/src/multiIntersectBed/Makefile b/src/multiIntersectBed/Makefile new file mode 100644 index 00000000..a076f3b4 --- /dev/null +++ b/src/multiIntersectBed/Makefile @@ -0,0 +1,49 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= multiIntersectBed.cpp multiIntersectBedMain.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o genomeFile.o lineFileUtilities.o gzstream.o fileType.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= multiIntersectBed + +all: $(PROGRAM) + +.PHONY: all + +$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) + @echo " * linking $(PROGRAM)" + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedGraphFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/genomeFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/multiIntersectBed/intervalItem.h b/src/multiIntersectBed/intervalItem.h new file mode 100644 index 00000000..a63e0ce4 --- /dev/null +++ b/src/multiIntersectBed/intervalItem.h @@ -0,0 +1,64 @@ +/***************************************************************************** + intervalItem.h + + (c) 2010 - Assaf Gordon + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef INTERVALITEM_H +#define INTERVALITEM_H + +#include <string> +#include <queue> + +enum COORDINATE_TYPE { + START, + END +}; + +/* + An interval item in the priority queue. + + An IntervalItem can mark either a START position or an END position. + */ +class IntervalItem +{ + + +public: + int source_index; // which source BedGraph file this came from + COORDINATE_TYPE coord_type; // is this the start or the end position? + CHRPOS coord; + + IntervalItem () : + source_index(-1), + coord_type(START), + coord(0) + {} + + IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord) : + source_index(_index), + coord_type(_type), + coord(_coord) + {} + + IntervalItem(const IntervalItem &other) : + source_index(other.source_index), + coord_type(other.coord_type), + coord(other.coord) + {} + + bool operator< ( const IntervalItem& other ) const + { + return this->coord > other.coord; + } +}; + +// our priority queue +typedef std::priority_queue<IntervalItem> INTERVALS_PRIORITY_QUEUE; + +#endif diff --git a/src/multiIntersectBed/multiIntersectBed.cpp b/src/multiIntersectBed/multiIntersectBed.cpp new file mode 100644 index 00000000..f024df60 --- /dev/null +++ b/src/multiIntersectBed/multiIntersectBed.cpp @@ -0,0 +1,256 @@ +/***************************************************************************** + unionBedGraphs.cpp + + (c) 2010 - Assaf Gordon, CSHL + - Aaron Quinlan, UVA + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include <cassert> +#include <cstring> +#include <cstdlib> +#include <iostream> +#include <algorithm> + +#include "bedFile.h" +#include "multiIntersectBed.h" + +using namespace std; + + +MultiIntersectBed::MultiIntersectBed(std::ostream& _output, + const vector<string>& _filenames, + const vector<string>& _titles, + bool _print_empty_regions, + const std::string& _genome_size_filename, + const std::string& _no_coverage_value ) : + filenames(_filenames), + titles(_titles), + output(_output), + current_non_zero_inputs(0), + print_empty_regions(_print_empty_regions), + genome_sizes(NULL), + no_coverage_value(_no_coverage_value) +{ + if (print_empty_regions) { + assert(!_genome_size_filename.empty()); + + genome_sizes = new GenomeFile(_genome_size_filename); + } +} + + +MultiIntersectBed::~MultiIntersectBed() { + CloseFiles(); + if (genome_sizes) { + delete genome_sizes; + genome_sizes = NULL ; + } +} + + +void MultiIntersectBed::MultiIntersect() { + OpenFiles(); + + // Add the first interval from each file + for(size_t i = 0;i < input_files.size(); ++i) + LoadNextItem(i); + + // Chromosome loop - once per chromosome + do { + // Find the first chromosome to use + current_chrom = DetermineNextChrom(); + + // Populate the queue with initial values from all files + // (if they belong to the correct chromosome) + for(size_t i = 0; i < input_files.size(); ++i) + AddInterval(i); + + CHRPOS current_start = ConsumeNextCoordinate(); + + // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage + if (print_empty_regions && current_start > 0) + PrintEmptyCoverage(0,current_start); + + // Intervals loop - until all intervals (of current chromosome) from all files are used. + do { + CHRPOS current_end = queue.top().coord; + PrintCoverage(current_start, current_end); + current_start = ConsumeNextCoordinate(); + } while (!queue.empty()); + + // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome + // print a dummy empty coverage + if (print_empty_regions) { + CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); + if (current_start < chrom_size) + PrintEmptyCoverage(current_start, chrom_size); + } + + } while (!AllFilesDone()); +} + + +CHRPOS MultiIntersectBed::ConsumeNextCoordinate() { + assert(!queue.empty()); + + CHRPOS new_position = queue.top().coord; + do { + IntervalItem item = queue.top(); + UpdateInformation(item); + queue.pop(); + } while (!queue.empty() && queue.top().coord == new_position); + + return new_position; +} + + +void MultiIntersectBed::UpdateInformation(const IntervalItem &item) { + // Update the depth coverage for this file + + // Which coordinate is it - start or end? + switch (item.coord_type) + { + case START: + current_depth[item.source_index] = 1; + current_non_zero_inputs++; + break; + case END: + //Read the next interval from this file + AddInterval(item.source_index); + current_depth[item.source_index] = 0; + current_non_zero_inputs--; + break; + default: + assert(0); + } +} + + +void MultiIntersectBed::AddInterval(int index) { + assert(static_cast<unsigned int>(index) < input_files.size()); + + //This file has no more intervals + if (current_item[index].chrom.empty()) + return; + + //If the next interval belongs to a different chrom, don't add it + if (current_item[index].chrom!=current_chrom) + return; + + const BED &bed(current_item[index]); + + IntervalItem start_item(index, START, bed.start); + IntervalItem end_item(index, END, bed.end); + + queue.push(start_item); + queue.push(end_item); + + LoadNextItem(index); +} + + +void MultiIntersectBed::PrintHeader() { + output << "chrom\tstart\tend" ; + for (size_t i=0;i<titles.size();++i) + output << "\t" <<titles[i]; + output << endl; +} + + +void MultiIntersectBed::PrintCoverage(CHRPOS start, CHRPOS end) { + if ( current_non_zero_inputs == 0 && ! print_empty_regions ) + return ; + + output << current_chrom << "\t" + << start << "\t" + << end; + + for (size_t i=0;i<current_depth.size();++i) + output << "\t" << current_depth[i] ; + + output << endl; +} + + +void MultiIntersectBed::PrintEmptyCoverage(CHRPOS start, CHRPOS end) { + output << current_chrom << "\t" + << start << "\t" + << end; + + for (size_t i=0;i<current_depth.size();++i) + output << "\t0"; + + output << endl; +} + + +void MultiIntersectBed::LoadNextItem(int index) { + assert(static_cast<unsigned int>(index) < input_files.size()); + + current_item[index].chrom=""; + + BedFile *file = input_files[index]; + BED merged_bed; + int lineNum = 0; + // + // TO DO: Do the mergeing on the fly. How best to do this? + // + // IDEA: Implement a Merge class with GetNextMerge element. + // + + while (file->GetNextMergedBed(merged_bed, lineNum)) + { + current_item[index] = merged_bed; + break; + } +} + + +bool MultiIntersectBed::AllFilesDone() { + for (size_t i=0;i<current_item.size();++i) + if (!current_item[i].chrom.empty()) + return false; + return true; +} + + +string MultiIntersectBed::DetermineNextChrom() { + string next_chrom; + for (size_t i=0;i<current_item.size();++i) { + if (current_item[i].chrom.empty()) + continue; + + if (next_chrom.empty()) + next_chrom = current_item[i].chrom; + else + if (current_item[i].chrom < next_chrom) + next_chrom = current_item[i].chrom ; + } + return next_chrom; +} + + +void MultiIntersectBed::OpenFiles() { + for (size_t i = 0; i < filenames.size(); ++i) { + BedFile *file = new BedFile(filenames[i]); + file->Open(); + input_files.push_back(file); + current_depth.push_back(0); + } + current_item.resize(filenames.size()); +} + + +void MultiIntersectBed::CloseFiles() { + for (size_t i=0; i < input_files.size(); ++i) { + BedFile *file = input_files[i]; + delete file; + input_files[i] = NULL ; + } + input_files.clear(); +} diff --git a/src/multiIntersectBed/multiIntersectBed.h b/src/multiIntersectBed/multiIntersectBed.h new file mode 100644 index 00000000..5c3112d5 --- /dev/null +++ b/src/multiIntersectBed/multiIntersectBed.h @@ -0,0 +1,123 @@ +/***************************************************************************** + multiIntersectBed.h + + (c) 2010 - Aaron Quinlan, UVA + - Assaf Gordon, CSHL + Quinlan Laboratory + Department of Public Health Sciences + Center for Public Health Genomics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef MULTIINTERSECTBED_H +#define MULTIINTERSECTBED_H + +#include <vector> +#include <string> +#include "bedFile.h" +#include "genomeFile.h" +#include "intervalItem.h" + +class MultiIntersectBed +{ +private: + + vector<string> filenames; + vector<string> titles; + + vector<BedFile*> input_files; + vector<int> current_depth; + vector<BED> current_item; + + std::ostream &output; + + INTERVALS_PRIORITY_QUEUE queue; + std::string current_chrom; + int current_non_zero_inputs; + bool print_empty_regions; + + GenomeFile* genome_sizes; + + std::string no_coverage_value; + +public: + MultiIntersectBed(std::ostream& _output, + const vector<string>& _filenames, + const vector<string>& _titles, + bool _print_empty_regions, + const std::string& _genomeFileName, + const std::string& _no_coverage_value); + + virtual ~MultiIntersectBed(); + + // Combines all interval files + void MultiIntersect(); + + // Print the header line: chrom/start/end + name of each bedgraph file. + void PrintHeader(); + + +private: + + // Open all input files, initialize "current_XXX" vectors + void OpenFiles(); + + // Close the input files. + void CloseFiles(); + + /* + Add an interval from BedGraph file 'index' into the queue. + will only be added if it belongs to the current chromosome. + + If the interval was added (=consumed), the next interval will be read from the file + using 'LoadNextItem' + */ + void AddInterval(int index); + + /* + Loads the next interval from Bed file 'index'. + Stores it in 'current_bed_item' vector. + */ + void LoadNextItem(int index); + + /* + Scans the 'current_bedgraph_item' vector, + find the 'first' chromosome to use (different BedGraph files can start with different chromosomes). + */ + std::string DetermineNextChrom(); + + /* + Returns 'true' if ALL intervals from ALL BedGraph files were used + */ + bool AllFilesDone(); + + /* + Extract the next coordinate from the queue, and updates the current coverage information. + If multiple interval share the same coordinate values, all of them are handled. + If an END coordinate is consumed, the next interval (from the corresponding file) is read. + */ + CHRPOS ConsumeNextCoordinate(); + + /* + Updates the coverage information based on the given item. + Item can be a START coordinate or an END coordiante. + */ + void UpdateInformation(const IntervalItem &item); + + /* + prints chrom/start/end and the current depth coverage values of all the files. + */ + void PrintCoverage(CHRPOS start, CHRPOS end); + + /* + prints chrom/start/end and the ZERO depth coverage values of all the files. + */ + void PrintEmptyCoverage(CHRPOS start, CHRPOS end); + + void DebugPrintQueue(); +}; + + +#endif diff --git a/src/multiIntersectBed/multiIntersectBedMain.cpp b/src/multiIntersectBed/multiIntersectBedMain.cpp new file mode 100644 index 00000000..0970f23d --- /dev/null +++ b/src/multiIntersectBed/multiIntersectBedMain.cpp @@ -0,0 +1,294 @@ +/***************************************************************************** + unionBedGraphsMain.cpp + + (c) 2010 - Assaf Gordon, CSHL + - Aaron Quinlan, UVA + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include <climits> +#include <cstring> +#include <cstdlib> +#include <vector> +#include <string> +#include <iostream> +#include <getopt.h> +#include <libgen.h> //for basename() +#include "version.h" + +#include "genomeFile.h" +#include "MultiIntersectBed.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "multiIntersectBed" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +//STLized version of basename() +// (because POSIX basename() modifies the input string pointer) +// Additionally: removes any extension the basename might have. +std::string stl_basename(const std::string& path); + +// function declarations +void ShowHelp(void); +void ShowExamples(void); + + +int main(int argc, char* argv[]) +{ + bool haveFiles = false; + bool haveTitles = false; + bool haveGenome = false; + bool haveFiller = true; + bool printHeader = false; + bool printEmptyRegions = false; + bool showHelp = false; + string genomeFile; + string basePath; + string noCoverageValue = "0"; + vector<string> inputFiles; + vector<string> inputTitles; + + //Parse command line options + if(argc <= 1) + ShowHelp(); + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp == true) { + ShowHelp(); + exit(1); + } + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + haveFiles = true; + i = i+1; + string file = argv[i]; + while (file[0] != '-' && i < argc) { + inputFiles.push_back(file); + i++; + if (i < argc) + file = argv[i]; + } + i--; + } + } + else if(PARAMETER_CHECK("-names", 6, parameterLength)) { + if ((i+1) < argc) { + haveTitles = true; + i = i+1; + string title = argv[i]; + while (title[0] != '-' && i < argc) { + inputTitles.push_back(title); + i++; + if (i < argc) + title = argv[i]; + } + i--; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-filler", 7, parameterLength)) { + if ((i+1) < argc) { + haveFiller = true; + noCoverageValue = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-header", 7, parameterLength)) { + printHeader = true; + } + else if(PARAMETER_CHECK("-empty", 6, parameterLength)) { + printEmptyRegions = true; + } + else if(PARAMETER_CHECK("-examples", 9, parameterLength)) { + ShowHelp(); + ShowExamples(); + exit(1); + } + } + + //Sanity checks + if (inputFiles.empty() == true) { + cerr << "Error: missing BedGraph file names (-i) to combine." << endl; + exit(1); + } + if (inputFiles.size() == 1) { + cerr << "Error: Only a single BedGraph file was specified. Nothing to combine, exiting." << endl; + exit(1); + } + if (printEmptyRegions && (genomeFile.empty() == true)) { + cerr << "Error: when using -empty, the genome sizes file (-g) must be specified using '-g FILE'." << endl; + exit(1); + } + if ((haveTitles == true) && (inputFiles.size() != inputTitles.size())) { + cerr << "Error: The number of file titles (-names) does not match the number of files (-i)." << endl; + exit(1); + } + + MultiIntersectBed mbi(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue); + if (printHeader) + mbi.PrintHeader(); + mbi.MultiIntersect(); +} + +void ShowHelp(void) { + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Authors: Assaf Gordon, CSHL" << endl; + cerr << " Aaron Quinlan (aaronquinlan@gmail.com)" << endl << endl; + + cerr << "Summary: Combines multiple BedGraph files into a single file," << endl; + cerr << "\t allowing coverage comparisons between them." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i FILE1 FILE2 .. FILEn" << endl; + cerr << "\t Assumes that each BedGraph file is sorted by chrom/start " << endl; + cerr << "\t and that the intervals in each are non-overlapping." << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-header\t\t" << "Print a header line." << endl; + cerr << "\t\t\t(chrom/start/end + names of each file)." << endl << endl; + + cerr << "\t-names\t\t" << "A list of names (one / file) to describe each file in -i." << endl; + cerr << "\t\t\tThese names will be printed in the header line." << endl << endl; + + cerr << "\t-g\t\t" << "Use genome file to calculate empty regions." << endl; + cerr << "\t\t\t- STRING." << endl << endl; + + cerr << "\t-empty\t\t" << "Report empty regions (i.e., start/end intervals w/o" << endl; + cerr << "\t\t\tvalues in all files)." << endl; + cerr << "\t\t\t- Requires the '-g FILE' parameter.\n" << endl; + + cerr << "\t-filler TEXT\t" << "Use TEXT when representing intervals having no value." << endl; + cerr << "\t\t\t- Default is '0', but you can use 'N/A' or any other text." << endl << endl; + + cerr << "\t-examples\t" << "Show detailed usage examples." << endl << endl; +} + + + +void ShowExamples() +{ + cerr << "Example usage:\n\n" \ +"== Input files: ==\n" \ +"\n" \ +" $ cat 1.bg\n" \ +" chr1 1000 1500 10\n" \ +" chr1 2000 2100 20\n" \ +"\n" \ +" $ cat 2.bg\n" \ +" chr1 900 1600 60\n" \ +" chr1 1700 2050 50\n" \ +"\n" \ +" $ cat 3.bg\n" \ +" chr1 1980 2070 80\n" \ +" chr1 2090 2100 20\n" \ +"\n" \ +" $ cat sizes.txt\n" \ +" chr1 5000\n" \ +"\n" \ +"== Union/combine the files: ==\n" \ +"\n" \ +" $ unionBedGraphs -i 1.bg 2.bg 3.bg\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ +"\n" \ +"== Union/combine the files, with a header line (titles are the file names): ==\n" \ +"\n" \ +" $ unionBedGraphs -header -i 1.bg 2.bg 3.bg\n" \ +" chrom start end 1 2 3\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ +"\n" \ +"== Union/combine the files, with a header line and custom names: ==\n" \ +"\n" \ +" $ unionBedGraphs -header -i 1.bg 2.bg 3.bg -names WT-1 WT-2 KO-1\n" \ +" chrom start end WT-1 WT-2 KO-1\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ +"\n" \ +"== Union/combine, showing empty regions (note, requires -g): ==\n" \ +"\n" \ +" $ unionBedGraphs -header -empty -g sizes.TXT -i 1.bg 2.bg 3.bg\n" \ +" chrom start end 1 2 3\n" \ +" chr1 0 900 0 0 0\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1600 1700 0 0 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ +" chr1 2100 5000 0 0 0\n" \ +"\n" \ +; +} + +std::string stl_basename(const std::string& path) +{ + string result; + + char* path_dup = strdup(path.c_str()); + char* basename_part = basename(path_dup); + result = basename_part; + free(path_dup); + + size_t pos = result.find_last_of('.'); + if (pos != string::npos ) + result = result.substr(0,pos); + + return result; +} + diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index e225541b..ec51e398 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -119,7 +119,10 @@ BedFile::BedFile(string &bedFile) : bedFile(bedFile), _isGff(false), _isVcf(false), - _typeIsKnown(false) + _typeIsKnown(false), + _merged_start(-1), + _merged_end(-1), + _merged_chrom("") {} // Destructor @@ -193,6 +196,53 @@ BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum) { } +bool BedFile::GetNextMergedBed(BED &merged_bed, int &lineNum) { + + if (_bedStream->good()) { + BED bed; + BedLineStatus bedStatus; + while ((bedStatus = GetNextBed(bed, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + if (((int) bed.start - _merged_end > 0) || + (_merged_end < 0) || + (bed.chrom != _merged_chrom)) + { + if (_merged_start >= 0) { + merged_bed.chrom = _merged_chrom; + merged_bed.start = _merged_start; + merged_bed.end = _merged_end; + + _merged_chrom = bed.chrom; + _merged_start = bed.start; + _merged_end = bed.end; + + return true; + } + else { + _merged_start = bed.start; + _merged_chrom = bed.chrom; + _merged_end = bed.end; + } + } + else if ((int) bed.end > _merged_end) + { + _merged_end = bed.end; + } + } + } + // handle the last merged block in the file. + if (bedStatus == BED_INVALID) + { + merged_bed.chrom = _merged_chrom; + merged_bed.start = _merged_start; + merged_bed.end = _merged_end; + return true; + } + } + return false; +} + + void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool sameStrand, bool diffStrand) { diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 95deb922..61b4eb00 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -421,6 +421,10 @@ public: // Get the next BED entry in an opened BED file. BedLineStatus GetNextBed (BED &bed, int &lineNum); + + // Returns the next MERGED (i.e., non-overlapping) interval in an opened BED file + // NOTE: assumes input file is sorted by chrom then start + bool GetNextMergedBed(BED &merged_bed, int &lineNum); // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs void loadBedFileIntoMap(); @@ -487,6 +491,9 @@ private: istream *_bedStream; string _bedLine; vector<string> _bedFields; + int _merged_start; + int _merged_end; + string _merged_chrom; void setZeroBased(bool zeroBased); void setGff (bool isGff); -- GitLab