From 4f95177029075e3422970506f75d7bda8e2cd726 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Fri, 16 Sep 2011 13:06:20 -0400 Subject: [PATCH] Added new -sorted option to intersectBed. No memory and filthy fast. --- Makefile | 2 +- src/chromsweep/Makefile | 46 ---- src/chromsweep/chromsweepMain.cpp | 277 ---------------------- src/intersectBed/Makefile | 7 +- src/intersectBed/intersectBed.cpp | 107 +++++---- src/intersectBed/intersectBed.h | 11 +- src/intersectBed/intersectMain.cpp | 8 +- src/utils/chromsweep/Makefile | 32 +++ src/{ => utils}/chromsweep/chromsweep.cpp | 21 +- src/{ => utils}/chromsweep/chromsweep.h | 14 +- 10 files changed, 124 insertions(+), 401 deletions(-) delete mode 100644 src/chromsweep/Makefile delete mode 100644 src/chromsweep/chromsweepMain.cpp create mode 100644 src/utils/chromsweep/Makefile rename src/{ => utils}/chromsweep/chromsweep.cpp (91%) rename src/{ => utils}/chromsweep/chromsweep.h (89%) diff --git a/Makefile b/Makefile index d1d4924f..2209af58 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,6 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/bedToBam \ $(SRC_DIR)/bedToIgv \ $(SRC_DIR)/bed12ToBed6 \ - $(SRC_DIR)/chromsweep \ $(SRC_DIR)/closestBed \ $(SRC_DIR)/complementBed \ $(SRC_DIR)/coverageBed \ @@ -45,6 +44,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities \ $(SRC_DIR)/utils/bedFile \ $(SRC_DIR)/utils/bedGraphFile \ + $(SRC_DIR)/utils/chromsweep \ $(SRC_DIR)/utils/gzstream \ $(SRC_DIR)/utils/fileType \ $(SRC_DIR)/utils/bedFilePE \ diff --git a/src/chromsweep/Makefile b/src/chromsweep/Makefile deleted file mode 100644 index 4cd8f20f..00000000 --- a/src/chromsweep/Makefile +++ /dev/null @@ -1,46 +0,0 @@ -UTILITIES_DIR = ../utils/ -OBJ_DIR = ../../obj/ -BIN_DIR = ../../bin/ - -# ------------------- -# define our includes -# ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ - -I$(UTILITIES_DIR)/lineFileUtilities/ \ - -I$(UTILITIES_DIR)/version/ \ - -I$(UTILITIES_DIR)/gzstream/ \ - -I$(UTILITIES_DIR)/fileType/ - -# ---------------------------------- -# define our source and object files -# ---------------------------------- -SOURCES= chromsweepMain.cpp chromsweep.cpp -OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o -EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) -BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) -PROGRAM= chromsweep - -all: $(PROGRAM) - -.PHONY: all - -$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) - @echo " * linking $(PROGRAM)" - @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) - -$(BUILT_OBJECTS): $(SOURCES) - @echo " * compiling" $(*F).cpp - @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) - -$(EXT_OBJECTS): - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ - -clean: - @echo "Cleaning up." - @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* - -.PHONY: clean diff --git a/src/chromsweep/chromsweepMain.cpp b/src/chromsweep/chromsweepMain.cpp deleted file mode 100644 index 90ed322c..00000000 --- a/src/chromsweep/chromsweepMain.cpp +++ /dev/null @@ -1,277 +0,0 @@ -/***************************************************************************** - chromsweepMain.cpp - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0 license. -******************************************************************************/ -#include "chromsweep.h" -#include "version.h" - -using namespace std; - -// define our program name -#define PROGRAM_NAME "chromsweep" - - -// define our parameter checking macro -#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) - -// function declarations -void ShowHelp(void); - -int main(int argc, char* argv[]) { - - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - - // input arguments - float overlapFraction = 1E-9; - - bool haveBedA = false; - bool haveBedB = false; - bool noHit = false; - bool anyHit = false; - bool writeA = false; - bool writeB = false; - bool writeCount = false; - bool writeOverlap = false; - bool writeAllOverlap = false; - bool haveFraction = false; - bool reciprocalFraction = false; - bool forceStrand = false; - bool obeySplits = false; - bool inputIsBam = false; - bool outputIsBam = true; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - outputIsBam = false; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - inputIsBam = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { - outputIsBam = false; - } - else if(PARAMETER_CHECK("-u", 2, parameterLength)) { - anyHit = true; - } - else if(PARAMETER_CHECK("-f", 2, parameterLength)) { - if ((i+1) < argc) { - haveFraction = true; - overlapFraction = atof(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-wa", 3, parameterLength)) { - writeA = true; - } - else if(PARAMETER_CHECK("-wb", 3, parameterLength)) { - writeB = true; - } - else if(PARAMETER_CHECK("-wo", 3, parameterLength)) { - writeOverlap = true; - } - else if(PARAMETER_CHECK("-wao", 4, parameterLength)) { - writeAllOverlap = true; - writeOverlap = true; - } - else if(PARAMETER_CHECK("-c", 2, parameterLength)) { - writeCount = true; - } - else if(PARAMETER_CHECK("-r", 2, parameterLength)) { - reciprocalFraction = true; - } - else if (PARAMETER_CHECK("-v", 2, parameterLength)) { - noHit = true; - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else if (PARAMETER_CHECK("-split", 6, parameterLength)) { - obeySplits = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && noHit) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -v, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeB && writeCount) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -c, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeCount && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeA && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wa OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeB && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (reciprocalFraction && !haveFraction) { - cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeCount) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeB) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wb, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - - if (!showHelp) { - - ChromSweep *sweep = new ChromSweep(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, - writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, - reciprocalFraction, obeySplits, inputIsBam, outputIsBam); - - pair<BED, vector<BED> > hit_set; - while (sweep->Next(hit_set)) { - sweep->ReportQuery(hit_set.first); - cout << hit_set.second.size() << "\n"; - } - delete sweep; - return 0; - } - else { - ShowHelp(); - } -} - -void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Report overlaps between two feature files." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; - - cerr << "Options: " << endl; - - cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; - - cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; - cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; - - cerr << "\t-wa\t" << "Write the original entry in A for each overlap." << endl << endl; - - cerr << "\t-wb\t" << "Write the original entry in B for each overlap." << endl; - cerr << "\t\t- Useful for knowing _what_ A overlaps. Restricted by -f and -r." << endl << endl; - - cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl; - cerr << "\t\tpairs of overlap between the two features." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl; - cerr << "\t\t Only A features with overlap are reported." << endl << endl; - - cerr << "\t-wao\t" << "Write the original A and B entries plus the number of base" << endl; - cerr << "\t\tpairs of overlap between the two features." << endl; - cerr << "\t\t- Overlapping features restricted by -f and -r." << endl; - cerr << "\t\t However, A features w/o overlap are also reported" << endl; - cerr << "\t\t with a NULL B feature and overlap = 0." << endl << endl; - - cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; - cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; - - cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; - cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; - - cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; - cerr << "\t\t- Similar to \"grep -v\" (an homage)." << endl << endl; - - cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; - cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; - cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl; - - cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; - cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; - cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; - - cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; - cerr << "\t\toverlap A on the same strand." << endl; - cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; - - cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl; - - - // end the program here - exit(1); - -} diff --git a/src/intersectBed/Makefile b/src/intersectBed/Makefile index 41b4e25e..50ca671a 100644 --- a/src/intersectBed/Makefile +++ b/src/intersectBed/Makefile @@ -12,13 +12,15 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ -I$(UTILITIES_DIR)/lineFileUtilities/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools-Ancillary + -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/chromsweep \ + # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= intersectMain.cpp intersectBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o lineFileUtilities.o BamAncillary.o gzstream.o fileType.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o BamAncillary.o gzstream.o fileType.o chromsweep.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= intersectBed @@ -42,6 +44,7 @@ $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/chromsweep/ clean: @echo "Cleaning up." diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index 072c8c27..8f2dd4f3 100644 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -15,7 +15,7 @@ /************************************ Helper functions ************************************/ -bool BedIntersect::processHits(const BED &a, const vector<BED> &hits, bool printable) { +bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) { // how many overlaps are there b/w the bed and the set of hits? int s, e, overlapBases; @@ -37,7 +37,7 @@ bool BedIntersect::processHits(const BED &a, const vector<BED> &hits, bool print if (_reciprocal == false) { hitsFound = true; numOverlaps++; - if (printable == true) + if (_printable == true) ReportOverlapDetail(overlapBases, a, *h, s, e); } // we require there to be sufficient __reciprocal__ overlap @@ -47,7 +47,7 @@ bool BedIntersect::processHits(const BED &a, const vector<BED> &hits, bool print if (bOverlap >= _overlapFraction) { hitsFound = true; numOverlaps++; - if (printable == true) + if (_printable == true) ReportOverlapDetail(overlapBases, a, *h, s, e); } } @@ -66,7 +66,8 @@ bool BedIntersect::processHits(const BED &a, const vector<BED> &hits, bool print BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, - bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam) { + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam, + bool sortedInput) { _bedAFile = bedAFile; _bedBFile = bedBFile; @@ -85,11 +86,13 @@ BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, _bamInput = bamInput; _bamOutput = bamOutput; _isUncompressedBam = isUncompressedBam; + _sortedInput = sortedInput; - // create new BED file objects for A and B - _bedA = new BedFile(bedAFile); - _bedB = new BedFile(bedBFile); - + // should we print each overlap, or does the user want summary information? + _printable = true; + if (_anyHit || _noHit || _writeCount) + _printable = false; + if (_bamInput == false) IntersectBed(); else @@ -108,14 +111,9 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { bool hitsFound = false; - // should we print each overlap, or does the user want summary information? - bool printable = true; - if (_anyHit || _noHit || _writeCount) - printable = false; - // collect and report the sufficient hits _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand); - hitsFound = processHits(a, hits, printable); + hitsFound = processHits(a, hits); return hitsFound; } @@ -190,42 +188,61 @@ bool BedIntersect::FindOneOrMoreOverlap(const BED &a) { void BedIntersect::IntersectBed() { - // load the "B" file into a map in order to - // compare each entry in A to it in search of overlaps. - _bedB->loadBedFileIntoMap(); - - int lineNum = 0; - vector<BED> hits; - hits.reserve(100); - BED a, nullBed; - BedLineStatus bedStatus; - - // open the "A" file, process each BED entry and searh for overlaps. - _bedA->Open(); - while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - // treat the BED as a single "block" - if (_obeySplits == false) { - FindOverlaps(a, hits); - hits.clear(); - a = nullBed; - } - // split the BED12 into blocks and look for overlaps in each discrete block - else { - bedVector bedBlocks; // vec to store the discrete BED "blocks" - splitBedIntoBlocks(a, lineNum, bedBlocks); - - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - FindOverlaps(*bedItr, hits); + // create new BED file objects for A and B + _bedA = new BedFile(_bedAFile); + _bedB = new BedFile(_bedBFile); + + if (_sortedInput == false) { + // load the "B" file into a map in order to + // compare each entry in A to it in search of overlaps. + _bedB->loadBedFileIntoMap(); + + int lineNum = 0; + vector<BED> hits; + hits.reserve(100); + BED a, nullBed; + BedLineStatus bedStatus; + + // open the "A" file, process each BED entry and searh for overlaps. + _bedA->Open(); + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + // treat the BED as a single "block" + if (_obeySplits == false) { + FindOverlaps(a, hits); hits.clear(); + a = nullBed; + } + // split the BED12 into blocks and look for overlaps in each discrete block + else { + bedVector bedBlocks; // vec to store the discrete BED "blocks" + splitBedIntoBlocks(a, lineNum, bedBlocks); + + vector<BED>::const_iterator bedItr = bedBlocks.begin(); + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + FindOverlaps(*bedItr, hits); + hits.clear(); + } + a = nullBed; } - a = nullBed; } } + _bedA->Close(); + } + else { + // use the chromsweep algorithm to detect overlaps on the fly. + ChromSweep sweep = ChromSweep(_bedA, _bedB, _anyHit, _writeA, _writeB, _writeOverlap, + _writeAllOverlap, _overlapFraction, _noHit, _writeCount, _sameStrand, _diffStrand, + _reciprocal, _obeySplits, _bamInput, _bamOutput); + + // hit_set.first = current entry (a) from A + // hit_set.second = list of hits in B for this a + pair<BED, vector<BED> > hit_set; + while (sweep.Next(hit_set)) { + processHits(hit_set.first, hit_set.second); + } } - _bedA->Close(); } diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index e4106934..1a8d74a3 100644 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -13,7 +13,7 @@ #define INTERSECTBED_H #include "bedFile.h" - +#include "chromsweep.h" #include "api/BamReader.h" #include "api/BamWriter.h" #include "api/BamAux.h" @@ -37,7 +37,8 @@ public: BedIntersect(string bedAFile, string bedBFile, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, - bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam); + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam, + bool sortedInput); // destructor ~BedIntersect(void); @@ -67,7 +68,9 @@ private: bool _bamInput; bool _bamOutput; bool _isUncompressedBam; - + bool _sortedInput; + bool _printable; + // instance of a bed file class. BedFile *_bedA, *_bedB; @@ -80,7 +83,7 @@ private: void IntersectBam(string bamFile); - bool processHits(const BED &a, const vector<BED> &hits, bool printable); + bool processHits(const BED &a, const vector<BED> &hits); bool FindOverlaps(const BED &a, vector<BED> &hits); diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp index 91d0dd44..a8d393f7 100644 --- a/src/intersectBed/intersectMain.cpp +++ b/src/intersectBed/intersectMain.cpp @@ -53,6 +53,7 @@ int main(int argc, char* argv[]) { bool inputIsBam = false; bool outputIsBam = true; bool uncompressedBam = false; + bool sortedInput = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -142,6 +143,9 @@ int main(int argc, char* argv[]) { else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { uncompressedBam = true; } + else if(PARAMETER_CHECK("-sorted", 7, parameterLength)) { + sortedInput = true; + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -208,7 +212,7 @@ int main(int argc, char* argv[]) { BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, writeAllOverlap, overlapFraction, noHit, writeCount, sameStrand, diffStrand, - reciprocalFraction, obeySplits, inputIsBam, outputIsBam, uncompressedBam); + reciprocalFraction, obeySplits, inputIsBam, outputIsBam, uncompressedBam, sortedInput); delete bi; return 0; } @@ -281,6 +285,8 @@ void ShowHelp(void) { cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl; + cerr << "\t-sorted\t" << "Use the \"chromsweep\" algorithm for sorted (-k1,1 -k2,2n) input" << endl << endl; + // end the program here exit(1); diff --git a/src/utils/chromsweep/Makefile b/src/utils/chromsweep/Makefile new file mode 100644 index 00000000..005e928d --- /dev/null +++ b/src/utils/chromsweep/Makefile @@ -0,0 +1,32 @@ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= chromsweep.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=lineFileUtilities.o fileType.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean \ No newline at end of file diff --git a/src/chromsweep/chromsweep.cpp b/src/utils/chromsweep/chromsweep.cpp similarity index 91% rename from src/chromsweep/chromsweep.cpp rename to src/utils/chromsweep/chromsweep.cpp index 70c7cdc8..e4788e31 100644 --- a/src/chromsweep/chromsweep.cpp +++ b/src/utils/chromsweep/chromsweep.cpp @@ -12,7 +12,6 @@ #include "lineFileUtilities.h" #include "chromsweep.h" #include <queue> -#include <set> bool after(const BED &a, const BED &b); void report_hits(const BED &curr_qy, const vector<BED> &hits); @@ -22,13 +21,13 @@ vector<BED> scan_cache(const BED &curr_qy, BedLineStatus qy_status, const vector /* Constructor */ -ChromSweep::ChromSweep(string bedAFile, string bedBFile, bool anyHit, +ChromSweep::ChromSweep(BedFile *bedA, BedFile *bedB, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, - float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) { - _bedAFile = bedAFile; - _bedBFile = bedBFile; + _bedA = bedA; + _bedB = bedB; _anyHit = anyHit; _noHit = noHit; _writeA = writeA; @@ -37,20 +36,12 @@ ChromSweep::ChromSweep(string bedAFile, string bedBFile, bool anyHit, _writeAllOverlap = writeAllOverlap; _writeCount = writeCount; _overlapFraction = overlapFraction; - _forceStrand = forceStrand; + _sameStrand = sameStrand; + _diffStrand = diffStrand; _reciprocal = reciprocal; _obeySplits = obeySplits; _bamInput = bamInput; _bamOutput = bamOutput; - - if (_anyHit || _noHit || _writeCount) - _printable = false; - else - _printable = true; - - // create new BED file objects for A and B - _bedA = new BedFile(bedAFile); - _bedB = new BedFile(bedBFile); // prime the results pump. _qy_lineNum = 0; diff --git a/src/chromsweep/chromsweep.h b/src/utils/chromsweep/chromsweep.h similarity index 89% rename from src/chromsweep/chromsweep.h rename to src/utils/chromsweep/chromsweep.h index 1fb58405..5a18ab54 100644 --- a/src/chromsweep/chromsweep.h +++ b/src/utils/chromsweep/chromsweep.h @@ -13,13 +13,6 @@ #define CHROMSWEEP_H #include "bedFile.h" -// #include "BamReader.h" -// #include "BamWriter.h" -// #include "BamAncillary.h" -// #include "BamAux.h" -// using namespace BamTools; - - #include <vector> #include <queue> #include <iostream> @@ -34,9 +27,9 @@ class ChromSweep { public: // constructor - ChromSweep(string bedAFile, string bedBFile, bool anyHit, + ChromSweep(BedFile *bedA, BedFile *bedB, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, - float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput); // destructor @@ -59,7 +52,8 @@ private: bool _writeOverlap; bool _writeAllOverlap; - bool _forceStrand; + bool _sameStrand; + bool _diffStrand; bool _reciprocal; float _overlapFraction; -- GitLab