From 272344949d8ecd905b68665e61f5e469c2158cfc Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Mon, 12 Apr 2010 09:58:29 -0400 Subject: [PATCH] Added the -wao option to intersectBed. - Allows one to report 0 overlap for non-overlapping A features - Changed all private memeber variables to use "_VAR" notation. - Added convenience functions for reporting overlaps. --- src/intersectBed/intersectBed.cpp | 226 ++++++++++++++++------------- src/intersectBed/intersectBed.h | 73 +++++----- src/intersectBed/intersectMain.cpp | 18 ++- src/utils/bedFile/bedFile.cpp | 64 ++++++++ src/utils/bedFile/bedFile.h | 2 + 5 files changed, 243 insertions(+), 140 deletions(-) diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index b8caf0f6..3191a9ae 100755 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -17,27 +17,28 @@ Constructor */ BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, - bool writeA, bool writeB, bool writeOverlap, float overlapFraction, - bool noHit, bool writeCount, bool forceStrand, bool reciprocal, - bool bamInput, bool bamOutput) { + bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, + float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + bool reciprocal, bool bamInput, bool bamOutput) { - this->bedAFile = bedAFile; - this->bedBFile = bedBFile; - this->anyHit = anyHit; - this->noHit = noHit; - this->writeA = writeA; - this->writeB = writeB; - this->writeOverlap = writeOverlap; - this->writeCount = writeCount; - this->overlapFraction = overlapFraction; - this->forceStrand = forceStrand; - this->reciprocal = reciprocal; - this->bamInput = bamInput; - this->bamOutput = bamOutput; + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _anyHit = anyHit; + _noHit = noHit; + _writeA = writeA; + _writeB = writeB; + _writeOverlap = writeOverlap; + _writeAllOverlap = writeAllOverlap; + _writeCount = writeCount; + _overlapFraction = overlapFraction; + _forceStrand = forceStrand; + _reciprocal = reciprocal; + _bamInput = bamInput; + _bamOutput = bamOutput; // create new BED file objects for A and B - this->bedA = new BedFile(bedAFile); - this->bedB = new BedFile(bedBFile); + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); } @@ -53,109 +54,117 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { bool hitsFound = false; // grab _all_ of the features in B that overlap with a. - bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, this->forceStrand); + _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); // how many overlaps are there b/w a and B? int numOverlaps = 0; // should we print each overlap, or does the user want summary information? bool printable = true; - if (anyHit || noHit || writeCount) { + if (_anyHit || _noHit || _writeCount) printable = false; - } // loop through the hits and report those that meet the user's criteria vector<BED>::const_iterator h = hits.begin(); vector<BED>::const_iterator hitsEnd = hits.end(); for (; h != hitsEnd; ++h) { - int s = max(a.start, h->start); - int e = min(a.end, h->end); + int s = max(a.start, h->start); + int e = min(a.end, h->end); int overlapBases = (e - s); // the number of overlapping bases b/w a and b - int aLength = (a.end - a.start); // the length of a in b.p. + int aLength = (a.end - a.start); // the length of a in b.p. // is there enough overlap relative to the user's request? (default ~ 1bp) - if ( ( (float) overlapBases / (float) aLength ) >= this->overlapFraction ) { + if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { // Report the hit if the user doesn't care about reciprocal overlap between A and B. - if (!reciprocal) { - + if (_reciprocal == false) { hitsFound = true; - numOverlaps++; // we have another hit for A - - if (printable == true) { - if (writeA == false && writeB == false && writeOverlap == false) { - bedA->reportBedRangeNewLine(a,s,e); - } - else if (writeA == true && writeB == true) { - bedA->reportBedTab(a); - bedB->reportBedNewLine(*h); - } - else if (writeA == true) { - bedA->reportBedNewLine(a); - } - else if (writeB == true) { - bedA->reportBedRangeTab(a,s,e); - bedB->reportBedNewLine(*h); - } - else if (writeOverlap == true) { - bedA->reportBedTab(a); - bedB->reportBedTab(*h); - printf("%d\n", overlapBases); - } - } - + numOverlaps++; } - else { // the user wants there to be sufficient reciprocal overlap - int bLength = (h->end - h->start); + // we require there to be sufficient __reciprocal__ overlap + else { + int bLength = (h->end - h->start); float bOverlap = ( (float) overlapBases / (float) bLength ); - - if (bOverlap >= this->overlapFraction) { + if (bOverlap >= _overlapFraction) { hitsFound = true; - numOverlaps++; // we have another hit for A - - if (!writeB && printable) { - if (writeA) bedA->reportBedNewLine(a); - else bedA->reportBedRangeNewLine(a,s,e); - } - else if (printable) { - if (writeA) { - bedA->reportBedTab(a); - bedB->reportBedNewLine(*h); - } - else { - bedA->reportBedRangeTab(a,s,e); - bedB->reportBedNewLine(*h); - } - } + numOverlaps++; } } + // report the overlap per the user's request. + if (printable == true) { + ReportOverlapDetail(overlapBases, a, *h, s, e); + } } } - if (anyHit && (numOverlaps >= 1)) { - bedA->reportBedNewLine(a); + // report the summary of the overlaps if requested. + ReportOverlapSummary(a, numOverlaps); + // were hits found for this BED feature? + return hitsFound; +} + + +void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, + const int &s, const int &e) { + // simple intersection only + if (_writeA == false && _writeB == false && _writeOverlap == false) { + _bedA->reportBedRangeNewLine(a,s,e); + } + // write the original A and B + else if (_writeA == true && _writeB == true) { + _bedA->reportBedTab(a); + _bedB->reportBedNewLine(b); } - else if (writeCount) { - bedA->reportBedTab(a); - printf("%d\n", numOverlaps); + // write just the original A + else if (_writeA == true) { + _bedA->reportBedNewLine(a); } - else if (noHit && (numOverlaps == 0)) { - bedA->reportBedNewLine(a); + // write the intersected portion of A and the original B + else if (_writeB == true) { + _bedA->reportBedRangeTab(a,s,e); + _bedB->reportBedNewLine(b); } + // write the original A and B plus the no. of overlapping bases. + else if (_writeOverlap == true) { + _bedA->reportBedTab(a); + _bedB->reportBedTab(b); + printf("%d\n", overlapBases); + } +} - return hitsFound; + +void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { + // -u just report the fact that there was >= 1 overlaps + if (_anyHit && (numOverlapsFound >= 1)) { + _bedA->reportBedNewLine(a); + } + // -c report the total number of features overlapped in B + else if (_writeCount) { + _bedA->reportBedTab(a); + printf("%d\n", numOverlapsFound); + } + // -v report iff there were no overlaps + else if (_noHit && (numOverlapsFound == 0)) { + _bedA->reportBedNewLine(a); + } + // -wao the user wants to force the reporting of 0 overlap + else if (_writeAllOverlap && (numOverlapsFound == 0)) { + _bedA->reportBedTab(a); + _bedB->reportNullBedTab(); + printf("0\n"); + } } bool BedIntersect::FindOneOrMoreOverlap(const BED &a) { bool overlapsFound; - if (this->reciprocal == false) { - overlapsFound = bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, - this->forceStrand, this->overlapFraction); + if (_reciprocal == false) { + overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, + _forceStrand, _overlapFraction); } else { - overlapsFound = bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, - this->forceStrand, this->overlapFraction); + overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, + _forceStrand, _overlapFraction); } return overlapsFound; } @@ -165,7 +174,7 @@ void BedIntersect::IntersectBed(istream &bedInput) { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps - bedB->loadBedFileIntoMap(); + _bedB->loadBedFileIntoMap(); string bedLine; int lineNum = 0; // current input line number vector<BED> hits; // vector of potential hits @@ -182,7 +191,7 @@ void BedIntersect::IntersectBed(istream &bedInput) { Tokenize(bedLine,bedFields); BED a; // find the overlaps with B if it's a valid BED entry. - if (bedA->parseLine(a, bedFields, lineNum)) { + if (_bedA->parseLine(a, bedFields, lineNum)) { FindOverlaps(a, hits); hits.clear(); } @@ -196,7 +205,7 @@ void BedIntersect::IntersectBam(string bamFile) { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps - bedB->loadBedFileIntoMap(); + _bedB->loadBedFileIntoMap(); // open the BAM file BamReader reader; @@ -204,11 +213,11 @@ void BedIntersect::IntersectBam(string bamFile) { reader.Open(bamFile); // get header & reference information - string header = reader.GetHeaderText(); + string header = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // open a BAM output to stdout if we are writing BAM - if (this->bamOutput == true) { + if (_bamOutput == true) { // open our BAM writer writer.Open("stdout", header, refs); } @@ -217,7 +226,7 @@ void BedIntersect::IntersectBam(string bamFile) { // reserve some space hits.reserve(100); - bedA->bedType = 6; + _bedA->bedType = 6; BamAlignment bam; bool overlapsFound; // get each set of alignments for each pair. @@ -227,23 +236,25 @@ void BedIntersect::IntersectBam(string bamFile) { BED a; a.chrom = refs.at(bam.RefID).RefName; a.start = bam.Position; - a.end = bam.Position + bam.AlignedBases.size(); + a.end = bam.Position + bam.AlignedBases.size(); // build the name field from the BAM alignment. a.name = bam.Name; if (bam.IsFirstMate()) a.name += "/1"; if (bam.IsSecondMate()) a.name += "/2"; - a.score = ToString(bam.MapQuality); + a.score = ToString(bam.MapQuality); a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; - if (this->bamOutput == true) { + if (_bamOutput == true) { overlapsFound = FindOneOrMoreOverlap(a); if (overlapsFound == true) { - if (!this->noHit) writer.SaveAlignment(bam); + if (_noHit == false) + writer.SaveAlignment(bam); } else { - if (this->noHit) writer.SaveAlignment(bam); + if (_noHit == false) + writer.SaveAlignment(bam); } } else { @@ -253,8 +264,9 @@ void BedIntersect::IntersectBam(string bamFile) { } } + // close the relevant BAM files. reader.Close(); - if (this->bamOutput == true) { + if (_bamOutput == true) { writer.Close(); } } @@ -262,23 +274,29 @@ void BedIntersect::IntersectBam(string bamFile) { void BedIntersect::DetermineBedInput() { - if (bedA->bedFile != "stdin") { // process a file - if (this->bamInput == false) { //bed/gff - ifstream beds(bedA->bedFile.c_str(), ios::in); + // dealing with a proper file + if (_bedA->bedFile != "stdin") { + // it's BED or GFF + if (_bamInput == false) { + ifstream beds(_bedA->bedFile.c_str(), ios::in); if ( !beds ) { - cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl; + cerr << "Error: The requested bed file (" << _bedA->bedFile << ") could not be opened. Exiting!" << endl; exit (1); } IntersectBed(beds); } - else { // bam - IntersectBam(bedA->bedFile); + // it's BAM + else { + IntersectBam(_bedA->bedFile); } } - else { // process stdin - if (this->bamInput == false) { //bed/gff + // reading from stdin + else { + // it's BED or GFF + if (_bamInput == false) { IntersectBed(cin); } + // it's BAM else { IntersectBam("stdin"); } diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index 85e135c4..3d952462 100755 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -13,67 +13,74 @@ #define INTERSECTBED_H #include "bedFile.h" + #include "BamReader.h" #include "BamWriter.h" #include "BamAux.h" using namespace BamTools; + #include <vector> #include <iostream> #include <fstream> #include <stdlib.h> - using namespace std; -//************************************************ -// Class methods and elements -//************************************************ + class BedIntersect { public: // constructor BedIntersect(string bedAFile, string bedBFile, bool anyHit, - bool writeA, bool writeB, bool writeOverlap, float overlapFraction, - bool noHit, bool writeCount, bool forceStrand, bool reciprocal, - bool bamInput, bool bamOutput); + bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, + float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + bool reciprocal, bool bamInput, bool bamOutput); // destructor ~BedIntersect(void); + void DetermineBedInput(); - void reportAIntersect(const BED &, int &, int &); - void reportA(const BED &); - void reportB(const BED &); +private: + + //------------------------------------------------ + // private attributes + //------------------------------------------------ + string _bedAFile; + string _bedBFile; + + bool _writeA; // should the original A feature be reported? + bool _writeB; // should the original B feature be reported? + bool _writeOverlap; + bool _writeAllOverlap; - bool FindOverlaps(const BED &a, vector<BED> &hits); - bool FindOneOrMoreOverlap(const BED &a); + bool _forceStrand; + bool _reciprocal; + float _overlapFraction; + + bool _anyHit; + bool _noHit; + bool _writeCount; // do we want a count of the number of overlaps in B? + + bool _bamInput; + bool _bamOutput; + // instance of a bed file class. + BedFile *_bedA, *_bedB; + + //------------------------------------------------ + // private methods + //------------------------------------------------ void IntersectBed(istream &bedInput); void IntersectBam(string bamFile); - - void DetermineBedInput(); - -private: + bool FindOverlaps(const BED &a, vector<BED> &hits); + bool FindOneOrMoreOverlap(const BED &a); - string bedAFile; - string bedBFile; - string notInBFile; - bool anyHit; - bool writeA; - bool writeB; - bool writeCount; - bool writeOverlap; - bool forceStrand; - bool reciprocal; - float overlapFraction; - bool noHit; - bool bamInput; - bool bamOutput; + void ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, + const int &s, const int &e); + void ReportOverlapSummary(const BED &a, const int &numOverlapsFound); - // instance of a bed file class. - BedFile *bedA, *bedB; - }; #endif /* INTERSECTBED_H */ diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp index fce77147..086cf603 100755 --- a/src/intersectBed/intersectMain.cpp +++ b/src/intersectBed/intersectMain.cpp @@ -44,6 +44,7 @@ int main(int argc, char* argv[]) { bool writeB = false; bool writeCount = false; bool writeOverlap = false; + bool writeAllOverlap = false; bool haveFraction = false; bool reciprocalFraction = false; bool forceStrand = false; @@ -114,6 +115,10 @@ int main(int argc, char* argv[]) { else if(PARAMETER_CHECK("-wo", 3, parameterLength)) { writeOverlap = true; } + else if(PARAMETER_CHECK("-wao", 4, parameterLength)) { + writeAllOverlap = true; + writeOverlap = true; + } else if(PARAMETER_CHECK("-c", 2, parameterLength)) { writeCount = true; } @@ -187,7 +192,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, - overlapFraction, noHit, writeCount, forceStrand, + writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, reciprocalFraction, inputIsBam, outputIsBam); bi->DetermineBedInput(); return 0; @@ -221,8 +226,15 @@ void ShowHelp(void) { cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl; cerr << "\t\tpairs of overlap between the two features." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; - + cerr << "\t\t- Overlaps restricted by -f and -r." << endl; + cerr << "\t\t Only A features with overlap are reported." << endl << endl; + + cerr << "\t-wao\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlapping features restricted by -f and -r." << endl; + cerr << "\t\t However, A features w/o overlap are also reported" << endl << endl; + cerr << "\t\t with a NULL B feature and overlap = 0." << endl << endl; + cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl; cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index c6838214..b72f1196 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -877,3 +877,67 @@ void BedFile::reportBedRangeNewLine(const BED &bed, int start, int end) { } } + + +/* + reportNullBedTab +*/ +void BedFile::reportNullBedTab() { + + if (isGff == false) { + if (this->bedType == 3) { + printf (".\t-1\t-1\t"); + } + else if (this->bedType == 4) { + printf (".\t-1\t-1\t.\t"); + } + else if (this->bedType == 5) { + printf (".\t-1\t-1\t.\t-1\t"); + } + else if (this->bedType == 6) { + printf (".\t-1\t-1\t.\t-1\t.\t"); + } + else if (this->bedType > 6) { + printf (".\t-1\t-1\t.\t-1\t.\t"); + for (unsigned int i = 6; i < this->bedType; ++i) { + printf(".\t"); + } + } + } + else if (this->bedType == 9) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t"); + } +} + + +/* + reportNullBedTab +*/ +void BedFile::reportNullBedNewLine() { + + if (isGff == false) { + if (this->bedType == 3) { + printf (".\t-1\t-1\n"); + } + else if (this->bedType == 4) { + printf (".\t-1\t-1\t.\n"); + } + else if (this->bedType == 5) { + printf (".\t-1\t-1\t.\t-1\n"); + } + else if (this->bedType == 6) { + printf (".\t-1\t-1\t.\t-1\t.\n"); + } + else if (this->bedType > 6) { + printf (".\t-1\t-1\t.\t-1\t.\n"); + for (unsigned int i = 6; i < this->bedType; ++i) { + printf(".\t"); + } + printf("\n"); + } + } + else if (this->bedType == 9) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n"); + } +} + diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index a0a916a5..86eef4cb 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -176,6 +176,8 @@ public: void reportBedNewLine(const BED &); void reportBedRangeTab(const BED &bed, int start, int end); void reportBedRangeNewLine(const BED &bed, int start, int end); + void reportNullBedTab(void); + void reportNullBedNewLine(void); // parse an input line and determine how it should be handled bool parseLine (BED &bed, const vector<string> &lineVector, int &lineNum); -- GitLab