From 272344949d8ecd905b68665e61f5e469c2158cfc Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Mon, 12 Apr 2010 09:58:29 -0400
Subject: [PATCH] Added the -wao option to intersectBed.

	- Allows one to report 0 overlap for non-overlapping A features
	- Changed all private memeber variables to use "_VAR" notation.
	- Added convenience functions for reporting overlaps.
---
 src/intersectBed/intersectBed.cpp  | 226 ++++++++++++++++-------------
 src/intersectBed/intersectBed.h    |  73 +++++-----
 src/intersectBed/intersectMain.cpp |  18 ++-
 src/utils/bedFile/bedFile.cpp      |  64 ++++++++
 src/utils/bedFile/bedFile.h        |   2 +
 5 files changed, 243 insertions(+), 140 deletions(-)

diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp
index b8caf0f6..3191a9ae 100755
--- a/src/intersectBed/intersectBed.cpp
+++ b/src/intersectBed/intersectBed.cpp
@@ -17,27 +17,28 @@
 	Constructor
 */
 BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, 
-						   bool writeA, bool writeB, bool writeOverlap, float overlapFraction, 
-						   bool noHit, bool writeCount, bool forceStrand, bool reciprocal,
-						   bool bamInput, bool bamOutput) {
+						   bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
+						   float overlapFraction, bool noHit, bool writeCount, bool forceStrand, 
+						   bool reciprocal, bool bamInput, bool bamOutput) {
 
-	this->bedAFile        = bedAFile;
-	this->bedBFile        = bedBFile;
-	this->anyHit          = anyHit;
-	this->noHit           = noHit;
-	this->writeA          = writeA;	
-	this->writeB          = writeB;
-	this->writeOverlap    = writeOverlap;	
-	this->writeCount      = writeCount;
-	this->overlapFraction = overlapFraction;
-	this->forceStrand     = forceStrand;
-	this->reciprocal      = reciprocal;
-	this->bamInput        = bamInput;
-	this->bamOutput       = bamOutput;
+	_bedAFile            = bedAFile;
+	_bedBFile            = bedBFile;
+	_anyHit              = anyHit;
+	_noHit               = noHit;
+	_writeA              = writeA;	
+	_writeB              = writeB;
+	_writeOverlap        = writeOverlap;
+	_writeAllOverlap     = writeAllOverlap;		
+	_writeCount          = writeCount;
+	_overlapFraction     = overlapFraction;
+	_forceStrand         = forceStrand;
+	_reciprocal          = reciprocal;
+	_bamInput            = bamInput;
+	_bamOutput           = bamOutput;
 	
 	// create new BED file objects for A and B
-	this->bedA = new BedFile(bedAFile);
-	this->bedB = new BedFile(bedBFile);
+	_bedA = new BedFile(bedAFile);
+	_bedB = new BedFile(bedBFile);
 }
 
 
@@ -53,109 +54,117 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) {
 	bool hitsFound = false;
 	
 	// grab _all_ of the features in B that overlap with a.
-	bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, this->forceStrand); 
+	_bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); 
 	
 	// how many overlaps are there b/w a and B?
 	int numOverlaps = 0;		
 	
 	// should we print each overlap, or does the user want summary information?
 	bool printable = true;			
-	if (anyHit || noHit || writeCount) {
+	if (_anyHit || _noHit || _writeCount)
 		printable = false;
-	}
 	
 	// loop through the hits and report those that meet the user's criteria
 	vector<BED>::const_iterator h = hits.begin();
 	vector<BED>::const_iterator hitsEnd = hits.end();
 	for (; h != hitsEnd; ++h) {
 		
-		int s = max(a.start, h->start);
-		int e = min(a.end, h->end);
+		int s            = max(a.start, h->start);
+		int e            = min(a.end, h->end);
 		int overlapBases = (e - s);				// the number of overlapping bases b/w a and b
-		int aLength = (a.end - a.start);		// the length of a in b.p.
+		int aLength      = (a.end - a.start);   // the length of a in b.p.
 		
 		// is there enough overlap relative to the user's request? (default ~ 1bp)
-		if ( ( (float) overlapBases / (float) aLength ) >= this->overlapFraction ) { 
+		if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { 
 		
 			// Report the hit if the user doesn't care about reciprocal overlap between A and B.
-			if (!reciprocal) {
-				
+			if (_reciprocal == false) {
 				hitsFound = true;
-				numOverlaps++;		// we have another hit for A
-
-				if (printable == true) {
-					if (writeA == false && writeB == false && writeOverlap == false) {
-						bedA->reportBedRangeNewLine(a,s,e);
-					}
-					else if (writeA == true && writeB == true) {
-						bedA->reportBedTab(a);
-						bedB->reportBedNewLine(*h);
-					}
-					else if (writeA == true) {
-						bedA->reportBedNewLine(a);
-					}
-					else if (writeB == true) {
-						bedA->reportBedRangeTab(a,s,e);
-						bedB->reportBedNewLine(*h);
-					}
-					else if (writeOverlap == true) {
-						bedA->reportBedTab(a);
-						bedB->reportBedTab(*h);
-						printf("%d\n", overlapBases);
-					}
-				}
-				
+				numOverlaps++;
 			}
-			else {			// the user wants there to be sufficient reciprocal overlap
-				int bLength = (h->end - h->start);
+			// we require there to be sufficient __reciprocal__ overlap
+			else {			
+				int bLength    = (h->end - h->start);
 				float bOverlap = ( (float) overlapBases / (float) bLength );
-			
-				if (bOverlap >= this->overlapFraction) {
+				if (bOverlap >= _overlapFraction) {
 					hitsFound = true;
-					numOverlaps++;		// we have another hit for A
-				
-					if (!writeB && printable) {
-						if (writeA) bedA->reportBedNewLine(a);
-						else bedA->reportBedRangeNewLine(a,s,e);
-					}
-					else if (printable) {
-						if (writeA) {
-							bedA->reportBedTab(a);
-							bedB->reportBedNewLine(*h);
-						}
-						else {
-							bedA->reportBedRangeTab(a,s,e);
-							bedB->reportBedNewLine(*h);									
-						}
-					}
+					numOverlaps++;
 				}
 			}
+			// report the overlap per the user's request.
+			if (printable == true) {
+				ReportOverlapDetail(overlapBases, a, *h, s, e);
+			}
 		}
 	}
-	if (anyHit && (numOverlaps >= 1)) {
-		bedA->reportBedNewLine(a);
+	// report the summary of the overlaps if requested.
+	ReportOverlapSummary(a, numOverlaps);
+	// were hits found for this BED feature?
+	return hitsFound;
+}
+
+
+void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b,
+									   const int &s, const int &e) {
+	// simple intersection only
+	if (_writeA == false && _writeB == false && _writeOverlap == false) {
+		_bedA->reportBedRangeNewLine(a,s,e);
+	}
+	// write the original A and B 
+	else if (_writeA == true && _writeB == true) {
+		_bedA->reportBedTab(a);
+		_bedB->reportBedNewLine(b);
 	}
-	else if (writeCount) {
-		bedA->reportBedTab(a); 
-		printf("%d\n", numOverlaps);
+	// write just the original A
+	else if (_writeA == true) {
+		_bedA->reportBedNewLine(a);
 	}
-	else if (noHit && (numOverlaps == 0)) {
-		bedA->reportBedNewLine(a);
+	// write the intersected portion of A and the original B 
+	else if (_writeB == true) {
+		_bedA->reportBedRangeTab(a,s,e);
+		_bedB->reportBedNewLine(b);
 	}
+	// write the original A and B plus the no. of overlapping bases.
+	else if (_writeOverlap == true) {
+		_bedA->reportBedTab(a);
+		_bedB->reportBedTab(b);
+		printf("%d\n", overlapBases);
+	}
+}
 
-	return hitsFound;
+
+void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) {
+	// -u  just report the fact that there was >= 1 overlaps
+	if (_anyHit && (numOverlapsFound >= 1)) {
+		_bedA->reportBedNewLine(a);
+	}
+	// -c  report the total number of features overlapped in B
+	else if (_writeCount) {
+		_bedA->reportBedTab(a); 
+		printf("%d\n", numOverlapsFound);
+	}
+	// -v  report iff there were no overlaps
+	else if (_noHit && (numOverlapsFound == 0)) {
+		_bedA->reportBedNewLine(a);
+	}
+	// -wao the user wants to force the reporting of 0 overlap
+	else if (_writeAllOverlap && (numOverlapsFound == 0)) {
+		_bedA->reportBedTab(a);
+		_bedB->reportNullBedTab();
+		printf("0\n");
+	}
 }
 
 
 bool BedIntersect::FindOneOrMoreOverlap(const BED &a) {
 	bool overlapsFound;
-	if (this->reciprocal == false) {
-		overlapsFound = bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, 
-			this->forceStrand, this->overlapFraction); 
+	if (_reciprocal == false) {
+		overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, 
+			                                              _forceStrand, _overlapFraction); 
 	}
 	else {
-		overlapsFound = bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, 
-			this->forceStrand, this->overlapFraction);
+		overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, 
+			                                                        _forceStrand, _overlapFraction);
 	}
 	return overlapsFound;
 }
@@ -165,7 +174,7 @@ void BedIntersect::IntersectBed(istream &bedInput) {
 
 	// load the "B" bed file into a map so
 	// that we can easily compare "A" to it for overlaps
-	bedB->loadBedFileIntoMap();
+	_bedB->loadBedFileIntoMap();
 	string bedLine;                                                                                                                    
 	int lineNum = 0;					// current input line number
 	vector<BED> hits;					// vector of potential hits
@@ -182,7 +191,7 @@ void BedIntersect::IntersectBed(istream &bedInput) {
 		Tokenize(bedLine,bedFields);
 		BED a;
 		// find the overlaps with B if it's a valid BED entry. 
-		if (bedA->parseLine(a, bedFields, lineNum)) {
+		if (_bedA->parseLine(a, bedFields, lineNum)) {
 			FindOverlaps(a, hits);
 			hits.clear();
 		}
@@ -196,7 +205,7 @@ void BedIntersect::IntersectBam(string bamFile) {
 
 	// load the "B" bed file into a map so
 	// that we can easily compare "A" to it for overlaps
-	bedB->loadBedFileIntoMap();
+	_bedB->loadBedFileIntoMap();
 	
 	// open the BAM file
 	BamReader reader;
@@ -204,11 +213,11 @@ void BedIntersect::IntersectBam(string bamFile) {
 	reader.Open(bamFile);
 
 	// get header & reference information
-	string header = reader.GetHeaderText();
+	string header  = reader.GetHeaderText();
 	RefVector refs = reader.GetReferenceData();
 
 	// open a BAM output to stdout if we are writing BAM
-	if (this->bamOutput == true) {
+	if (_bamOutput == true) {
 		// open our BAM writer
 		writer.Open("stdout", header, refs);
 	}
@@ -217,7 +226,7 @@ void BedIntersect::IntersectBam(string bamFile) {
 	// reserve some space
 	hits.reserve(100);
 	
-	bedA->bedType = 6;
+	_bedA->bedType = 6;
 	BamAlignment bam;	
 	bool overlapsFound;
 	// get each set of alignments for each pair.
@@ -227,23 +236,25 @@ void BedIntersect::IntersectBam(string bamFile) {
 			BED a;
 			a.chrom = refs.at(bam.RefID).RefName;
 			a.start = bam.Position;
-			a.end = bam.Position + bam.AlignedBases.size();
+			a.end   = bam.Position + bam.AlignedBases.size();
 
 			// build the name field from the BAM alignment.
 			a.name = bam.Name;
 			if (bam.IsFirstMate()) a.name += "/1";
 			if (bam.IsSecondMate()) a.name += "/2";
 
-			a.score = ToString(bam.MapQuality);
+			a.score  = ToString(bam.MapQuality);
 			a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; 
 	
-			if (this->bamOutput == true) {
+			if (_bamOutput == true) {
 				overlapsFound = FindOneOrMoreOverlap(a);
 				if (overlapsFound == true) {
-					if (!this->noHit) writer.SaveAlignment(bam);
+					if (_noHit == false) 
+						writer.SaveAlignment(bam);
 				}
 				else {
-					if (this->noHit) writer.SaveAlignment(bam);	
+					if (_noHit == false) 
+						writer.SaveAlignment(bam);	
 				}
 			}
 			else {
@@ -253,8 +264,9 @@ void BedIntersect::IntersectBam(string bamFile) {
 		}
 	}
 	
+	// close the relevant BAM files.
 	reader.Close();
-	if (this->bamOutput == true) {
+	if (_bamOutput == true) {
 		writer.Close();
 	}
 }
@@ -262,23 +274,29 @@ void BedIntersect::IntersectBam(string bamFile) {
 
 void BedIntersect::DetermineBedInput() {
 	
-	if (bedA->bedFile != "stdin") {   // process a file
-		if (this->bamInput == false) { //bed/gff
-			ifstream beds(bedA->bedFile.c_str(), ios::in);
+	// dealing with a proper file
+	if (_bedA->bedFile != "stdin") {   
+		// it's BED or GFF
+		if (_bamInput == false) { 
+			ifstream beds(_bedA->bedFile.c_str(), ios::in);
 			if ( !beds ) {
-				cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl;
+				cerr << "Error: The requested bed file (" << _bedA->bedFile << ") could not be opened. Exiting!" << endl;
 				exit (1);
 			}
 			IntersectBed(beds);
 		}
-		else {	// bam
-			IntersectBam(bedA->bedFile);
+		// it's BAM
+		else {
+			IntersectBam(_bedA->bedFile);
 		}
 	}
-	else {   // process stdin
-		if (this->bamInput == false) {	//bed/gff					
+	// reading from stdin
+	else {  
+		// it's BED or GFF 
+		if (_bamInput == false) {					
 			IntersectBed(cin);
 		}
+		// it's BAM
 		else {
 			IntersectBam("stdin");
 		}				
diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h
index 85e135c4..3d952462 100755
--- a/src/intersectBed/intersectBed.h
+++ b/src/intersectBed/intersectBed.h
@@ -13,67 +13,74 @@
 #define INTERSECTBED_H
 
 #include "bedFile.h"
+
 #include "BamReader.h"
 #include "BamWriter.h"
 #include "BamAux.h"
 using namespace BamTools;
 
+
 #include <vector>
 #include <iostream>
 #include <fstream>
 #include <stdlib.h>
-
 using namespace std;
 
-//************************************************
-// Class methods and elements
-//************************************************
+
 class BedIntersect {
 
 public:
 
 	// constructor 
 	BedIntersect(string bedAFile, string bedBFile, bool anyHit, 
-							   bool writeA, bool writeB, bool writeOverlap, float overlapFraction, 
-							   bool noHit, bool writeCount, bool forceStrand, bool reciprocal,
-							   bool bamInput, bool bamOutput);
+							   bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
+							   float overlapFraction, bool noHit, bool writeCount, bool forceStrand, 
+							   bool reciprocal, bool bamInput, bool bamOutput);
 
 	// destructor
 	~BedIntersect(void);
+	void DetermineBedInput();
 	
-	void reportAIntersect(const BED &, int &, int &);	
-	void reportA(const BED &);
-	void reportB(const BED &);
+private:
+	
+	//------------------------------------------------
+	// private attributes
+	//------------------------------------------------
+	string _bedAFile;
+	string _bedBFile;
+	
+	bool  _writeA;            // should the original A feature be reported?
+	bool  _writeB;            // should the original B feature be reported?
+	bool  _writeOverlap;
+	bool  _writeAllOverlap;
 
-	bool FindOverlaps(const BED &a, vector<BED> &hits);
-	bool FindOneOrMoreOverlap(const BED &a);
+	bool  _forceStrand;
+	bool  _reciprocal;
+	float _overlapFraction;
+
+	bool  _anyHit;
+	bool  _noHit;
+	bool  _writeCount;        // do we want a count of the number of overlaps in B?
+
+	bool  _bamInput;
+	bool  _bamOutput;
 	
+	// instance of a bed file class.
+	BedFile *_bedA, *_bedB;
+
+	//------------------------------------------------
+	// private methods
+	//------------------------------------------------
 	void IntersectBed(istream &bedInput);
 	void IntersectBam(string bamFile);
-		
-	void DetermineBedInput();
-	
 	
-private:
+	bool FindOverlaps(const BED &a, vector<BED> &hits);
+	bool FindOneOrMoreOverlap(const BED &a);
 
-	string bedAFile;
-	string bedBFile;
-	string notInBFile;
-	bool anyHit;
-	bool writeA;
-	bool writeB;
-	bool writeCount;
-	bool writeOverlap;
-	bool forceStrand;
-	bool reciprocal;
-	float overlapFraction;
-	bool noHit;
-	bool bamInput;
-	bool bamOutput;
+	void ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b,
+		                     const int &s, const int &e);
+	void ReportOverlapSummary(const BED &a, const int &numOverlapsFound);
 	
-	// instance of a bed file class.
-	BedFile *bedA, *bedB;
-
 };
 
 #endif /* INTERSECTBED_H */
diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp
index fce77147..086cf603 100755
--- a/src/intersectBed/intersectMain.cpp
+++ b/src/intersectBed/intersectMain.cpp
@@ -44,6 +44,7 @@ int main(int argc, char* argv[]) {
 	bool writeB             = false;
 	bool writeCount         = false;
 	bool writeOverlap       = false;
+	bool writeAllOverlap    = false;
 	bool haveFraction       = false;
 	bool reciprocalFraction = false;
 	bool forceStrand        = false;
@@ -114,6 +115,10 @@ int main(int argc, char* argv[]) {
 		else if(PARAMETER_CHECK("-wo", 3, parameterLength)) {
 			writeOverlap = true;
 		}
+		else if(PARAMETER_CHECK("-wao", 4, parameterLength)) {
+			writeAllOverlap = true;
+			writeOverlap = true;
+		}
 		else if(PARAMETER_CHECK("-c", 2, parameterLength)) {
 			writeCount = true;
 		}
@@ -187,7 +192,7 @@ int main(int argc, char* argv[]) {
 	if (!showHelp) {
 
 		BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap,
-											overlapFraction, noHit, writeCount, forceStrand, 
+											writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, 
 											reciprocalFraction, inputIsBam, outputIsBam);
 		bi->DetermineBedInput();
 		return 0;
@@ -221,8 +226,15 @@ void ShowHelp(void) {
 
 	cerr << "\t-wo\t"			<< "Write the original A and B entries plus the number of base" << endl;
 	cerr 						<< "\t\tpairs of overlap between the two features." << endl;
-	cerr						<< "\t\t- Overlaps restricted by -f and -r." << endl << endl;
-		
+	cerr						<< "\t\t- Overlaps restricted by -f and -r." << endl;
+	cerr						<< "\t\t  Only A features with overlap are reported." << endl << endl;
+
+	cerr << "\t-wao\t"			<< "Write the original A and B entries plus the number of base" << endl;
+	cerr 						<< "\t\tpairs of overlap between the two features." << endl;
+	cerr						<< "\t\t- Overlapping features restricted by -f and -r." << endl;
+	cerr						<< "\t\t  However, A features w/o overlap are also reported" << endl << endl;
+	cerr						<< "\t\t  with a NULL B feature and overlap = 0." << endl << endl;
+					
 	cerr << "\t-u\t"      		<< "Write the original A entry _once_ if _any_ overlaps found in B." << endl;
 	cerr 						<< "\t\t- In other words, just report the fact >=1 hit was found." << endl;
 	cerr						<< "\t\t- Overlaps restricted by -f and -r." << endl << endl;
diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp
index c6838214..b72f1196 100755
--- a/src/utils/bedFile/bedFile.cpp
+++ b/src/utils/bedFile/bedFile.cpp
@@ -877,3 +877,67 @@ void BedFile::reportBedRangeNewLine(const BED &bed, int start, int end) {
 
 	}
 }
+
+
+/*
+	reportNullBedTab
+*/
+void BedFile::reportNullBedTab() {
+	
+	if (isGff == false) {
+		if (this->bedType == 3) {
+			printf (".\t-1\t-1\t");
+		}
+		else if (this->bedType == 4) {
+			printf (".\t-1\t-1\t.\t");
+		}
+		else if (this->bedType == 5) {
+			printf (".\t-1\t-1\t.\t-1\t");
+		}
+		else if (this->bedType == 6) {
+			printf (".\t-1\t-1\t.\t-1\t.\t");
+		}
+		else if (this->bedType > 6) {
+			printf (".\t-1\t-1\t.\t-1\t.\t");
+			for (unsigned int i = 6; i < this->bedType; ++i) {
+				printf(".\t");
+			}
+		}
+	}	
+	else if (this->bedType == 9) {
+		printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t");
+	}
+}
+
+
+/*
+	reportNullBedTab
+*/
+void BedFile::reportNullBedNewLine() {
+	
+	if (isGff == false) {
+		if (this->bedType == 3) {
+			printf (".\t-1\t-1\n");
+		}
+		else if (this->bedType == 4) {
+			printf (".\t-1\t-1\t.\n");
+		}
+		else if (this->bedType == 5) {
+			printf (".\t-1\t-1\t.\t-1\n");
+		}
+		else if (this->bedType == 6) {
+			printf (".\t-1\t-1\t.\t-1\t.\n");
+		}
+		else if (this->bedType > 6) {
+			printf (".\t-1\t-1\t.\t-1\t.\n");
+			for (unsigned int i = 6; i < this->bedType; ++i) {
+				printf(".\t");
+			}
+			printf("\n");
+		}
+	}	
+	else if (this->bedType == 9) {
+		printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n");
+	}
+}
+
diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h
index a0a916a5..86eef4cb 100755
--- a/src/utils/bedFile/bedFile.h
+++ b/src/utils/bedFile/bedFile.h
@@ -176,6 +176,8 @@ public:
 	void reportBedNewLine(const BED &);		
 	void reportBedRangeTab(const BED &bed, int start, int end);
 	void reportBedRangeNewLine(const BED &bed, int start, int end);
+	void reportNullBedTab(void);
+	void reportNullBedNewLine(void);
 	
 	// parse an input line and determine how it should be handled
 	bool parseLine (BED &bed, const vector<string> &lineVector, int &lineNum);
-- 
GitLab