Skip to content
Snippets Groups Projects
Commit 27234494 authored by Aaron's avatar Aaron
Browse files

Added the -wao option to intersectBed.

	- Allows one to report 0 overlap for non-overlapping A features
	- Changed all private memeber variables to use "_VAR" notation.
	- Added convenience functions for reporting overlaps.
parent 996ff6bd
No related branches found
No related tags found
No related merge requests found
......@@ -17,27 +17,28 @@
Constructor
*/
BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit,
bool writeA, bool writeB, bool writeOverlap, float overlapFraction,
bool noHit, bool writeCount, bool forceStrand, bool reciprocal,
bool bamInput, bool bamOutput) {
bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
float overlapFraction, bool noHit, bool writeCount, bool forceStrand,
bool reciprocal, bool bamInput, bool bamOutput) {
this->bedAFile = bedAFile;
this->bedBFile = bedBFile;
this->anyHit = anyHit;
this->noHit = noHit;
this->writeA = writeA;
this->writeB = writeB;
this->writeOverlap = writeOverlap;
this->writeCount = writeCount;
this->overlapFraction = overlapFraction;
this->forceStrand = forceStrand;
this->reciprocal = reciprocal;
this->bamInput = bamInput;
this->bamOutput = bamOutput;
_bedAFile = bedAFile;
_bedBFile = bedBFile;
_anyHit = anyHit;
_noHit = noHit;
_writeA = writeA;
_writeB = writeB;
_writeOverlap = writeOverlap;
_writeAllOverlap = writeAllOverlap;
_writeCount = writeCount;
_overlapFraction = overlapFraction;
_forceStrand = forceStrand;
_reciprocal = reciprocal;
_bamInput = bamInput;
_bamOutput = bamOutput;
// create new BED file objects for A and B
this->bedA = new BedFile(bedAFile);
this->bedB = new BedFile(bedBFile);
_bedA = new BedFile(bedAFile);
_bedB = new BedFile(bedBFile);
}
......@@ -53,109 +54,117 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) {
bool hitsFound = false;
// grab _all_ of the features in B that overlap with a.
bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, this->forceStrand);
_bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand);
// how many overlaps are there b/w a and B?
int numOverlaps = 0;
// should we print each overlap, or does the user want summary information?
bool printable = true;
if (anyHit || noHit || writeCount) {
if (_anyHit || _noHit || _writeCount)
printable = false;
}
// loop through the hits and report those that meet the user's criteria
vector<BED>::const_iterator h = hits.begin();
vector<BED>::const_iterator hitsEnd = hits.end();
for (; h != hitsEnd; ++h) {
int s = max(a.start, h->start);
int e = min(a.end, h->end);
int s = max(a.start, h->start);
int e = min(a.end, h->end);
int overlapBases = (e - s); // the number of overlapping bases b/w a and b
int aLength = (a.end - a.start); // the length of a in b.p.
int aLength = (a.end - a.start); // the length of a in b.p.
// is there enough overlap relative to the user's request? (default ~ 1bp)
if ( ( (float) overlapBases / (float) aLength ) >= this->overlapFraction ) {
if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
// Report the hit if the user doesn't care about reciprocal overlap between A and B.
if (!reciprocal) {
if (_reciprocal == false) {
hitsFound = true;
numOverlaps++; // we have another hit for A
if (printable == true) {
if (writeA == false && writeB == false && writeOverlap == false) {
bedA->reportBedRangeNewLine(a,s,e);
}
else if (writeA == true && writeB == true) {
bedA->reportBedTab(a);
bedB->reportBedNewLine(*h);
}
else if (writeA == true) {
bedA->reportBedNewLine(a);
}
else if (writeB == true) {
bedA->reportBedRangeTab(a,s,e);
bedB->reportBedNewLine(*h);
}
else if (writeOverlap == true) {
bedA->reportBedTab(a);
bedB->reportBedTab(*h);
printf("%d\n", overlapBases);
}
}
numOverlaps++;
}
else { // the user wants there to be sufficient reciprocal overlap
int bLength = (h->end - h->start);
// we require there to be sufficient __reciprocal__ overlap
else {
int bLength = (h->end - h->start);
float bOverlap = ( (float) overlapBases / (float) bLength );
if (bOverlap >= this->overlapFraction) {
if (bOverlap >= _overlapFraction) {
hitsFound = true;
numOverlaps++; // we have another hit for A
if (!writeB && printable) {
if (writeA) bedA->reportBedNewLine(a);
else bedA->reportBedRangeNewLine(a,s,e);
}
else if (printable) {
if (writeA) {
bedA->reportBedTab(a);
bedB->reportBedNewLine(*h);
}
else {
bedA->reportBedRangeTab(a,s,e);
bedB->reportBedNewLine(*h);
}
}
numOverlaps++;
}
}
// report the overlap per the user's request.
if (printable == true) {
ReportOverlapDetail(overlapBases, a, *h, s, e);
}
}
}
if (anyHit && (numOverlaps >= 1)) {
bedA->reportBedNewLine(a);
// report the summary of the overlaps if requested.
ReportOverlapSummary(a, numOverlaps);
// were hits found for this BED feature?
return hitsFound;
}
void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b,
const int &s, const int &e) {
// simple intersection only
if (_writeA == false && _writeB == false && _writeOverlap == false) {
_bedA->reportBedRangeNewLine(a,s,e);
}
// write the original A and B
else if (_writeA == true && _writeB == true) {
_bedA->reportBedTab(a);
_bedB->reportBedNewLine(b);
}
else if (writeCount) {
bedA->reportBedTab(a);
printf("%d\n", numOverlaps);
// write just the original A
else if (_writeA == true) {
_bedA->reportBedNewLine(a);
}
else if (noHit && (numOverlaps == 0)) {
bedA->reportBedNewLine(a);
// write the intersected portion of A and the original B
else if (_writeB == true) {
_bedA->reportBedRangeTab(a,s,e);
_bedB->reportBedNewLine(b);
}
// write the original A and B plus the no. of overlapping bases.
else if (_writeOverlap == true) {
_bedA->reportBedTab(a);
_bedB->reportBedTab(b);
printf("%d\n", overlapBases);
}
}
return hitsFound;
void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) {
// -u just report the fact that there was >= 1 overlaps
if (_anyHit && (numOverlapsFound >= 1)) {
_bedA->reportBedNewLine(a);
}
// -c report the total number of features overlapped in B
else if (_writeCount) {
_bedA->reportBedTab(a);
printf("%d\n", numOverlapsFound);
}
// -v report iff there were no overlaps
else if (_noHit && (numOverlapsFound == 0)) {
_bedA->reportBedNewLine(a);
}
// -wao the user wants to force the reporting of 0 overlap
else if (_writeAllOverlap && (numOverlapsFound == 0)) {
_bedA->reportBedTab(a);
_bedB->reportNullBedTab();
printf("0\n");
}
}
bool BedIntersect::FindOneOrMoreOverlap(const BED &a) {
bool overlapsFound;
if (this->reciprocal == false) {
overlapsFound = bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
this->forceStrand, this->overlapFraction);
if (_reciprocal == false) {
overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
_forceStrand, _overlapFraction);
}
else {
overlapsFound = bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
this->forceStrand, this->overlapFraction);
overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
_forceStrand, _overlapFraction);
}
return overlapsFound;
}
......@@ -165,7 +174,7 @@ void BedIntersect::IntersectBed(istream &bedInput) {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bedB->loadBedFileIntoMap();
_bedB->loadBedFileIntoMap();
string bedLine;
int lineNum = 0; // current input line number
vector<BED> hits; // vector of potential hits
......@@ -182,7 +191,7 @@ void BedIntersect::IntersectBed(istream &bedInput) {
Tokenize(bedLine,bedFields);
BED a;
// find the overlaps with B if it's a valid BED entry.
if (bedA->parseLine(a, bedFields, lineNum)) {
if (_bedA->parseLine(a, bedFields, lineNum)) {
FindOverlaps(a, hits);
hits.clear();
}
......@@ -196,7 +205,7 @@ void BedIntersect::IntersectBam(string bamFile) {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bedB->loadBedFileIntoMap();
_bedB->loadBedFileIntoMap();
// open the BAM file
BamReader reader;
......@@ -204,11 +213,11 @@ void BedIntersect::IntersectBam(string bamFile) {
reader.Open(bamFile);
// get header & reference information
string header = reader.GetHeaderText();
string header = reader.GetHeaderText();
RefVector refs = reader.GetReferenceData();
// open a BAM output to stdout if we are writing BAM
if (this->bamOutput == true) {
if (_bamOutput == true) {
// open our BAM writer
writer.Open("stdout", header, refs);
}
......@@ -217,7 +226,7 @@ void BedIntersect::IntersectBam(string bamFile) {
// reserve some space
hits.reserve(100);
bedA->bedType = 6;
_bedA->bedType = 6;
BamAlignment bam;
bool overlapsFound;
// get each set of alignments for each pair.
......@@ -227,23 +236,25 @@ void BedIntersect::IntersectBam(string bamFile) {
BED a;
a.chrom = refs.at(bam.RefID).RefName;
a.start = bam.Position;
a.end = bam.Position + bam.AlignedBases.size();
a.end = bam.Position + bam.AlignedBases.size();
// build the name field from the BAM alignment.
a.name = bam.Name;
if (bam.IsFirstMate()) a.name += "/1";
if (bam.IsSecondMate()) a.name += "/2";
a.score = ToString(bam.MapQuality);
a.score = ToString(bam.MapQuality);
a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-";
if (this->bamOutput == true) {
if (_bamOutput == true) {
overlapsFound = FindOneOrMoreOverlap(a);
if (overlapsFound == true) {
if (!this->noHit) writer.SaveAlignment(bam);
if (_noHit == false)
writer.SaveAlignment(bam);
}
else {
if (this->noHit) writer.SaveAlignment(bam);
if (_noHit == false)
writer.SaveAlignment(bam);
}
}
else {
......@@ -253,8 +264,9 @@ void BedIntersect::IntersectBam(string bamFile) {
}
}
// close the relevant BAM files.
reader.Close();
if (this->bamOutput == true) {
if (_bamOutput == true) {
writer.Close();
}
}
......@@ -262,23 +274,29 @@ void BedIntersect::IntersectBam(string bamFile) {
void BedIntersect::DetermineBedInput() {
if (bedA->bedFile != "stdin") { // process a file
if (this->bamInput == false) { //bed/gff
ifstream beds(bedA->bedFile.c_str(), ios::in);
// dealing with a proper file
if (_bedA->bedFile != "stdin") {
// it's BED or GFF
if (_bamInput == false) {
ifstream beds(_bedA->bedFile.c_str(), ios::in);
if ( !beds ) {
cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl;
cerr << "Error: The requested bed file (" << _bedA->bedFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
IntersectBed(beds);
}
else { // bam
IntersectBam(bedA->bedFile);
// it's BAM
else {
IntersectBam(_bedA->bedFile);
}
}
else { // process stdin
if (this->bamInput == false) { //bed/gff
// reading from stdin
else {
// it's BED or GFF
if (_bamInput == false) {
IntersectBed(cin);
}
// it's BAM
else {
IntersectBam("stdin");
}
......
......@@ -13,67 +13,74 @@
#define INTERSECTBED_H
#include "bedFile.h"
#include "BamReader.h"
#include "BamWriter.h"
#include "BamAux.h"
using namespace BamTools;
#include <vector>
#include <iostream>
#include <fstream>
#include <stdlib.h>
using namespace std;
//************************************************
// Class methods and elements
//************************************************
class BedIntersect {
public:
// constructor
BedIntersect(string bedAFile, string bedBFile, bool anyHit,
bool writeA, bool writeB, bool writeOverlap, float overlapFraction,
bool noHit, bool writeCount, bool forceStrand, bool reciprocal,
bool bamInput, bool bamOutput);
bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
float overlapFraction, bool noHit, bool writeCount, bool forceStrand,
bool reciprocal, bool bamInput, bool bamOutput);
// destructor
~BedIntersect(void);
void DetermineBedInput();
void reportAIntersect(const BED &, int &, int &);
void reportA(const BED &);
void reportB(const BED &);
private:
//------------------------------------------------
// private attributes
//------------------------------------------------
string _bedAFile;
string _bedBFile;
bool _writeA; // should the original A feature be reported?
bool _writeB; // should the original B feature be reported?
bool _writeOverlap;
bool _writeAllOverlap;
bool FindOverlaps(const BED &a, vector<BED> &hits);
bool FindOneOrMoreOverlap(const BED &a);
bool _forceStrand;
bool _reciprocal;
float _overlapFraction;
bool _anyHit;
bool _noHit;
bool _writeCount; // do we want a count of the number of overlaps in B?
bool _bamInput;
bool _bamOutput;
// instance of a bed file class.
BedFile *_bedA, *_bedB;
//------------------------------------------------
// private methods
//------------------------------------------------
void IntersectBed(istream &bedInput);
void IntersectBam(string bamFile);
void DetermineBedInput();
private:
bool FindOverlaps(const BED &a, vector<BED> &hits);
bool FindOneOrMoreOverlap(const BED &a);
string bedAFile;
string bedBFile;
string notInBFile;
bool anyHit;
bool writeA;
bool writeB;
bool writeCount;
bool writeOverlap;
bool forceStrand;
bool reciprocal;
float overlapFraction;
bool noHit;
bool bamInput;
bool bamOutput;
void ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b,
const int &s, const int &e);
void ReportOverlapSummary(const BED &a, const int &numOverlapsFound);
// instance of a bed file class.
BedFile *bedA, *bedB;
};
#endif /* INTERSECTBED_H */
......@@ -44,6 +44,7 @@ int main(int argc, char* argv[]) {
bool writeB = false;
bool writeCount = false;
bool writeOverlap = false;
bool writeAllOverlap = false;
bool haveFraction = false;
bool reciprocalFraction = false;
bool forceStrand = false;
......@@ -114,6 +115,10 @@ int main(int argc, char* argv[]) {
else if(PARAMETER_CHECK("-wo", 3, parameterLength)) {
writeOverlap = true;
}
else if(PARAMETER_CHECK("-wao", 4, parameterLength)) {
writeAllOverlap = true;
writeOverlap = true;
}
else if(PARAMETER_CHECK("-c", 2, parameterLength)) {
writeCount = true;
}
......@@ -187,7 +192,7 @@ int main(int argc, char* argv[]) {
if (!showHelp) {
BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap,
overlapFraction, noHit, writeCount, forceStrand,
writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand,
reciprocalFraction, inputIsBam, outputIsBam);
bi->DetermineBedInput();
return 0;
......@@ -221,8 +226,15 @@ void ShowHelp(void) {
cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl;
cerr << "\t\tpairs of overlap between the two features." << endl;
cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl;
cerr << "\t\t- Overlaps restricted by -f and -r." << endl;
cerr << "\t\t Only A features with overlap are reported." << endl << endl;
cerr << "\t-wao\t" << "Write the original A and B entries plus the number of base" << endl;
cerr << "\t\tpairs of overlap between the two features." << endl;
cerr << "\t\t- Overlapping features restricted by -f and -r." << endl;
cerr << "\t\t However, A features w/o overlap are also reported" << endl << endl;
cerr << "\t\t with a NULL B feature and overlap = 0." << endl << endl;
cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl;
cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl;
cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl;
......
......@@ -877,3 +877,67 @@ void BedFile::reportBedRangeNewLine(const BED &bed, int start, int end) {
}
}
/*
reportNullBedTab
*/
void BedFile::reportNullBedTab() {
if (isGff == false) {
if (this->bedType == 3) {
printf (".\t-1\t-1\t");
}
else if (this->bedType == 4) {
printf (".\t-1\t-1\t.\t");
}
else if (this->bedType == 5) {
printf (".\t-1\t-1\t.\t-1\t");
}
else if (this->bedType == 6) {
printf (".\t-1\t-1\t.\t-1\t.\t");
}
else if (this->bedType > 6) {
printf (".\t-1\t-1\t.\t-1\t.\t");
for (unsigned int i = 6; i < this->bedType; ++i) {
printf(".\t");
}
}
}
else if (this->bedType == 9) {
printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t");
}
}
/*
reportNullBedTab
*/
void BedFile::reportNullBedNewLine() {
if (isGff == false) {
if (this->bedType == 3) {
printf (".\t-1\t-1\n");
}
else if (this->bedType == 4) {
printf (".\t-1\t-1\t.\n");
}
else if (this->bedType == 5) {
printf (".\t-1\t-1\t.\t-1\n");
}
else if (this->bedType == 6) {
printf (".\t-1\t-1\t.\t-1\t.\n");
}
else if (this->bedType > 6) {
printf (".\t-1\t-1\t.\t-1\t.\n");
for (unsigned int i = 6; i < this->bedType; ++i) {
printf(".\t");
}
printf("\n");
}
}
else if (this->bedType == 9) {
printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n");
}
}
......@@ -176,6 +176,8 @@ public:
void reportBedNewLine(const BED &);
void reportBedRangeTab(const BED &bed, int start, int end);
void reportBedRangeNewLine(const BED &bed, int start, int end);
void reportNullBedTab(void);
void reportNullBedNewLine(void);
// parse an input line and determine how it should be handled
bool parseLine (BED &bed, const vector<string> &lineVector, int &lineNum);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment