From 7527f02d666d88701a7b6e0057a2a8ab4915a637 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Thu, 15 Apr 2010 09:45:16 -0400 Subject: [PATCH] Updated all of the tools to use the Open/Get/Close idiom. - also standardized the naming convention for private variables. - loadIntoMap methods use Open/Get/Close idiom. --- src/annotateBed/annotateBed.cpp | 136 ++++++++++++++++++++++++++++ src/annotateBed/annotateBed.h | 53 +++++++++++ src/annotateBed/annotateMain.cpp | 124 +++++++++++++++++++++++++ src/closestBed/closestMain.cpp | 1 + src/subtractBed/subtractBed.cpp | 83 +++++++---------- src/subtractBed/subtractBed.h | 29 +++--- src/subtractBed/subtractMain.cpp | 2 +- src/utils/bedFile/bedFile.cpp | 150 ++++++++----------------------- src/utils/bedFile/bedFile.h | 19 ++-- src/utils/gzstream/gzstream.h | 121 +++++++++++++++++++++++++ src/windowBed/windowBed.cpp | 103 ++++++++------------- src/windowBed/windowBed.h | 34 ++++--- src/windowBed/windowMain.cpp | 2 +- 13 files changed, 582 insertions(+), 275 deletions(-) create mode 100755 src/annotateBed/annotateBed.cpp create mode 100755 src/annotateBed/annotateBed.h create mode 100755 src/annotateBed/annotateMain.cpp create mode 100644 src/utils/gzstream/gzstream.h diff --git a/src/annotateBed/annotateBed.cpp b/src/annotateBed/annotateBed.cpp new file mode 100755 index 00000000..b270c8e3 --- /dev/null +++ b/src/annotateBed/annotateBed.cpp @@ -0,0 +1,136 @@ +/***************************************************************************** + annotateBed.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "annotateBed.h" + + +BedAnnotate::BedAnnotate (const string &bedFile, const vector<string> &annotationFiles, bool &forceStrand) { + + this->bedFile = bedFile; + this->annotationFiles = annotationFiles; + + this->bed = new BedFile(bedFile); + this->forceStrand = forceStrand; +} + + + +BedAnnotate::~BedAnnotate (void) { +} + + +void BedAnnotate::ProcessAnnotations (istream &bedInput) { + + // loop through each of the annotation files and compute the coverage + // of each with respect to the input BED file. + vector<string>::const_iterator annotItr = this->annotationFiles.begin(); + vector<string>::const_iterator annotEnd = this->annotationFiles.end(); + for (; annotItr != annotEnd; ++annotItr) { + // create a BED file of the current annotation file. + BedFile annotation = new BedFile(*annotItr); + // compute the coverage of the annotation file with respect to the input file. + GetCoverage(annotation, bedInput); + } +} + + +void BedAnnotate::GetCoverage (const BedFile &annotation, istream &bedInput) { + + // load the annotation bed file into a map so + // that we can easily compare "A" to it for overlaps + annotation->loadBedFileIntoMap(); + + string bedLine; + int lineNum = 0; // current input line number + vector<string> bedFields; // vector for a BED entry + bedFields.reserve(12); + + // process each entry in A + while (getline(bedInput, bedLine)) { + + lineNum++; + Tokenize(bedLine,bedFields); + BED a; + + if (bedA->parseLine(a, bedFields, lineNum)) { + // count a as a hit with all the relevant features in B + bedB->countHits(a, this->forceStrand); + } + // reset for the next input line + bedFields.clear(); + } + + // now, report the count of hits for each feature in B. + masterBedMap::const_iterator chromItr = bedB->bedMap.begin(); + masterBedMap::const_iterator chromEnd = bedB->bedMap.end(); + for (; chromItr != chromEnd; ++chromItr) { + + binsToBeds::const_iterator binItr = chromItr->second.begin(); + binsToBeds::const_iterator binEnd = chromItr->second.end(); + for (; binItr != binEnd; ++binItr) { + + vector<BED>::const_iterator bedItr = binItr->second.begin(); + vector<BED>::const_iterator bedEnd = binItr->second.end(); + for (; bedItr != bedEnd; ++bedItr) { + + int zeroDepthCount = 0; + int depth = 0; + int start = min(bedItr->minOverlapStart, bedItr->start); + + for (int pos = start+1; pos <= bedItr->end; pos++) { + + if (bedItr->depthMap.find(pos) != bedItr->depthMap.end()) { + + map<unsigned int, DEPTH> dMap = bedItr->depthMap; + depth += dMap[pos].starts; + + if ((depth == 0) && (pos > bedItr->start) && (pos <= bedItr->end)) { + zeroDepthCount++; + } + + depth = depth - dMap[pos].ends; + } + else { + if ((depth == 0) && (pos > bedItr->start) && (pos <= bedItr->end)) { + zeroDepthCount++; + } + } + } + + // Report the coverage for the current interval. + int length = bedItr->end - bedItr->start; + int nonZeroBases = (length-zeroDepthCount); + float fractCovered = (float) nonZeroBases /length; + + bedB->reportBedTab(*bedItr); + printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, nonZeroBases, length, fractCovered); + } + } + } +} + + +void BedAnnotate::DetermineBedInput () { + if (bedA->bedFile != "stdin") { // process a file + ifstream beds(bedA->bedFile.c_str(), ios::in); + if ( !beds ) { + cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + GetCoverage(beds); + } + else { // process stdin + GetCoverage(cin); + } +} + + diff --git a/src/annotateBed/annotateBed.h b/src/annotateBed/annotateBed.h new file mode 100755 index 00000000..25d340a0 --- /dev/null +++ b/src/annotateBed/annotateBed.h @@ -0,0 +1,53 @@ +/***************************************************************************** + coverageBed.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ +#ifndef COVERAGEBED_H +#define COVERAGEBED_H + +#include "bedFile.h" +#include <vector> +#include <algorithm> +#include <iostream> +#include <iomanip> +#include <fstream> +#include <stdlib.h> + +using namespace std; + +//************************************************ +// Class methods and elements +//************************************************ +class BedCoverage { + +public: + + // constructor + BedCoverage(string &, string &, bool &); + + // destructor + ~BedCoverage(void); + + void GetCoverage(istream &bedInput); + + void DetermineBedInput(); + +private: + + string bedAFile; + string bedBFile; + + // instance of a bed file class. + BedFile *bedA, *bedB; + + bool forceStrand; + +}; +#endif /* COVERAGEBED_H */ diff --git a/src/annotateBed/annotateMain.cpp b/src/annotateBed/annotateMain.cpp new file mode 100755 index 00000000..db41fc28 --- /dev/null +++ b/src/annotateBed/annotateMain.cpp @@ -0,0 +1,124 @@ +/***************************************************************************** + coverageMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ +#include "coverageBed.h" +#include "version.h" + +using namespace std; + +// define the version +#define PROGRAM_NAME "coverageBed" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void ShowHelp(void); + +int main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + bool forceStrand = false; + + bool haveBedA = false; + bool haveBedB = false; + + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + + BedCoverage *bg = new BedCoverage(bedAFile, bedBFile, forceStrand); + bg->DetermineBedInput(); + return 0; + } + else { + ShowHelp(); + } +} + +void ShowHelp(void) { + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Returns the depth and breadth of coverage of features from A" << endl; + cerr << "\t on the intervals in B." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-s\t" << "Force strandedness. That is, only include hits in A that" << endl; + cerr << "\t\toverlap B on the same strand." << endl; + cerr << "\t\t- By default, hits are included without respect to strand." << endl << endl; + + cerr << "Output: " << endl; + cerr << "\t" << " After each entry in B, reports: " << endl; + cerr << "\t 1) The number of features in A that overlapped the B interval." << endl; + cerr << "\t 2) The number of bases in B that had non-zero coverage." << endl; + cerr << "\t 3) The length of the entry in B." << endl; + cerr << "\t 4) The fraction of bases in B that had non-zero coverage." << endl; + + exit(1); + +} diff --git a/src/closestBed/closestMain.cpp b/src/closestBed/closestMain.cpp index 6cafec77..c076bd99 100755 --- a/src/closestBed/closestMain.cpp +++ b/src/closestBed/closestMain.cpp @@ -98,6 +98,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { BedClosest *bc = new BedClosest(bedAFile, bedBFile, forceStrand, tieMode); + delete bc; return 0; } else { diff --git a/src/subtractBed/subtractBed.cpp b/src/subtractBed/subtractBed.cpp index c3c82de7..0bc3f7f3 100755 --- a/src/subtractBed/subtractBed.cpp +++ b/src/subtractBed/subtractBed.cpp @@ -18,13 +18,15 @@ */ BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction, bool &forceStrand) { - this->bedAFile = bedAFile; - this->bedBFile = bedBFile; - this->overlapFraction = overlapFraction; - this->forceStrand = forceStrand; + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _overlapFraction = overlapFraction; + _forceStrand = forceStrand; - this->bedA = new BedFile(bedAFile); - this->bedB = new BedFile(bedBFile); + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + SubtractBed(); } @@ -35,10 +37,10 @@ BedSubtract::~BedSubtract(void) { } -void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { +void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { // find all of the overlaps between a and B. - bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, this->forceStrand); + _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); // is A completely spanned by an entry in B? // if so, A should not be reported. @@ -64,7 +66,7 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { numOverlaps++; numConsumedByB++; } - else if ( overlap >= this->overlapFraction ) { + else if ( overlap >= _overlapFraction ) { numOverlaps++; bOverlaps.push_back(*h); } @@ -73,7 +75,7 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { if (numOverlaps == 0) { // no overlap found, so just report A as-is. - bedA->reportBedNewLine(a); + _bedA->reportBedNewLine(a); } else if (numOverlaps == 1) { // one overlap found. only need to look at the single @@ -88,26 +90,26 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { // B ---- // Res. ==== ==== if ( (theHit.start > a.start) && (theHit.end < a.end) ) { - bedA->reportBedRangeNewLine(a,a.start,theHit.start); - bedA->reportBedRangeNewLine(a,theHit.end,a.end); + _bedA->reportBedRangeNewLine(a,a.start,theHit.start); + _bedA->reportBedRangeNewLine(a,theHit.end,a.end); } // A ++++++++++++ // B ---------- // Res. == else if (theHit.start == a.start) { - bedA->reportBedRangeNewLine(a,theHit.end,a.end); + _bedA->reportBedRangeNewLine(a,theHit.end,a.end); } // A ++++++++++++ // B ---------- // Res. ==== else if (theHit.start < a.start) { - bedA->reportBedRangeNewLine(a,theHit.end,a.end); + _bedA->reportBedRangeNewLine(a,theHit.end,a.end); } // A ++++++++++++ // B ---------- // Res. ======= else if (theHit.start > a.start) { - bedA->reportBedRangeNewLine(a,a.start,theHit.start); + _bedA->reportBedRangeNewLine(a,a.start,theHit.start); } } } @@ -137,7 +139,7 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { } int blockEnd = i + a.start; blockEnd = min(a.end, blockEnd); - bedA->reportBedRangeNewLine(a,blockStart,blockEnd); + _bedA->reportBedRangeNewLine(a,blockStart,blockEnd); } } } @@ -146,52 +148,27 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { -void BedSubtract::SubtractBed(istream &bedInput) { +void BedSubtract::SubtractBed() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps - bedB->loadBedFileIntoMap(); + _bedB->loadBedFileIntoMap(); - string bedLine; + BED a; int lineNum = 0; // current input line number - vector<BED> hits; // vector of potential hits - vector<string> bedFields; // vector for a BED entry - + vector<BED> hits; // vector of potential hits // reserve some space hits.reserve(100); - bedFields.reserve(12); - - // process each entry in A - while (getline(bedInput, bedLine)) { - - lineNum++; - Tokenize(bedLine,bedFields); - BED a; - - // find the overlaps with B if it's a valid BED entry. - if (bedA->parseLine(a, bedFields, lineNum)) { - FindOverlaps(a, hits); - hits.clear(); - } - - // reset for the next input line - bedFields.clear(); + + // + _bedA->Open(); + while (_bedA->GetNextBed(a, lineNum)) { + FindAndSubtractOverlaps(a, hits); + hits.clear(); } + _bedA->Close(); + } // END Intersect -void BedSubtract::DetermineBedInput() { - if (bedA->bedFile != "stdin") { // process a file - ifstream beds(bedA->bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - SubtractBed(beds); - } - else { // process stdin - SubtractBed(cin); - } -} - diff --git a/src/subtractBed/subtractBed.h b/src/subtractBed/subtractBed.h index b11d8619..d60374a7 100755 --- a/src/subtractBed/subtractBed.h +++ b/src/subtractBed/subtractBed.h @@ -27,29 +27,26 @@ class BedSubtract { public: // constructor - BedSubtract(string &, string &, float &, bool &); + BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction, bool &forceStrand); // destructor ~BedSubtract(void); - - void reportARemainder(BED &, int &, int &); - void reportA(BED &); - void FindOverlaps(BED &, vector<BED> &); - void SubtractBed(istream &bedInput); - void DetermineBedInput(); - private: - string bedAFile; - string bedBFile; - float overlapFraction; - bool noHit; - bool forceStrand; + // processing variables + string _bedAFile; + string _bedBFile; + float _overlapFraction; + bool _noHit; + bool _forceStrand; - // instance of a bed file class. - BedFile *bedA, *bedB; - + // instances of bed file class. + BedFile *_bedA, *_bedB; + + // methods + void FindAndSubtractOverlaps(BED &a, vector<BED> &hits); + void SubtractBed(); }; #endif /* SUBTRACTBED_H */ diff --git a/src/subtractBed/subtractMain.cpp b/src/subtractBed/subtractMain.cpp index 3efccc19..8f635115 100755 --- a/src/subtractBed/subtractMain.cpp +++ b/src/subtractBed/subtractMain.cpp @@ -99,7 +99,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { BedSubtract *bs = new BedSubtract(bedAFile, bedBFile, overlapFraction, forceStrand); - bs->DetermineBedInput(); + delete bs; return 0; } else { diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index fc5ec7d7..d0457bf3 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -112,7 +112,7 @@ BedFile::~BedFile(void) { void BedFile::Open(void) { if (bedFile == "stdin") { - bedStream = &cin; + _bedStream = &cin; } else { size_t foundPos; @@ -130,7 +130,7 @@ void BedFile::Open(void) { // now set a pointer to the stream so that we // can read the file later on. // Thank God for Josuttis, p. 631! - bedStream = new igzstream(bedFile.c_str(), ios::in); + _bedStream = new igzstream(bedFile.c_str(), ios::in); } } // not GZIPPED. @@ -148,7 +148,7 @@ void BedFile::Open(void) { // now set a pointer to the stream so that we // can read the file later on. // Thank God for Josuttis, p. 631! - bedStream = new ifstream(bedFile.c_str(), ios::in); + _bedStream = new ifstream(bedFile.c_str(), ios::in); } } } @@ -157,7 +157,7 @@ void BedFile::Open(void) { // Close the BED file void BedFile::Close(void) { - delete bedStream; + delete _bedStream; } @@ -165,13 +165,13 @@ bool BedFile::GetNextBed (BED &bed, int &lineNum) { // make sure there are still lines to process. // if so, tokenize, validate and return the BED entry. - if (bedStream->good()) { + if (_bedStream->good()) { string bedLine; vector<string> bedFields; bedFields.reserve(12); // parse the bedStream pointer - getline(*bedStream, bedLine); + getline(*_bedStream, bedLine); lineNum++; // split into a string vector. @@ -356,8 +356,8 @@ void BedFile::countHits(const BED &a, bool forceStrand) { void BedFile::setGff (bool gff) { - if (gff == true) this->isGff = true; - else this->isGff = false; + if (gff == true) this->_isGff = true; + else this->_isGff = false; } @@ -577,7 +577,7 @@ bool BedFile::parseGffLine (BED &bed, const vector<string> &lineVector, int line */ if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { - if (this->bedType == 9 && isGff) { + if (this->bedType == 9 && _isGff) { bed.chrom = lineVector[0]; // substract 1 to force the start to be BED-style bed.start = atoi(lineVector[3].c_str()) - 1; @@ -608,7 +608,7 @@ bool BedFile::parseGffLine (BED &bed, const vector<string> &lineVector, int line else if ((lineNum == 1) && (lineVector.size() == 9)) { this->bedType = lineVector.size(); - if (this->bedType == 9 && isGff) { + if (this->bedType == 9 && _isGff) { bed.chrom = lineVector[0]; // substract 1 to force the start to be BED-style bed.start = atoi(lineVector[3].c_str()) - 1; @@ -624,7 +624,7 @@ bool BedFile::parseGffLine (BED &bed, const vector<string> &lineVector, int line return true; } else { - cout << this->bedType << " " << isGff << endl; + cout << this->bedType << " " << _isGff << endl; cerr << "Error: unexpected number of fields at line: " << lineNum << ". Verify that your files are TAB-delimited and that your GFF file has 9 fields. Exiting..." << endl; exit(1); @@ -658,112 +658,32 @@ bool BedFile::parseGffLine (BED &bed, const vector<string> &lineVector, int line void BedFile::loadBedFileIntoMap() { - string bedLine; + BED bedEntry; int lineNum = 0; - vector<string> bedFields; // vector for a BED entry - bedFields.reserve(12); // reserve the max BED size - // Case 1: Proper BED File. - if ( (this->bedFile != "") && (this->bedFile != "stdin") ) { - - // open the BED file for reading - ifstream bed(bedFile.c_str(), ios::in); - if ( !bed ) { - cerr << "Error: The requested bed file (" <<bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - - while (getline(bed, bedLine)) { - - Tokenize(bedLine,bedFields); // load the fields into the vector - lineNum++; - - BED bedEntry; // new BED struct for the current line - if (parseLine(bedEntry, bedFields, lineNum)) { - int bin = getBin(bedEntry.start, bedEntry.end); - bedEntry.count = 0; - bedEntry.minOverlapStart = INT_MAX; - - //string key = getChromBinKey(bedEntry.chrom, bin); - //this->bedMap[key].push_back(bedEntry); - this->bedMap[bedEntry.chrom][bin].push_back(bedEntry); - } - bedFields.clear(); - } - } - else { - while (getline(cin, bedLine)) { - - Tokenize(bedLine,bedFields); // load the fields into the vector - lineNum++; - - BED bedEntry; // new BED struct for the current line - if (parseLine(bedEntry, bedFields, lineNum)) { - int bin = getBin(bedEntry.start, bedEntry.end); - bedEntry.count = 0; - bedEntry.minOverlapStart = INT_MAX; - this->bedMap[bedEntry.chrom][bin].push_back(bedEntry); - } - bedFields.clear(); - } - } + Open(); + while (this->GetNextBed(bedEntry, lineNum)) { + int bin = getBin(bedEntry.start, bedEntry.end); + bedEntry.count = 0; + bedEntry.minOverlapStart = INT_MAX; + bedMap[bedEntry.chrom][bin].push_back(bedEntry); + } + Close(); } void BedFile::loadBedFileIntoMapNoBin() { - - string bedLine; + + BED bedEntry; int lineNum = 0; - vector<string> bedFields; // vector for a BED entry - bedFields.reserve(12); // reserve the max BED size - - // Case 1: Proper BED File. - if ( (this->bedFile != "") && (this->bedFile != "stdin") ) { - - // open the BED file for reading - ifstream bed(bedFile.c_str(), ios::in); - if ( !bed ) { - cerr << "Error: The requested bed file (" <<bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - - while (getline(bed, bedLine)) { - - Tokenize(bedLine,bedFields); // load the fields into the vector - lineNum++; - - BED bedEntry; // new BED struct for the current line - if (parseLine(bedEntry, bedFields, lineNum)) { - bedEntry.count = 0; - bedEntry.minOverlapStart = INT_MAX; - this->bedMapNoBin[bedEntry.chrom].push_back(bedEntry); - } - bedFields.clear(); - } - } - // Case 2: STDIN. - else { - - while (getline(cin, bedLine)) { - - Tokenize(bedLine,bedFields); // load the fields into the vector - lineNum++; - - BED bedEntry; // new BED struct for the current line - if (parseLine(bedEntry, bedFields, lineNum)) { - bedEntry.count = 0; - bedEntry.minOverlapStart = INT_MAX; - this->bedMapNoBin[bedEntry.chrom].push_back(bedEntry); - } - bedFields.clear(); - } - } - - // sort the BED entries for each chromosome - // in ascending order of start position - for (masterBedMapNoBin::iterator m = this->bedMapNoBin.begin(); m != this->bedMapNoBin.end(); ++m) { - sort(m->second.begin(), m->second.end(), sortByStart); + + Open(); + while (this->GetNextBed(bedEntry, lineNum)) { + bedEntry.count = 0; + bedEntry.minOverlapStart = INT_MAX; + bedMapNoBin[bedEntry.chrom].push_back(bedEntry); } + Close(); } @@ -776,7 +696,7 @@ void BedFile::loadBedFileIntoMapNoBin() { */ void BedFile::reportBedTab(const BED &bed) { - if (isGff == false) { + if (_isGff == false) { if (this->bedType == 3) { printf ("%s\t%d\t%d\t", bed.chrom.c_str(), bed.start, bed.end); } @@ -822,7 +742,7 @@ void BedFile::reportBedTab(const BED &bed) { */ void BedFile::reportBedNewLine(const BED &bed) { - if (isGff == false) { + if (_isGff == false) { if (this->bedType == 3) { printf ("%s\t%d\t%d\n", bed.chrom.c_str(), bed.start, bed.end); } @@ -870,7 +790,7 @@ void BedFile::reportBedNewLine(const BED &bed) { */ void BedFile::reportBedRangeTab(const BED &bed, int start, int end) { - if (isGff == false) { + if (_isGff == false) { if (this->bedType == 3) { printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); } @@ -917,7 +837,7 @@ void BedFile::reportBedRangeTab(const BED &bed, int start, int end) { */ void BedFile::reportBedRangeNewLine(const BED &bed, int start, int end) { - if (isGff == false) { + if (_isGff == false) { if (this->bedType == 3) { printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); } @@ -959,7 +879,7 @@ void BedFile::reportBedRangeNewLine(const BED &bed, int start, int end) { */ void BedFile::reportNullBedTab() { - if (isGff == false) { + if (_isGff == false) { if (this->bedType == 3) { printf (".\t-1\t-1\t"); } @@ -990,7 +910,7 @@ void BedFile::reportNullBedTab() { */ void BedFile::reportNullBedNewLine() { - if (isGff == false) { + if (_isGff == false) { if (this->bedType == 3) { printf (".\t-1\t-1\n"); } diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index e0cc18d8..4873fb69 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -146,10 +146,13 @@ public: // Destructor ~BedFile(void); + // Open a BED file for reading (creates an istream pointer) void Open(void); + // Close an opened BED file. void Close(void); + // Get the next BED entry in an opened BED file. bool GetNextBed (BED &bed, int &lineNum); // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs @@ -194,22 +197,24 @@ public: string bedFile; unsigned int bedType; // 3-6, 12 for BED // 9 for GFF - - bool isGff; masterBedMap bedMap; masterBedMapNoBin bedMapNoBin; - + private: - // process as line from a BED file - bool parseBedLine (BED &bed, const vector<string> &lineVector, int lineNum); + // data + bool _isGff; + istream *_bedStream; + + // methods + // process as line from a BED file + bool parseBedLine (BED &bed, const vector<string> &lineVector, int lineNum); // process as line from a GFF file. convert to a BED feature. bool parseGffLine (BED &bed, const vector<string> &lineVector, int lineNum); - void setGff (bool isGff); - istream *bedStream; + }; #endif /* BEDFILE_H */ diff --git a/src/utils/gzstream/gzstream.h b/src/utils/gzstream/gzstream.h new file mode 100644 index 00000000..861653f4 --- /dev/null +++ b/src/utils/gzstream/gzstream.h @@ -0,0 +1,121 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.5 $ +// Revision_date : $Date: 2002/04/26 23:30:15 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include <iostream> +#include <fstream> +#include <zlib.h> + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+256; // size of data buff + // totals 512 bytes under g++ for igzstream at the end. + + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); +public: + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. +// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff --git a/src/windowBed/windowBed.cpp b/src/windowBed/windowBed.cpp index d6d48b55..99328554 100755 --- a/src/windowBed/windowBed.cpp +++ b/src/windowBed/windowBed.cpp @@ -19,20 +19,23 @@ BedWindow::BedWindow(string &bedAFile, string &bedBFile, int &leftSlop, int &rightSlop, bool &anyHit, bool &noHit, bool &writeCount, bool &strandWindows, bool &matchOnStrand) { - this->bedAFile = bedAFile; - this->bedBFile = bedBFile; + _bedAFile = bedAFile; + _bedBFile = bedBFile; - this->leftSlop = leftSlop; - this->rightSlop = rightSlop; + _leftSlop = leftSlop; + _rightSlop = rightSlop; - this->anyHit = anyHit; - this->noHit = noHit; - this->writeCount = writeCount; - this->strandWindows = strandWindows; - this->matchOnStrand = matchOnStrand; + _anyHit = anyHit; + _noHit = noHit; + _writeCount = writeCount; + _strandWindows = strandWindows; + _matchOnStrand = matchOnStrand; - this->bedA = new BedFile(bedAFile); - this->bedB = new BedFile(bedBFile); + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + // do the work + WindowIntersectBed(); } @@ -60,23 +63,23 @@ void BedWindow::FindWindowOverlaps(BED &a, vector<BED> &hits) { // If so, // if "+", then left is left and right is right // if "-", the left is right and right is left. - if (this->strandWindows) { + if (_strandWindows) { if (a.strand == "+") { - if ((a.start - this->leftSlop) > 0) aFudgeStart = a.start - this->leftSlop; + if ((a.start - _leftSlop) > 0) aFudgeStart = a.start - _leftSlop; else aFudgeStart = 0; - aFudgeEnd = a.end + this->rightSlop; + aFudgeEnd = a.end + _rightSlop; } else { - if ((a.start - this->rightSlop) > 0) aFudgeStart = a.start - this->rightSlop; + if ((a.start - _rightSlop) > 0) aFudgeStart = a.start - _rightSlop; else aFudgeStart = 0; - aFudgeEnd = a.end + this->leftSlop; + aFudgeEnd = a.end + _leftSlop; } } // If not, add the windows irrespective of strand else { - if ((a.start - this->leftSlop) > 0) aFudgeStart = a.start - this->leftSlop; + if ((a.start - _leftSlop) > 0) aFudgeStart = a.start - _leftSlop; else aFudgeStart = 0; - aFudgeEnd = a.end + this->rightSlop; + aFudgeEnd = a.end + _rightSlop; } @@ -84,7 +87,7 @@ void BedWindow::FindWindowOverlaps(BED &a, vector<BED> &hits) { Now report the hits (if any) based on the window around a. */ // get the hits in B for the A feature - bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, this->matchOnStrand); + _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _matchOnStrand); int numOverlaps = 0; @@ -102,70 +105,42 @@ void BedWindow::FindWindowOverlaps(BED &a, vector<BED> &hits) { // is there enough overlap (default ~ 1bp) if ( ((float) overlapBases / (float) aLength) > 0 ) { numOverlaps++; - if (!anyHit && !noHit && !writeCount) { - bedA->reportBedTab(a); - bedB->reportBedNewLine(*h); + if (_anyHit == false && _noHit == false && _writeCount == false) { + _bedA->reportBedTab(a); + _bedB->reportBedNewLine(*h); } } } } - if (anyHit && (numOverlaps >= 1)) { - bedA->reportBedNewLine(a); } - else if (writeCount) { - bedA->reportBedTab(a); printf("\t%d\n", numOverlaps); + if (_anyHit == true && (numOverlaps >= 1)) { + _bedA->reportBedNewLine(a); } + else if (_writeCount == true) { + _bedA->reportBedTab(a); printf("\t%d\n", numOverlaps); } - else if (noHit && (numOverlaps == 0)) { - bedA->reportBedNewLine(a); + else if (_noHit == true && (numOverlaps == 0)) { + _bedA->reportBedNewLine(a); } } -void BedWindow::WindowIntersectBed(istream &bedInput) { +void BedWindow::WindowIntersectBed() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps - bedB->loadBedFileIntoMap(); + _bedB->loadBedFileIntoMap(); - string bedLine; + BED a; int lineNum = 0; // current input line number vector<BED> hits; // vector of potential hits - vector<string> bedFields; // vector for a BED entry - - // reserve some space hits.reserve(100); - bedFields.reserve(12); // process each entry in A - while (getline(bedInput, bedLine)) { - - lineNum++; - Tokenize(bedLine,bedFields); - BED a; - - // find the overlaps between "a" and "B" if "a" is a valid BED entry. - if (bedA->parseLine(a, bedFields, lineNum)) { - FindWindowOverlaps(a, hits); - hits.clear(); - } - - // reset for the next input line - bedFields.clear(); + _bedA->Open(); + while (_bedA->GetNextBed(a, lineNum)) { + FindWindowOverlaps(a, hits); + hits.clear(); } + _bedA->Close(); } -// END WindowIntersectBed -void BedWindow::DetermineBedInput() { - if (bedA->bedFile != "stdin") { // process a file - ifstream beds(bedA->bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - WindowIntersectBed(beds); - } - else { // process stdin - WindowIntersectBed(cin); - } -} - diff --git a/src/windowBed/windowBed.h b/src/windowBed/windowBed.h index ec3700dc..9aa28a84 100755 --- a/src/windowBed/windowBed.h +++ b/src/windowBed/windowBed.h @@ -27,32 +27,30 @@ class BedWindow { public: // constructor - BedWindow(string &, string &, int &, int &, bool &, bool &, bool &, bool &, bool &); + BedWindow(string &bedAFile, string &bedBFile, int &leftSlop, int &rightSlop, bool &anyHit, bool &noHit, + bool &writeCount, bool &strandWindows, bool &matchOnStrand); // destructor ~BedWindow(void); - void reportA(const BED &); - void reportB(const BED &); - - void WindowIntersectBed(istream &bedInput); - void FindWindowOverlaps(BED &, vector<BED> &); - void DetermineBedInput(); - private: - string bedAFile; - string bedBFile; - bool anyHit; - bool writeCount; - int leftSlop; - int rightSlop; - bool noHit; - bool strandWindows; - bool matchOnStrand; + string _bedAFile; + string _bedBFile; + bool _anyHit; + bool _writeCount; + int _leftSlop; + int _rightSlop; + bool _noHit; + bool _strandWindows; + bool _matchOnStrand; // instance of a bed file class. - BedFile *bedA, *bedB; + BedFile *_bedA, *_bedB; + + // methods + void WindowIntersectBed(); + void FindWindowOverlaps(BED &a, vector<BED> &hits); }; #endif /* WINDOWBED_H */ diff --git a/src/windowBed/windowMain.cpp b/src/windowBed/windowMain.cpp index bc8f1e12..2fdc2f70 100755 --- a/src/windowBed/windowMain.cpp +++ b/src/windowBed/windowMain.cpp @@ -162,7 +162,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { BedWindow *bi = new BedWindow(bedAFile, bedBFile, leftSlop, rightSlop, anyHit, noHit, writeCount, strandWindows, matchOnStrand); - bi->DetermineBedInput(); + delete bi; return 0; } else { -- GitLab