From 0da6473fee87a82fc10170b5f942bf37ac574c25 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Mon, 22 Mar 2010 21:25:16 -0400 Subject: [PATCH] Added native BAM support to coverageBed. --- src/coverageBed/coverageBed.cpp | 83 ++++++++++++++++++++++++-------- src/coverageBed/coverageBed.h | 19 +++++++- src/coverageBed/coverageMain.cpp | 39 ++++++++++----- 3 files changed, 107 insertions(+), 34 deletions(-) diff --git a/src/coverageBed/coverageBed.cpp b/src/coverageBed/coverageBed.cpp index 7d93de37..d07e7261 100755 --- a/src/coverageBed/coverageBed.cpp +++ b/src/coverageBed/coverageBed.cpp @@ -12,8 +12,8 @@ #include "lineFileUtilities.h" #include "coverageBed.h" - -BedCoverage::BedCoverage(string &bedAFile, string &bedBFile, bool &forceStrand, bool &writeHistogram) { +// build +BedCoverage::BedCoverage(string &bedAFile, string &bedBFile, bool &forceStrand, bool &writeHistogram, bool &bamInput) { this->bedAFile = bedAFile; this->bedBFile = bedBFile; @@ -23,16 +23,17 @@ BedCoverage::BedCoverage(string &bedAFile, string &bedBFile, bool &forceStrand, this->forceStrand = forceStrand; this->writeHistogram = writeHistogram; + this->bamInput = bamInput; } - - +// destroy BedCoverage::~BedCoverage(void) { + delete this->bedA; + delete this->bedB; } - -void BedCoverage::CollectCoverage(istream &bedInput) { +void BedCoverage::CollectCoverageBed(istream &bedInput) { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps @@ -63,6 +64,44 @@ void BedCoverage::CollectCoverage(istream &bedInput) { } +void BedCoverage::CollectCoverageBam(string bamFile) { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bedB->loadBedFileIntoMap(); + + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // convert each aligned BAM entry to BED + // and compute coverage on B + BamAlignment bam; + while (reader.GetNextAlignment(bam)) { + + if (bam.IsMapped()) { + + // construct a new BED entry from the current BAM alignment. + BED a; + a.chrom = refs.at(bam.RefID).RefName; + a.start = bam.Position; + a.end = bam.Position + bam.AlignedBases.size(); + a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; + + bedB->countHits(a, this->forceStrand); + } + } + // report the coverage (summary or histogram) for BED B. + ReportCoverage(); + // close the BAM file + reader.Close(); +} + + void BedCoverage::ReportCoverage() { map<unsigned int, unsigned int> allDepthHist; @@ -81,8 +120,8 @@ void BedCoverage::ReportCoverage() { for (; bedItr != bedEnd; ++bedItr) { int zeroDepthCount = 0; - int depth = 0; - int start = min(bedItr->minOverlapStart, bedItr->start); + int depth = 0; + int start = min(bedItr->minOverlapStart, bedItr->start); // track the numnber of bases in the feature covered by // 0, 1, 2, ... n features in A @@ -112,11 +151,11 @@ void BedCoverage::ReportCoverage() { } // Report the coverage for the current interval. - int length = bedItr->end - bedItr->start; + int length = bedItr->end - bedItr->start; totalLength += length; - int nonZeroBases = (length-zeroDepthCount); - float fractCovered = (float) nonZeroBases /length; + int nonZeroBases = (length - zeroDepthCount); + float fractCovered = (float) nonZeroBases / length; if (this->writeHistogram == false) { bedB->reportBedTab(*bedItr); @@ -147,16 +186,22 @@ void BedCoverage::ReportCoverage() { void BedCoverage::DetermineBedInput() { if (bedA->bedFile != "stdin") { // process a file - ifstream beds(bedA->bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); + if (this->bamInput == false) { //bed/gff + ifstream beds(bedA->bedFile.c_str(), ios::in); + if ( !beds ) { + cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + CollectCoverageBed(beds); } - CollectCoverage(beds); + else + CollectCoverageBam(bedA->bedFile); } - else { // process stdin - CollectCoverage(cin); + else { // process stdin + if (this->bamInput == false) + CollectCoverageBed(cin); + else + CollectCoverageBam("stdin"); } } - diff --git a/src/coverageBed/coverageBed.h b/src/coverageBed/coverageBed.h index 3a3cd37b..a4aa2fc6 100755 --- a/src/coverageBed/coverageBed.h +++ b/src/coverageBed/coverageBed.h @@ -13,6 +13,11 @@ #define COVERAGEBED_H #include "bedFile.h" + +#include "BamReader.h" +#include "BamAux.h" +using namespace BamTools; + #include <vector> #include <algorithm> #include <iostream> @@ -30,26 +35,36 @@ class BedCoverage { public: // constructor - BedCoverage(string &bedAFile, string &bedBFile, bool &forceStrand, bool &writeHistogram); + BedCoverage(string &bedAFile, string &bedBFile, bool &forceStrand, bool &writeHistogram, bool &bamInput); // destructor ~BedCoverage(void); - void CollectCoverage(istream &bedInput); + void CollectCoverageBed(istream &bedInput); + + void CollectCoverageBam(string bamFile); void DetermineBedInput(); private: + // input files. string bedAFile; string bedBFile; // instance of a bed file class. BedFile *bedA, *bedB; + // do we care about strandedness when counting coverage? bool forceStrand; + + // should we write a histogram for each feature in B? bool writeHistogram; + // are we dealing with BAM input for "A"? + bool bamInput; + + // private function for reporting coverage information void ReportCoverage(); }; #endif /* COVERAGEBED_H */ diff --git a/src/coverageBed/coverageMain.cpp b/src/coverageBed/coverageMain.cpp index dad0232b..d05cc0c4 100755 --- a/src/coverageBed/coverageMain.cpp +++ b/src/coverageBed/coverageMain.cpp @@ -31,12 +31,13 @@ int main(int argc, char* argv[]) { // input files string bedAFile; string bedBFile; - bool forceStrand = false; - bool writeHistogram = false; - bool haveBedA = false; - bool haveBedB = false; - + // parm flags + bool forceStrand = false; + bool writeHistogram = false; + bool bamInput = false; + bool haveBedA = false; + bool haveBedB = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -64,6 +65,14 @@ int main(int argc, char* argv[]) { i++; } } + if(PARAMETER_CHECK("-abam", 5, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bamInput = true; + bedAFile = argv[i + 1]; + i++; + } + } else if(PARAMETER_CHECK("-b", 2, parameterLength)) { if ((i+1) < argc) { haveBedB = true; @@ -91,7 +100,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { - BedCoverage *bg = new BedCoverage(bedAFile, bedBFile, forceStrand, writeHistogram); + BedCoverage *bg = new BedCoverage(bedAFile, bedBFile, forceStrand, writeHistogram, bamInput); bg->DetermineBedInput(); return 0; } @@ -112,21 +121,25 @@ void ShowHelp(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; cerr << "Options: " << endl; + + cerr << "\t-abam\t" << "The A input file is in BAM format." << endl << endl; + cerr << "\t-s\t" << "Force strandedness. That is, only include hits in A that" << endl; cerr << "\t\toverlap B on the same strand." << endl; cerr << "\t\t- By default, hits are included without respect to strand." << endl << endl; - cerr << "\t-hist\t" << "Report a histogram of coverage for each feature in B" << endl << endl; - cerr << "\t\tOutput (tab delimited):" << endl; + cerr << "\t-hist\t" << "Report a histogram of coverage for each feature in B" << endl; + cerr << "\t\tas well as a summary histogram for _all_ features in B." << endl << endl; + cerr << "\t\tOutput (tab delimited) after each feature in B:" << endl; + cerr << "\t\t 1) depth\n\t\t 2) # bases at depth\n\t\t 3) size of B\n\t\t 4) % of B at depth" << endl << endl; cerr << "Default Output: " << endl; cerr << "\t" << " After each entry in B, reports: " << endl; - cerr << "\t 1) The number of features in A that overlapped the B interval." << endl; - cerr << "\t 2) The number of bases in B that had non-zero coverage." << endl; - cerr << "\t 3) The length of the entry in B." << endl; - cerr << "\t 4) The fraction of bases in B that had non-zero coverage." << endl; + cerr << "\t 1) The number of features in A that overlapped the B interval." << endl; + cerr << "\t 2) The number of bases in B that had non-zero coverage." << endl; + cerr << "\t 3) The length of the entry in B." << endl; + cerr << "\t 4) The fraction of bases in B that had non-zero coverage." << endl << endl; exit(1); - } -- GitLab