From 2fea1f04605a8dd6a4c3e07c8eb9d98c9d10123a Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Mon, 22 Mar 2010 21:56:05 -0400
Subject: [PATCH] Added native BAM support to genomeCoverageBed.

---
 src/genomeCoverageBed/genomeCoverageBed.cpp  | 202 ++++++++++++++-----
 src/genomeCoverageBed/genomeCoverageBed.h    |  24 ++-
 src/genomeCoverageBed/genomeCoverageMain.cpp |  21 +-
 3 files changed, 190 insertions(+), 57 deletions(-)

diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp
index 80191b1b..7098c376 100755
--- a/src/genomeCoverageBed/genomeCoverageBed.cpp
+++ b/src/genomeCoverageBed/genomeCoverageBed.cpp
@@ -12,10 +12,9 @@
 #include "lineFileUtilities.h"
 #include "genomeCoverageBed.h"
 
-/*
-	Constructor
-*/
-BedCoverage::BedCoverage(string &bedFile, string &genomeFile, bool &eachBase, bool &startSites, bool &bedGraph, int &max) {
+
+BedGenomeCoverage::BedGenomeCoverage(string &bedFile, string &genomeFile, bool &eachBase, 
+	                                 bool &startSites, bool &bedGraph, int &max, bool &bamInput) {
 
 	this->bedFile    = bedFile;
 	this->genomeFile = genomeFile;
@@ -23,34 +22,18 @@ BedCoverage::BedCoverage(string &bedFile, string &genomeFile, bool &eachBase, bo
 	this->startSites = startSites;
 	this->bedGraph   = bedGraph;
 	this->max        = max;
-
+	this->bamInput   = bamInput;
+	
 	this->bed        = new BedFile(bedFile);
 }
 
 
-
-
-/*
-	Destructor
-*/
-BedCoverage::~BedCoverage(void) {
-
+BedGenomeCoverage::~BedGenomeCoverage(void) {
+	delete this->bed;
 }
 
 
-
-
-/*
-	_Method: CoverageBeds
-	_Purpose:
-	_Example:
-		           Chrom: ======================================
-		 		   Reads:    -----	 -----
-									-----		----
-				   Depth: 00011112111111111001111000000000000000
-	_Gotchas:
-*/
-void BedCoverage::CoverageBeds(istream &bedInput) {
+void BedGenomeCoverage::CoverageBed(istream &bedInput) {
 
 	// Variables
 	map<string, int, less<string> > chromSizes;
@@ -66,10 +49,21 @@ void BedCoverage::CoverageBeds(istream &bedInput) {
 		exit (1);
 	}
 	else {
-		string chrom;
-		unsigned int size;
-		while (genome >> chrom >> size) {
-			chromSizes[chrom] = size;
+		//string chrom;
+		//unsigned int size;
+		//while (genome >> chrom >> size) {
+		//	chromSizes[chrom] = size;
+		//}
+		// Tweak from Gordon Assaf to properly ignore invalid lines.
+		string genomeLine;
+		while (getline(genome, genomeLine)) {
+			string chrom;
+			unsigned int size;
+			istringstream iss(genomeLine);
+			iss >> chrom >> size;
+			if (iss) {
+				chromSizes[chrom] = size;
+			}
 		}
 	}
 
@@ -92,8 +86,8 @@ void BedCoverage::CoverageBeds(istream &bedInput) {
 		if (bed->parseLine(bedEntry, bedFields, lineNum)) {
 						
 			currChrom = bedEntry.chrom;
-			start = bedEntry.start;
-			end = bedEntry.end -1;
+			start     = bedEntry.start;
+			end       = bedEntry.end - 1;
 			
 			if (currChrom != prevChrom)  {
 				// If we've moved beyond the first encountered chromosomes,
@@ -104,7 +98,7 @@ void BedCoverage::CoverageBeds(istream &bedInput) {
 				
 				// empty the previous chromosome and reserve new
 				// space for the current one
-				std::vector< DEPTH >().swap(chromCov);
+				std::vector<DEPTH>().swap(chromCov);
 				chromSize = chromSizes[currChrom];
 				chromCov.resize(chromSize);
 
@@ -146,8 +140,117 @@ void BedCoverage::CoverageBeds(istream &bedInput) {
 }
 
 
+void BedGenomeCoverage::CoverageBam(string bamFile) {
 
-void BedCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, int &chromSize, string &chrom, chromHistMap &chromDepthHist) {
+	// Variables
+	map<string, int, less<string> > chromSizes;
+	chromHistMap chromDepthHist;
+
+	// open the GENOME file for reading.
+	// if successful, load each chrom and it's size into
+	// the "chromSize" map.  also compute the total size of the genom
+	// and store in "genomeSize".
+	ifstream genome(this->genomeFile.c_str(), ios::in);
+	if ( !genome ) {
+		cerr << "Error: The requested genome file (" <<this->genomeFile << ") could not be opened. Exiting!" << endl;
+		exit (1);
+	}
+	else {
+		//string chrom;
+		//unsigned int size;
+		//while (genome >> chrom >> size) {
+		//	chromSizes[chrom] = size;
+		//}
+		// Tweak from Gordon Assaf to properly ignore invalid lines.
+		string genomeLine;
+		while (getline(genome,genomeLine)) {
+			string chrom;
+			unsigned int size;
+			istringstream iss(genomeLine);
+			iss >> chrom >> size;
+			if (iss) {
+				chromSizes[chrom] = size;
+			}
+		}
+	}
+	
+	string prevChrom, currChrom;
+	vector<DEPTH> chromCov;
+	int chromSize = 0;
+	int start, end;
+	
+	// open the BAM file
+	BamReader reader;
+	reader.Open(bamFile);
+
+	// get header & reference information
+	string header = reader.GetHeaderText();
+	RefVector refs = reader.GetReferenceData();
+
+	// convert each aligned BAM entry to BED 
+	// and compute coverage on B
+	BamAlignment bam;	
+	while (reader.GetNextAlignment(bam)) {
+		
+		if (bam.IsMapped()) {
+			
+			currChrom  = refs.at(bam.RefID).RefName;
+			start      = bam.Position;
+			end        = bam.Position + bam.AlignedBases.size() - 1;
+			
+			if (currChrom != prevChrom)  {
+				// If we've moved beyond the first encountered chromosomes,
+				// process the results of the previous chromosome.
+				if (prevChrom.length() > 0) {
+					ReportChromCoverage(chromCov, chromSizes[prevChrom], prevChrom, chromDepthHist);
+				}
+				
+				// empty the previous chromosome and reserve new
+				// space for the current one
+				std::vector<DEPTH>().swap(chromCov);
+				chromSize = chromSizes[currChrom];
+				chromCov.resize(chromSize);
+
+				// process the first line for this chromosome.
+				// make sure the coordinates fit within the chrom
+				if (start < chromSize) {
+					chromCov[start].starts++;
+				}
+				if (end < chromSize) {
+					chromCov[end].ends++;
+				}
+				else {
+					chromCov[chromSize-1].ends++;
+				}
+			}
+			else {
+				// process the other lines for this chromosome.
+				// make sure the coordinates fit within the chrom
+				if (start < chromSize) {
+					chromCov[start].starts++;
+				}
+				if (end < chromSize) {
+					chromCov[end].ends++;
+				}
+				else {
+					chromCov[chromSize-1].ends++;
+				}			
+			}
+			prevChrom = currChrom;
+		}
+	}
+	// process the results of the last chromosome.
+	ReportChromCoverage(chromCov, chromSizes[currChrom], currChrom, chromDepthHist);
+	
+	if (this->eachBase == false && this->bedGraph == false) {
+		ReportGenomeCoverage(chromSizes, chromDepthHist);
+	}
+	
+	reader.Close();
+}
+
+
+void BedGenomeCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, int &chromSize, string &chrom, chromHistMap &chromDepthHist) {
 	
 	if (this->eachBase) {
 		int depth = 0;  // initialize the depth
@@ -195,14 +298,13 @@ void BedCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, int &chromS
 
 
 
-void BedCoverage::ReportGenomeCoverage(map<string, int> &chromSizes, chromHistMap &chromDepthHist) {
+void BedGenomeCoverage::ReportGenomeCoverage(map<string, int> &chromSizes, chromHistMap &chromDepthHist) {
 	
 	unsigned int genomeSize = 0;
 	for (map<string, int, less<string> >::iterator m = chromSizes.begin(); m != chromSizes.end(); ++m) {
 		
 		string chrom = m->first;
-		genomeSize += chromSizes[chrom];
-		
+		genomeSize   += chromSizes[chrom];
 		// if there were no reads for a give chromosome, then
 		// add the length of the chrom to the 0 bin.
 		if ( chromDepthHist.find(chrom) == chromDepthHist.end() ) {
@@ -236,7 +338,7 @@ void BedCoverage::ReportGenomeCoverage(map<string, int> &chromSizes, chromHistMa
 }
 
 
-void BedCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, int &chromSize, string &chrom) {
+void BedGenomeCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, int &chromSize, string &chrom) {
 
 	int depth     = 0;     // initialize the depth
 	int lastStart = -1 ;
@@ -268,7 +370,6 @@ void BedCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, int
 		depth = depth - chromCov[pos].ends;
 	}
 	
-	
 	//Print information about the last position
 	if (lastDepth != -1) {
 		cout << chrom << "\t" << lastStart << "\t" << chromSize << "\t" << lastDepth << endl;
@@ -276,16 +377,23 @@ void BedCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, int
 }
 
 
-void BedCoverage::DetermineBedInput() {
+void BedGenomeCoverage::DetermineBedInput() {
 	if (bed->bedFile != "stdin") {   // process a file
-		ifstream beds(bed->bedFile.c_str(), ios::in);
-		if ( !beds ) {
-			cerr << "Error: The requested bed file (" << bed->bedFile << ") could not be opened. Exiting!" << endl;
-			exit (1);
+		if (this->bamInput == false) { //bed/gff
+			ifstream beds(bed->bedFile.c_str(), ios::in);
+			if ( !beds ) {
+				cerr << "Error: The requested bed file (" << bed->bedFile << ") could not be opened. Exiting!" << endl;
+				exit (1);
+			}
+			CoverageBed(beds);
 		}
-		CoverageBeds(beds);
+		else 
+			CoverageBam(bed->bedFile);
 	}
-	else {   						// process stdin
-		CoverageBeds(cin);		
+	else {   // process stdin
+		if (this->bamInput == false) 
+			CoverageBed(cin);
+		else 
+			CoverageBam("stdin");	
 	}
-}
+}
\ No newline at end of file
diff --git a/src/genomeCoverageBed/genomeCoverageBed.h b/src/genomeCoverageBed/genomeCoverageBed.h
index 14fcf868..9a0bccda 100755
--- a/src/genomeCoverageBed/genomeCoverageBed.h
+++ b/src/genomeCoverageBed/genomeCoverageBed.h
@@ -10,36 +10,43 @@ aaronquinlan@gmail.com
 Licenced under the GNU General Public License 2.0+ license.
 ******************************************************************************/
 #include "bedFile.h"
+
+#include "BamReader.h"
+#include "BamAux.h"
+using namespace BamTools;
+
 #include <vector>
 #include <iostream>
 #include <fstream>
-
 using namespace std;
 
 
 //***********************************************
 // Typedefs
 //***********************************************
-typedef map<int, DEPTH, less<int> > depthMap;
+typedef map<int, DEPTH, less<int> >          depthMap;
 typedef map<string, depthMap, less<string> > chromDepthMap;
 
-typedef map<int, unsigned int, less<int> > histMap;
-typedef map<string, histMap, less<string> > chromHistMap;
+typedef map<int, unsigned int, less<int> >   histMap;
+typedef map<string, histMap, less<string> >  chromHistMap;
 
 //************************************************
 // Class methods and elements
 //************************************************
-class BedCoverage {
+class BedGenomeCoverage {
 
 public:
 
 	// constructor 
-	BedCoverage(string &bedFile, string &genomeFile, bool &eachBase, bool &startSites, bool &bedGraph, int &max);
+	BedGenomeCoverage(string &bedFile, string &genomeFile, bool &eachBase, bool &startSites, 
+		bool &bedGraph, int &max, bool &bamInput);
 
 	// destructor
-	~BedCoverage(void);
+	~BedGenomeCoverage(void);
+
+	void CoverageBed(istream &bedInput);
 
-	void CoverageBeds(istream &bedInput);
+	void CoverageBam(string bamFile);
 
 	void ReportChromCoverage(const vector<DEPTH> &, int &chromSize, string &chrom, chromHistMap&);
 
@@ -53,6 +60,7 @@ private:
 
 	string bedFile;
 	string genomeFile;
+	bool bamInput;
 	bool eachBase;
 	bool startSites;
 	bool bedGraph;
diff --git a/src/genomeCoverageBed/genomeCoverageMain.cpp b/src/genomeCoverageBed/genomeCoverageMain.cpp
index 4efb107f..30f4afbf 100755
--- a/src/genomeCoverageBed/genomeCoverageMain.cpp
+++ b/src/genomeCoverageBed/genomeCoverageMain.cpp
@@ -35,6 +35,7 @@ int main(int argc, char* argv[]) {
 	int max = 999999999;
 	
 	bool haveBed    = false;
+	bool bamInput   = false;
 	bool haveGenome = false;
 	bool startSites = false;
 	bool bedGraph   = false;
@@ -66,6 +67,14 @@ int main(int argc, char* argv[]) {
 				i++;
 			}
 		}
+		if(PARAMETER_CHECK("-ibam", 5, parameterLength)) {
+			if ((i+1) < argc) {
+				haveBed  = true;
+				bamInput = true;
+				bedFile  = argv[i + 1];
+				i++;		
+			}
+		}
 		else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
 			if ((i+1) < argc) {
 				haveGenome = true;
@@ -103,7 +112,9 @@ int main(int argc, char* argv[]) {
 	
 	if (!showHelp) {
 		
-		BedCoverage *bc = new BedCoverage(bedFile, genomeFile, eachBase, startSites, bedGraph, max);
+		BedGenomeCoverage *bc = new BedGenomeCoverage(bedFile, genomeFile, eachBase, 
+                                                      startSites, bedGraph, max, bamInput);
+		
 		bc->DetermineBedInput();
 		
 		return 0;
@@ -124,6 +135,10 @@ void ShowHelp(void) {
 	cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -i <bed> -g <genome>" << endl << endl;
 	
 	cerr << "Options: " << endl;
+	
+	cerr << "\t-ibam\t"			<< "The input file is in BAM format." << endl;
+	cerr                        << "\t\tNote: BAM _must_ be sorted by position" << endl << endl;	
+	
 	cerr << "\t-d\t"	     	<< "Report the depth at each genome position." << endl;
 	cerr 						<< "\t\tDefault behavior is to report a histogram." << endl << endl;
 
@@ -144,9 +159,11 @@ void ShowHelp(void) {
 	cerr << "\t..." << endl;
 	cerr << "\tchr18_gl000207_random\t4262" << endl << endl;
 
-	cerr << "\t(2)  NOTE: The input BED file must be grouped by chromosome." << endl;
+	cerr << "\t(2)  The input BED (-i) file must be grouped by chromosome." << endl;
 	cerr << "\t     A simple \"sort -k 1,1 <BED> > <BED>.sorted\" will suffice."<< endl << endl;
 
+	cerr << "\t(3)  The input BAM (-ibam) file must be sorted by position." << endl;
+	cerr << "\t     A \"samtools sort <BAM>\" should suffice."<< endl << endl;
 	
 	cerr << "Tips: " << endl;
 	cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl;
-- 
GitLab