From 2fea1f04605a8dd6a4c3e07c8eb9d98c9d10123a Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Mon, 22 Mar 2010 21:56:05 -0400 Subject: [PATCH] Added native BAM support to genomeCoverageBed. --- src/genomeCoverageBed/genomeCoverageBed.cpp | 202 ++++++++++++++----- src/genomeCoverageBed/genomeCoverageBed.h | 24 ++- src/genomeCoverageBed/genomeCoverageMain.cpp | 21 +- 3 files changed, 190 insertions(+), 57 deletions(-) diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp index 80191b1b..7098c376 100755 --- a/src/genomeCoverageBed/genomeCoverageBed.cpp +++ b/src/genomeCoverageBed/genomeCoverageBed.cpp @@ -12,10 +12,9 @@ #include "lineFileUtilities.h" #include "genomeCoverageBed.h" -/* - Constructor -*/ -BedCoverage::BedCoverage(string &bedFile, string &genomeFile, bool &eachBase, bool &startSites, bool &bedGraph, int &max) { + +BedGenomeCoverage::BedGenomeCoverage(string &bedFile, string &genomeFile, bool &eachBase, + bool &startSites, bool &bedGraph, int &max, bool &bamInput) { this->bedFile = bedFile; this->genomeFile = genomeFile; @@ -23,34 +22,18 @@ BedCoverage::BedCoverage(string &bedFile, string &genomeFile, bool &eachBase, bo this->startSites = startSites; this->bedGraph = bedGraph; this->max = max; - + this->bamInput = bamInput; + this->bed = new BedFile(bedFile); } - - -/* - Destructor -*/ -BedCoverage::~BedCoverage(void) { - +BedGenomeCoverage::~BedGenomeCoverage(void) { + delete this->bed; } - - -/* - _Method: CoverageBeds - _Purpose: - _Example: - Chrom: ====================================== - Reads: ----- ----- - ----- ---- - Depth: 00011112111111111001111000000000000000 - _Gotchas: -*/ -void BedCoverage::CoverageBeds(istream &bedInput) { +void BedGenomeCoverage::CoverageBed(istream &bedInput) { // Variables map<string, int, less<string> > chromSizes; @@ -66,10 +49,21 @@ void BedCoverage::CoverageBeds(istream &bedInput) { exit (1); } else { - string chrom; - unsigned int size; - while (genome >> chrom >> size) { - chromSizes[chrom] = size; + //string chrom; + //unsigned int size; + //while (genome >> chrom >> size) { + // chromSizes[chrom] = size; + //} + // Tweak from Gordon Assaf to properly ignore invalid lines. + string genomeLine; + while (getline(genome, genomeLine)) { + string chrom; + unsigned int size; + istringstream iss(genomeLine); + iss >> chrom >> size; + if (iss) { + chromSizes[chrom] = size; + } } } @@ -92,8 +86,8 @@ void BedCoverage::CoverageBeds(istream &bedInput) { if (bed->parseLine(bedEntry, bedFields, lineNum)) { currChrom = bedEntry.chrom; - start = bedEntry.start; - end = bedEntry.end -1; + start = bedEntry.start; + end = bedEntry.end - 1; if (currChrom != prevChrom) { // If we've moved beyond the first encountered chromosomes, @@ -104,7 +98,7 @@ void BedCoverage::CoverageBeds(istream &bedInput) { // empty the previous chromosome and reserve new // space for the current one - std::vector< DEPTH >().swap(chromCov); + std::vector<DEPTH>().swap(chromCov); chromSize = chromSizes[currChrom]; chromCov.resize(chromSize); @@ -146,8 +140,117 @@ void BedCoverage::CoverageBeds(istream &bedInput) { } +void BedGenomeCoverage::CoverageBam(string bamFile) { -void BedCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, int &chromSize, string &chrom, chromHistMap &chromDepthHist) { + // Variables + map<string, int, less<string> > chromSizes; + chromHistMap chromDepthHist; + + // open the GENOME file for reading. + // if successful, load each chrom and it's size into + // the "chromSize" map. also compute the total size of the genom + // and store in "genomeSize". + ifstream genome(this->genomeFile.c_str(), ios::in); + if ( !genome ) { + cerr << "Error: The requested genome file (" <<this->genomeFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + else { + //string chrom; + //unsigned int size; + //while (genome >> chrom >> size) { + // chromSizes[chrom] = size; + //} + // Tweak from Gordon Assaf to properly ignore invalid lines. + string genomeLine; + while (getline(genome,genomeLine)) { + string chrom; + unsigned int size; + istringstream iss(genomeLine); + iss >> chrom >> size; + if (iss) { + chromSizes[chrom] = size; + } + } + } + + string prevChrom, currChrom; + vector<DEPTH> chromCov; + int chromSize = 0; + int start, end; + + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // convert each aligned BAM entry to BED + // and compute coverage on B + BamAlignment bam; + while (reader.GetNextAlignment(bam)) { + + if (bam.IsMapped()) { + + currChrom = refs.at(bam.RefID).RefName; + start = bam.Position; + end = bam.Position + bam.AlignedBases.size() - 1; + + if (currChrom != prevChrom) { + // If we've moved beyond the first encountered chromosomes, + // process the results of the previous chromosome. + if (prevChrom.length() > 0) { + ReportChromCoverage(chromCov, chromSizes[prevChrom], prevChrom, chromDepthHist); + } + + // empty the previous chromosome and reserve new + // space for the current one + std::vector<DEPTH>().swap(chromCov); + chromSize = chromSizes[currChrom]; + chromCov.resize(chromSize); + + // process the first line for this chromosome. + // make sure the coordinates fit within the chrom + if (start < chromSize) { + chromCov[start].starts++; + } + if (end < chromSize) { + chromCov[end].ends++; + } + else { + chromCov[chromSize-1].ends++; + } + } + else { + // process the other lines for this chromosome. + // make sure the coordinates fit within the chrom + if (start < chromSize) { + chromCov[start].starts++; + } + if (end < chromSize) { + chromCov[end].ends++; + } + else { + chromCov[chromSize-1].ends++; + } + } + prevChrom = currChrom; + } + } + // process the results of the last chromosome. + ReportChromCoverage(chromCov, chromSizes[currChrom], currChrom, chromDepthHist); + + if (this->eachBase == false && this->bedGraph == false) { + ReportGenomeCoverage(chromSizes, chromDepthHist); + } + + reader.Close(); +} + + +void BedGenomeCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, int &chromSize, string &chrom, chromHistMap &chromDepthHist) { if (this->eachBase) { int depth = 0; // initialize the depth @@ -195,14 +298,13 @@ void BedCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, int &chromS -void BedCoverage::ReportGenomeCoverage(map<string, int> &chromSizes, chromHistMap &chromDepthHist) { +void BedGenomeCoverage::ReportGenomeCoverage(map<string, int> &chromSizes, chromHistMap &chromDepthHist) { unsigned int genomeSize = 0; for (map<string, int, less<string> >::iterator m = chromSizes.begin(); m != chromSizes.end(); ++m) { string chrom = m->first; - genomeSize += chromSizes[chrom]; - + genomeSize += chromSizes[chrom]; // if there were no reads for a give chromosome, then // add the length of the chrom to the 0 bin. if ( chromDepthHist.find(chrom) == chromDepthHist.end() ) { @@ -236,7 +338,7 @@ void BedCoverage::ReportGenomeCoverage(map<string, int> &chromSizes, chromHistMa } -void BedCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, int &chromSize, string &chrom) { +void BedGenomeCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, int &chromSize, string &chrom) { int depth = 0; // initialize the depth int lastStart = -1 ; @@ -268,7 +370,6 @@ void BedCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, int depth = depth - chromCov[pos].ends; } - //Print information about the last position if (lastDepth != -1) { cout << chrom << "\t" << lastStart << "\t" << chromSize << "\t" << lastDepth << endl; @@ -276,16 +377,23 @@ void BedCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, int } -void BedCoverage::DetermineBedInput() { +void BedGenomeCoverage::DetermineBedInput() { if (bed->bedFile != "stdin") { // process a file - ifstream beds(bed->bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bed file (" << bed->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); + if (this->bamInput == false) { //bed/gff + ifstream beds(bed->bedFile.c_str(), ios::in); + if ( !beds ) { + cerr << "Error: The requested bed file (" << bed->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + CoverageBed(beds); } - CoverageBeds(beds); + else + CoverageBam(bed->bedFile); } - else { // process stdin - CoverageBeds(cin); + else { // process stdin + if (this->bamInput == false) + CoverageBed(cin); + else + CoverageBam("stdin"); } -} +} \ No newline at end of file diff --git a/src/genomeCoverageBed/genomeCoverageBed.h b/src/genomeCoverageBed/genomeCoverageBed.h index 14fcf868..9a0bccda 100755 --- a/src/genomeCoverageBed/genomeCoverageBed.h +++ b/src/genomeCoverageBed/genomeCoverageBed.h @@ -10,36 +10,43 @@ aaronquinlan@gmail.com Licenced under the GNU General Public License 2.0+ license. ******************************************************************************/ #include "bedFile.h" + +#include "BamReader.h" +#include "BamAux.h" +using namespace BamTools; + #include <vector> #include <iostream> #include <fstream> - using namespace std; //*********************************************** // Typedefs //*********************************************** -typedef map<int, DEPTH, less<int> > depthMap; +typedef map<int, DEPTH, less<int> > depthMap; typedef map<string, depthMap, less<string> > chromDepthMap; -typedef map<int, unsigned int, less<int> > histMap; -typedef map<string, histMap, less<string> > chromHistMap; +typedef map<int, unsigned int, less<int> > histMap; +typedef map<string, histMap, less<string> > chromHistMap; //************************************************ // Class methods and elements //************************************************ -class BedCoverage { +class BedGenomeCoverage { public: // constructor - BedCoverage(string &bedFile, string &genomeFile, bool &eachBase, bool &startSites, bool &bedGraph, int &max); + BedGenomeCoverage(string &bedFile, string &genomeFile, bool &eachBase, bool &startSites, + bool &bedGraph, int &max, bool &bamInput); // destructor - ~BedCoverage(void); + ~BedGenomeCoverage(void); + + void CoverageBed(istream &bedInput); - void CoverageBeds(istream &bedInput); + void CoverageBam(string bamFile); void ReportChromCoverage(const vector<DEPTH> &, int &chromSize, string &chrom, chromHistMap&); @@ -53,6 +60,7 @@ private: string bedFile; string genomeFile; + bool bamInput; bool eachBase; bool startSites; bool bedGraph; diff --git a/src/genomeCoverageBed/genomeCoverageMain.cpp b/src/genomeCoverageBed/genomeCoverageMain.cpp index 4efb107f..30f4afbf 100755 --- a/src/genomeCoverageBed/genomeCoverageMain.cpp +++ b/src/genomeCoverageBed/genomeCoverageMain.cpp @@ -35,6 +35,7 @@ int main(int argc, char* argv[]) { int max = 999999999; bool haveBed = false; + bool bamInput = false; bool haveGenome = false; bool startSites = false; bool bedGraph = false; @@ -66,6 +67,14 @@ int main(int argc, char* argv[]) { i++; } } + if(PARAMETER_CHECK("-ibam", 5, parameterLength)) { + if ((i+1) < argc) { + haveBed = true; + bamInput = true; + bedFile = argv[i + 1]; + i++; + } + } else if(PARAMETER_CHECK("-g", 2, parameterLength)) { if ((i+1) < argc) { haveGenome = true; @@ -103,7 +112,9 @@ int main(int argc, char* argv[]) { if (!showHelp) { - BedCoverage *bc = new BedCoverage(bedFile, genomeFile, eachBase, startSites, bedGraph, max); + BedGenomeCoverage *bc = new BedGenomeCoverage(bedFile, genomeFile, eachBase, + startSites, bedGraph, max, bamInput); + bc->DetermineBedInput(); return 0; @@ -124,6 +135,10 @@ void ShowHelp(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed> -g <genome>" << endl << endl; cerr << "Options: " << endl; + + cerr << "\t-ibam\t" << "The input file is in BAM format." << endl; + cerr << "\t\tNote: BAM _must_ be sorted by position" << endl << endl; + cerr << "\t-d\t" << "Report the depth at each genome position." << endl; cerr << "\t\tDefault behavior is to report a histogram." << endl << endl; @@ -144,9 +159,11 @@ void ShowHelp(void) { cerr << "\t..." << endl; cerr << "\tchr18_gl000207_random\t4262" << endl << endl; - cerr << "\t(2) NOTE: The input BED file must be grouped by chromosome." << endl; + cerr << "\t(2) The input BED (-i) file must be grouped by chromosome." << endl; cerr << "\t A simple \"sort -k 1,1 <BED> > <BED>.sorted\" will suffice."<< endl << endl; + cerr << "\t(3) The input BAM (-ibam) file must be sorted by position." << endl; + cerr << "\t A \"samtools sort <BAM>\" should suffice."<< endl << endl; cerr << "Tips: " << endl; cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; -- GitLab