From b3a33c722b364c010b0d6c5315d16ddfa5a471f3 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Mon, 26 Apr 2010 09:43:15 -0400 Subject: [PATCH] Fixed bugs in bamToBed. 1. Writing BEDPE now requires a BAM sorted/grouped by query. 2. BEDPE output respects CIGARs on both ends. 3. End coordinates for all output formats respect CIGAR. 4. Default score for BEDPE is min mapping quality. --- src/bamToBed/bamToBed.cpp | 229 ++++++++++++++++++++----------- src/utils/BamTools/BamAux.h | 5 +- src/utils/BamTools/BamWriter.cpp | 3 +- 3 files changed, 154 insertions(+), 83 deletions(-) diff --git a/src/bamToBed/bamToBed.cpp b/src/bamToBed/bamToBed.cpp index ba9770c7..3cc5b015 100755 --- a/src/bamToBed/bamToBed.cpp +++ b/src/bamToBed/bamToBed.cpp @@ -31,11 +31,18 @@ using namespace std; // function declarations void ShowHelp(void); -void ParseCigarBed(const vector<CigarOp> cigar, int &end); -void ParseCigarBed2(const vector<CigarOp> cigar, vector<int> &blockStarts, vector<int> &blockEnds, int &end); -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, bool usePadded = false); + +void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, + const bool &writeBed12, const string &color); +void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance); + +void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance); void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, string color = "255,0,0"); -void PrintBedPE(const BamAlignment &bam, const RefVector &refs, bool useEditDistance); +void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam1, + const RefVector &refs, bool useEditDistance); + +void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, + vector<int> &blockEnds, int &alignmentEnd); bool IsCorrectMappingForBEDPE (const BamAlignment &bam); @@ -53,7 +60,6 @@ int main(int argc, char* argv[]) { bool writeBedPE = false; bool writeBed12 = false; bool useEditDistance = false; - bool usePadded = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -90,9 +96,6 @@ int main(int argc, char* argv[]) { else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { useEditDistance = true; } - else if(PARAMETER_CHECK("-padded", 7, parameterLength)) { - usePadded = true; - } else if(PARAMETER_CHECK("-color", 6, parameterLength)) { if ((i+1) < argc) { haveColor = true; @@ -111,48 +114,17 @@ int main(int argc, char* argv[]) { cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; showHelp = true; } - if (writeBedPE && useEditDistance) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use edit distance with BEDPE. " << endl << "*****" << endl; - showHelp = true; - } if (haveColor && writeBed12 == false) { cerr << endl << "*****" << endl << "*****ERROR: Cannot use color without BED12. " << endl << "*****" << endl; showHelp = true; } + // if there are no problems, let's convert BAM to BED or BEDPE if (!showHelp) { - - // open the BAM file - BamReader reader; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - BamAlignment bam; - while (reader.GetNextAlignment(bam)) { - if (writeBedPE == false && writeBed12 == false) { - if (bam.IsMapped()) { - PrintBed(bam, refs, useEditDistance); - } - } - else if (writeBedPE == false && writeBed12 == true) { - if (bam.IsMapped()) { - PrintBed12(bam, refs, useEditDistance, color); - } - } - else { - if ( ! bam.IsPaired() ) { // ensure that the BAM file is paired. - cerr << "Encountered an unpaired alignment. Are you sure this is a paired BAM file? Exiting." << endl; - exit(1); - } - else if (IsCorrectMappingForBEDPE(bam)) { // only report one of the two mappings for a pair - PrintBedPE(bam, refs, useEditDistance); - } - } - } - reader.Close(); + if (writeBedPE == false) + ConvertBamToBed(bamFile, useEditDistance, writeBed12, color); // BED or "blocked BED" + else + ConvertBamToBedpe(bamFile, useEditDistance); // BEDPE } else { ShowHelp(); @@ -172,13 +144,18 @@ void ShowHelp(void) { cerr << "Options: " << endl; - cerr << "\t-bedpe\t" << "Write BEDPE format." << endl << endl; + cerr << "\t-bedpe\t" << "Write BEDPE format." << endl; + cerr << "\t\t- Requires BAM to be grouped or sorted by query." << endl << endl; + cerr << "\t-bed12\t" << "Write \"blocked\" BED format (aka \"BED12\")." << endl << endl; cerr << "\t\thttp://genome-test.cse.ucsc.edu/FAQ/FAQformat#format1" << endl << endl; - cerr << "\t-ed\t" << "Use BAM edit distance (NM tag) for score." << endl; - cerr << "\t\tDefault is to use mapping quality." << endl; - cerr << "\t\tNot available for BEDPE format." << endl << endl; + cerr << "\t-ed\t" << "Use BAM edit distance (NM tag) for BED score." << endl; + cerr << "\t\t- Default for BED is to use mapping quality." << endl; + cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl; + cerr << "\t\t of the two mapping qualities for the pair." << endl; + cerr << "\t\t- When -ed is used with -bedpe, the total edit" << endl; + cerr << "\t\t distance from the two mates is reported." << endl << endl; cerr << "\t-color\t" << "An R,G,B string for the color used with BED12 format." << endl; cerr << "\t\tDefault is (255,0,0)." << endl; @@ -189,7 +166,67 @@ void ShowHelp(void) { } -void ParseCigarBed12(const vector<CigarOp> cigar, vector<int> &blockStarts, vector<int> &blockLengths, unsigned int &alignmentEnd) { +void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, + const bool &writeBed12, const string &color) { + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // rip through the BAM file and convert each mapped entry to BED + BamAlignment bam; + while (reader.GetNextAlignment(bam)) { + if (bam.IsMapped() == true) { + if (writeBed12 == false) // BED + PrintBed(bam, refs, useEditDistance); + else //"blocked" BED + PrintBed12(bam, refs, useEditDistance, color); + } + } + reader.Close(); +} + + +/* + Assumptions: + 1. The BAM file is grouped/sorted by query name, + not alignment position +*/ +void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance) { + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // rip through the BAM file and convert each mapped entry to BEDPE + BamAlignment bam1, bam2; + while (reader.GetNextAlignment(bam1)) { + // the alignment must be paired + if (bam1.IsPaired() == true) { + // grab the second alignment for the pair. + reader.GetNextAlignment(bam2); + + // require that the alignments are from the same query + if (bam1.Name == bam2.Name) { + PrintBedPE(bam1, bam2, refs, useEditDistance); + } + else { + cerr << "*****ERROR: -bedpe requires BAM to be sorted/grouped by query name. " << endl; + exit(1); + } + } + } + reader.Close(); +} + + +void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vector<int> &blockLengths, unsigned int &alignmentEnd) { int currPosition = 0; int blockLength = 0; @@ -206,6 +243,8 @@ void ParseCigarBed12(const vector<CigarOp> cigar, vector<int> &blockStarts, vect case ('I') : break; case ('S') : break; case ('D') : break; + blockLength += cigItr->Length; + currPosition += cigItr->Length; case ('P') : break; case ('N') : blockStarts.push_back(currPosition + cigItr->Length); @@ -214,25 +253,30 @@ void ParseCigarBed12(const vector<CigarOp> cigar, vector<int> &blockStarts, vect blockLength = 0; case ('H') : break; // for 'H' - do nothing, move to next op default : - printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here + printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here exit(1); } } + // add the kast block and set the + // alignment end (i.e., relative to the start) blockLengths.push_back(blockLength); alignmentEnd = currPosition; } -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, bool usePadded) { +void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance) { - // set the name of the feature based on the sequence + // set the strand string strand = "+"; - if (bam.IsReverseStrand()) strand = "-"; + if (bam.IsReverseStrand() == true) strand = "-"; + + // set the name of the feature based on the sequence string name = bam.Name; - if (bam.IsFirstMate()) name += "/1"; - if (bam.IsSecondMate()) name += "/2"; + if (bam.IsFirstMate() == true) name += "/1"; + if (bam.IsSecondMate() == true) name += "/2"; - unsigned int alignmentEnd = bam.GetEndPosition(usePadded); + // get the unpadded (parm = false) end position based on the CIGAR + unsigned int alignmentEnd = bam.GetEndPosition(false); // report the alignment in BED6 format. if (useEditDistance == false) { @@ -255,10 +299,11 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, string color) { - // set the name of the feature based on the sequence + // set the strand string strand = "+"; if (bam.IsReverseStrand()) strand = "-"; + // set the name of the feature based on the sequence string name = bam.Name; if (bam.IsFirstMate()) name += "/1"; if (bam.IsSecondMate()) name += "/2"; @@ -269,10 +314,10 @@ void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDist vector<int> blockStarts; blockStarts.push_back(0); + // extract the block starts and lengths from the CIGAR string ParseCigarBed12(bam.CigarData, blockStarts, blockLengths, alignmentEnd); alignmentEnd += bam.Position; - // write BED6 portion if (useEditDistance == false) { printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, @@ -308,41 +353,67 @@ void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDist } -void PrintBedPE(const BamAlignment &bam, const RefVector &refs, bool useEditDistance) { +void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, bool useEditDistance) { + // initialize BEDPE variables string chrom1, chrom2, strand1, strand2; int start1, start2, end1, end2; + uint8_t editDistance1, editDistance2; start1 = start2 = end1 = end2 = -1; chrom1 = chrom2 = strand1 = strand2 = "."; + editDistance1 = editDistance2 = 0; - // end 1 - if (bam.IsMapped()) { - chrom1 = refs.at(bam.RefID).RefName; - start1 = bam.Position; - end1 = bam.Position + bam.AlignedBases.size(); // use the aligned length, - // not the orig. length + // compute the minimum mapping quality b/w the two ends of the pair. + uint16_t minMapQuality = min(bam1.MapQuality, bam2.MapQuality); + + // extract relevant info for end 1 + if (bam1.IsMapped()) { + chrom1 = refs.at(bam1.RefID).RefName; + start1 = bam1.Position; + end1 = bam1.GetEndPosition(false); strand1 = "+"; - if (bam.IsReverseStrand()) strand1 = "-"; + if (bam1.IsReverseStrand()) strand1 = "-"; + + // extract the edit distance from the NM tag + // if possible. otherwise, complain. + if (useEditDistance == true && bam1.GetEditDistance(editDistance1) == false) { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } } - // end 2 - if (bam.IsMateMapped()) { - chrom2 = refs.at(bam.MateRefID).RefName; - start2 = bam.MatePosition; - end2 = bam.MatePosition + bam.AlignedBases.size(); // use the aligned length, - // not the orig. length + // extract relevant info for end 2 + if (bam2.IsMapped()) { + chrom2 = refs.at(bam2.RefID).RefName; + start2 = bam2.Position; + end2 = bam2.GetEndPosition(false); strand2 = "+"; - if (bam.IsMateReverseStrand()) strand2 = "-"; + if (bam2.IsReverseStrand()) strand2 = "-"; + + // extract the edit distance from the NM tag + // if possible. otherwise, complain. + if (useEditDistance == true && bam2.GetEditDistance(editDistance2) == false) { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } } - printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", - chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, - bam.Name.c_str(), bam.MapQuality, strand1.c_str(), strand2.c_str()); - + // report BEDPE using min mapQuality + if (useEditDistance == false) { + printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", + chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, + bam1.Name.c_str(), minMapQuality, strand1.c_str(), strand2.c_str()); + } + // report BEDPE using total edit distance + else { + uint16_t totalEditDistance = editDistance1 + editDistance2; + printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", + chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, + bam1.Name.c_str(), totalEditDistance, strand1.c_str(), strand2.c_str()); + } } - - +// deprecated. bool IsCorrectMappingForBEDPE (const BamAlignment &bam) { if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) { diff --git a/src/utils/BamTools/BamAux.h b/src/utils/BamTools/BamAux.h index 0c4caa59..b56c929e 100644 --- a/src/utils/BamTools/BamAux.h +++ b/src/utils/BamTools/BamAux.h @@ -111,7 +111,7 @@ struct BamAlignment { // Additional data access methods public: int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations - + // 'internal' utility methods private: static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed); @@ -507,8 +507,7 @@ bool BamAlignment::AddBamTag(const std::string &tag, const std::string &valType, // :::Step 2::: // Add the requested tag. Note that a NULL terminator is needed for string tags (i.e. Z and H) std::string newTag = tag + valType + value + '\0'; - TagData += newTag; - + TagData.append(newTag); return true; } diff --git a/src/utils/BamTools/BamWriter.cpp b/src/utils/BamTools/BamWriter.cpp index 2cd2742d..b5294353 100644 --- a/src/utils/BamTools/BamWriter.cpp +++ b/src/utils/BamTools/BamWriter.cpp @@ -26,7 +26,7 @@ struct BamWriter::BamWriterPrivate { // constructor / destructor BamWriterPrivate(void) { - IsBigEndian = SystemIsBigEndian(); + IsBigEndian = SystemIsBigEndian(); } ~BamWriterPrivate(void) { @@ -241,6 +241,7 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam } } + // saves the alignment to the alignment archive void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { -- GitLab