From dc4984e6b639b01b9e6beb072a5fb9daa2fbe704 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Wed, 14 Apr 2010 21:37:41 -0400 Subject: [PATCH] Added new version of BamTools and fixed a couple minor bugs in intersectBed that were caused by the branchmerge. --- src/bamToBed/bamToBed.cpp | 2 +- src/intersectBed/intersectBed.cpp | 41 +-- src/utils/BamTools/BGZF.h | 12 +- src/utils/BamTools/BamAux.h | 583 +++++++++++++++++++----------- src/utils/BamTools/BamReader.cpp | 435 +++++++++++++--------- src/utils/BamTools/BamReader.h | 8 +- src/utils/BamTools/BamWriter.cpp | 154 ++++++-- 7 files changed, 774 insertions(+), 461 deletions(-) diff --git a/src/bamToBed/bamToBed.cpp b/src/bamToBed/bamToBed.cpp index d082f86e..0874510c 100755 --- a/src/bamToBed/bamToBed.cpp +++ b/src/bamToBed/bamToBed.cpp @@ -246,7 +246,7 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista if (bam.IsSecondMate()) name += "/2"; // rip through the CIGAR string and reconstruct the alignment coordinates - unsigned int alignmentEnd = bam.GetAlignmentEnd(); + unsigned int alignmentEnd = bam.GetEndPosition(); //ParseCigarBed(bam.CigarData, alignmentEnd); //alignmentEnd += bam.Position; diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index eff1f6ce..1b8359bf 100755 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -172,34 +172,8 @@ bool BedIntersect::FindOneOrMoreOverlap(const BED &a) { void BedIntersect::IntersectBed() { -/* - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); - string bedLine; - int lineNum = 0; // current input line number - vector<BED> hits; // vector of potential hits - vector<string> bedFields; // vector for a BED entry - - // reserve some space - hits.reserve(100); - bedFields.reserve(12); - - // process each entry in A - while (getline(bedInput, bedLine)) { - - lineNum++; - Tokenize(bedLine,bedFields); - BED a; - // find the overlaps with B if it's a valid BED entry. - if (_bedA->parseLine(a, bedFields, lineNum)) { - FindOverlaps(a, hits); - hits.clear(); - } - // reset for the next input line - bedFields.clear(); - } - */ + // load the "B" file into a map in order to + // compare each entry in A to it in search of overlaps. _bedB->loadBedFileIntoMap(); int lineNum = 0; @@ -207,6 +181,7 @@ void BedIntersect::IntersectBed() { hits.reserve(100); BED a; + // open the "A" file, process each BED entry and searh for overlaps. _bedA->Open(); while (_bedA->GetNextBed(a, lineNum) == true) { FindOverlaps(a, hits); @@ -269,7 +244,7 @@ void BedIntersect::IntersectBam(string bamFile) { writer.SaveAlignment(bam); } else { - if (_noHit == false) + if (_noHit == true) writer.SaveAlignment(bam); } } @@ -294,13 +269,11 @@ void BedIntersect::DetermineBedInput() { if (_bedA->bedFile != "stdin") { // it's BED or GFF if (_bamInput == false) { - ifstream beds(_bedA->bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bed file (" << _bedA->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } IntersectBed(); } + else { + IntersectBam(_bedA->bedFile); + } } // reading from stdin else { diff --git a/src/utils/BamTools/BGZF.h b/src/utils/BamTools/BGZF.h index 894e460e..e1adf25d 100644 --- a/src/utils/BamTools/BGZF.h +++ b/src/utils/BamTools/BGZF.h @@ -1,5 +1,5 @@ // *************************************************************************** -// BGZF.h (c) 2009 Derek Barnett, Michael Strömberg +// BGZF.h (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- @@ -130,7 +130,7 @@ bool BgzfData::CheckBlockHeader(char* header) { BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN ); } -// packs an unsigned integer into the specified buffer +// 'packs' an unsigned integer into the specified buffer inline void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) { buffer[0] = (char)value; @@ -139,14 +139,14 @@ void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) { buffer[3] = (char)(value >> 24); } -// packs an unsigned short into the specified buffer +// 'packs' an unsigned short into the specified buffer inline void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) { buffer[0] = (char)value; buffer[1] = (char)(value >> 8); } -// unpacks a buffer into a signed int +// 'unpacks' a buffer into a signed int inline signed int BgzfData::UnpackSignedInt(char* buffer) { union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; @@ -158,7 +158,7 @@ signed int BgzfData::UnpackSignedInt(char* buffer) { return un.value; } -// unpacks a buffer into an unsigned int +// 'unpacks' a buffer into an unsigned int inline unsigned int BgzfData::UnpackUnsignedInt(char* buffer) { union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; @@ -170,7 +170,7 @@ unsigned int BgzfData::UnpackUnsignedInt(char* buffer) { return un.value; } -// unpacks a buffer into an unsigned short +// 'unpacks' a buffer into an unsigned short inline unsigned short BgzfData::UnpackUnsignedShort(char* buffer) { union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; diff --git a/src/utils/BamTools/BamAux.h b/src/utils/BamTools/BamAux.h index 97342bf9..3623f1c2 100644 --- a/src/utils/BamTools/BamAux.h +++ b/src/utils/BamTools/BamAux.h @@ -1,9 +1,9 @@ // *************************************************************************** -// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg +// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 Janaury 2010 (DB) +// Last modified: 14 April 2010 (DB) // --------------------------------------------------------------------------- // Provides the basic constants, data structures, etc. for using BAM files // *************************************************************************** @@ -26,18 +26,18 @@ // Platform-specific type definitions #ifndef BAMTOOLS_TYPES #define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include <stdint.h> - #endif + #ifdef _MSC_VER + typedef char int8_t; + typedef unsigned char uint8_t; + typedef short int16_t; + typedef unsigned short uint16_t; + typedef int int32_t; + typedef unsigned int uint32_t; + typedef long long int64_t; + typedef unsigned long long uint64_t; + #else + #include <stdint.h> + #endif #endif // BAMTOOLS_TYPES namespace BamTools { @@ -62,198 +62,56 @@ const int BAM_LIDX_SHIFT = 14; // Explicit variable sizes const int BT_SIZEOF_INT = 4; -// ARQ: moved this here. -struct CigarOp { - char Type; // Operation type (MIDNSHP) - uint32_t Length; // Operation length (number of bases) -}; - +struct CigarOp; struct BamAlignment { - // Queries against alignment flag + // constructors & destructor public: - // Returns true if this read is a PCR duplicate (determined by external app) - bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } - // Returns true if this read failed quality control (determined by external app) - bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } - // Returns true if alignment is first mate on read - bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } - // Returns true if alignment is mapped - bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } - // Returns true if alignment's mate is mapped - bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } - // Returns true if alignment's mate mapped to reverse strand - bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } - // Returns true if alignment part of paired-end read - bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } - // Returns true if this position is primary alignment (determined by external app) - bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } - // Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app) - bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } - // Returns true if alignment mapped to reverse strand - bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } - // Returns true if alignment is second mate on read - bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } - - // Manipulate alignment flag - public: - // Sets "PCR duplicate" bit - void SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } - // Sets "failed quality control" bit - void SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } - // Sets "alignment is first mate" bit - void SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } - // Sets "alignment's mate is mapped" bit - void SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } - // Sets "alignment's mate mapped to reverse strand" bit - void SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } - // Sets "alignment part of paired-end read" bit - void SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } - // Sets "alignment is part of read that satisfied paired-end resolution" bit - void SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } - // Sets "alignment mapped to reverse strand" bit - void SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } - // Sets "position is primary alignment (determined by external app)" - void SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; } - // Sets "alignment is second mate on read" bit - void SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } - // Sets "alignment is mapped" bit - void SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } - + BamAlignment(void); + BamAlignment(const BamAlignment& other); + ~BamAlignment(void); + + // Queries against alignment flags + public: + bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate + bool IsFailedQC(void) const; // Returns true if this read failed quality control + bool IsFirstMate(void) const; // Returns true if alignment is first mate on read + bool IsMapped(void) const; // Returns true if alignment is mapped + bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped + bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand + bool IsPaired(void) const; // Returns true if alignment part of paired-end read + bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment + bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution + bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand + bool IsSecondMate(void) const; // Returns true if alignment is second mate on read + + // Manipulate alignment flags + public: + void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag + void SetIsFailedQC(bool ok); // Sets "failed quality control" flag + void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag + void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag + void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag + void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag + void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag + void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag + void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag + void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag + void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag + + // Tag data access methods public: + bool GetEditDistance(uint8_t& editDistance) const; // get "NM" tag data - contributed by Aaron Quinlan + bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data - // get "RG" tag data - bool GetReadGroup(std::string& readGroup) const { - - if ( TagData.empty() ) { return false; } - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLen = TagData.size(); - unsigned int numBytesParsed = 0; - - bool foundReadGroupTag = false; - while( numBytesParsed < tagDataLen ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag - if ( std::strncmp(pTagType, "RG", 2) == 0 ) { - foundReadGroupTag = true; - break; - } - - // get the storage class and find the next tag - SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed ); - } - - // return if the read group tag was not present - if ( !foundReadGroupTag ) { return false; } - - // assign the read group - const unsigned int readGroupLen = std::strlen(pTagData); - readGroup.resize(readGroupLen); - std::memcpy( (char*)readGroup.data(), pTagData, readGroupLen ); - return true; - } - - // get "NM" tag data - contributed by Aaron Quinlan - bool GetEditDistance(uint8_t& editDistance) const { - - if ( TagData.empty() ) { return false; } - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLen = TagData.size(); - unsigned int numBytesParsed = 0; - - bool foundEditDistanceTag = false; - while( numBytesParsed < tagDataLen ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - // check the current tag - if ( strncmp(pTagType, "NM", 2) == 0 ) { - foundEditDistanceTag = true; - break; - } - - // get the storage class and find the next tag - SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed ); - } - // return if the edit distance tag was not present - if ( !foundEditDistanceTag ) { return false; } - - // assign the editDistance value - memcpy(&editDistance, pTagData, 1); - return true; - } - - // ARQ: - // compute and retuen the alignment end coordinate based on the CIGAR string. - int GetAlignmentEnd(bool usePadded = false) const { - // initialize alignment end to starting position - int alignEnd = Position; - - // iterate over cigar operations - std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); - std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - char cigarType = (*cigarIter).Type; - if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { - alignEnd += (*cigarIter).Length; - } - else if (cigarType == 'I' && usePadded == true) { - alignEnd += (*cigarIter).Length; - } - } - return alignEnd; - } + // Additional data access methods + public: + int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations + // 'internal' utility methods private: - static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - switch(storageType) { - - case 'A': - case 'c': - case 'C': - ++numBytesParsed; - ++pTagData; - break; - - case 's': - case 'S': - case 'f': - numBytesParsed += 2; - pTagData += 2; - break; - - case 'i': - case 'I': - numBytesParsed += 4; - pTagData += 4; - break; - - case 'Z': - case 'H': - while(*pTagData) { - ++numBytesParsed; - ++pTagData; - } - ++pTagData; - break; - - default: - printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData); - exit(1); - } - } + static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed); // Data members public: @@ -274,48 +132,60 @@ struct BamAlignment { int32_t InsertSize; // Mate-pair insert size // Alignment flag query constants + // Use the get/set methods above instead private: - enum { PAIRED = 1, - PROPER_PAIR = 2, - UNMAPPED = 4, - MATE_UNMAPPED = 8, - REVERSE = 16, - MATE_REVERSE = 32, - READ_1 = 64, - READ_2 = 128, - SECONDARY = 256, - QC_FAILED = 512, - DUPLICATE = 1024 + enum { PAIRED = 1 + , PROPER_PAIR = 2 + , UNMAPPED = 4 + , MATE_UNMAPPED = 8 + , REVERSE = 16 + , MATE_REVERSE = 32 + , READ_1 = 64 + , READ_2 = 128 + , SECONDARY = 256 + , QC_FAILED = 512 + , DUPLICATE = 1024 }; }; // ---------------------------------------------------------------- // Auxiliary data structs & typedefs +struct CigarOp { + char Type; // Operation type (MIDNSHP) + uint32_t Length; // Operation length (number of bases) +}; + struct RefData { + // data members std::string RefName; // Name of reference sequence - int RefLength; // Length of reference sequence + int32_t RefLength; // Length of reference sequence bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence + // constructor - RefData(void) - : RefLength(0) - , RefHasAlignments(false) + RefData(const int32_t& length = 0, + bool ok = false) + : RefLength(length) + , RefHasAlignments(ok) { } }; -typedef std::vector<RefData> RefVector; +typedef std::vector<RefData> RefVector; typedef std::vector<BamAlignment> BamAlignmentVector; // ---------------------------------------------------------------- // Indexing structs & typedefs struct Chunk { + // data members uint64_t Start; uint64_t Stop; + // constructor - Chunk(const uint64_t& start = 0, const uint64_t& stop = 0) + Chunk(const uint64_t& start = 0, + const uint64_t& stop = 0) : Start(start) , Stop(stop) { } @@ -344,6 +214,289 @@ struct ReferenceIndex { typedef std::vector<ReferenceIndex> BamIndex; +// ---------------------------------------------------------------- +// BamAlignment member methods + +// constructors & destructor +inline +BamAlignment::BamAlignment(void) { } + +inline +BamAlignment::BamAlignment(const BamAlignment& other) + : Name(other.Name) + , Length(other.Length) + , QueryBases(other.QueryBases) + , AlignedBases(other.AlignedBases) + , Qualities(other.Qualities) + , TagData(other.TagData) + , RefID(other.RefID) + , Position(other.Position) + , Bin(other.Bin) + , MapQuality(other.MapQuality) + , AlignmentFlag(other.AlignmentFlag) + , CigarData(other.CigarData) + , MateRefID(other.MateRefID) + , MatePosition(other.MatePosition) + , InsertSize(other.InsertSize) +{ } + +inline +BamAlignment::~BamAlignment(void) { } + +// Queries against alignment flags +inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } +inline bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } +inline bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } +inline bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } +inline bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } +inline bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } +inline bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } +inline bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } +inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } +inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } +inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } + +// Manipulate alignment flags +inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } +inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } +inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } +inline void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } +inline void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } +inline void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } +inline void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } +inline void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } +inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; } +inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } +inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } + +// calculates alignment end position, based on starting position and CIGAR operations +inline +int BamAlignment::GetEndPosition(bool usePadded) const { + + // initialize alignment end to starting position + int alignEnd = Position; + + // iterate over cigar operations + std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); + std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + const char cigarType = (*cigarIter).Type; + if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { + alignEnd += (*cigarIter).Length; + } + else if ( usePadded && cigarType == 'I' ) { + alignEnd += (*cigarIter).Length; + } + } + return alignEnd; +} + +// get "NM" tag data - contributed by Aaron Quinlan +// stores data in 'editDistance', returns success/fail +inline +bool BamAlignment::GetEditDistance(uint8_t& editDistance) const { + + if ( TagData.empty() ) { return false; } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLen = TagData.size(); + unsigned int numBytesParsed = 0; + + bool foundEditDistanceTag = false; + while( numBytesParsed < tagDataLen ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag + if ( strncmp(pTagType, "NM", 2) == 0 ) { + foundEditDistanceTag = true; + break; + } + + // get the storage class and find the next tag + if (*pTagStorageType == '\0') { return false; } + SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed ); + if (*pTagData == '\0') { return false; } + } + // return if the edit distance tag was not present + if ( !foundEditDistanceTag ) { return false; } + + // assign the editDistance value + std::memcpy(&editDistance, pTagData, 1); + return true; +} + +// get "RG" tag data +// stores data in 'readGroup', returns success/fail +inline +bool BamAlignment::GetReadGroup(std::string& readGroup) const { + + if ( TagData.empty() ) { return false; } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLen = TagData.size(); + unsigned int numBytesParsed = 0; + + bool foundReadGroupTag = false; + while( numBytesParsed < tagDataLen ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag + if ( std::strncmp(pTagType, "RG", 2) == 0 ) { + foundReadGroupTag = true; + break; + } + + // get the storage class and find the next tag + if (*pTagStorageType == '\0') { return false; } + SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed ); + if (*pTagData == '\0') { return false; } + } + + // return if the read group tag was not present + if ( !foundReadGroupTag ) { return false; } + + // assign the read group + const unsigned int readGroupLen = std::strlen(pTagData); + readGroup.resize(readGroupLen); + std::memcpy( (char*)readGroup.data(), pTagData, readGroupLen ); + return true; +} + +inline +void BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { + + switch(storageType) { + + case 'A': + case 'c': + case 'C': + ++numBytesParsed; + ++pTagData; + break; + + case 's': + case 'S': + numBytesParsed += 2; + pTagData += 2; + break; + + case 'f': + case 'i': + case 'I': + numBytesParsed += 4; + pTagData += 4; + break; + + case 'Z': + case 'H': + while(*pTagData) { + ++numBytesParsed; + ++pTagData; + } + // --------------------------- + // Added: 3-25-2010 DWB + // Contributed: ARQ + // Fixed: error parsing variable length tag data + ++pTagData; + // --------------------------- + break; + + default: + printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData); + exit(1); + } +} + +// ---------------------------------------------------------------- +// Added: 3-35-2010 DWB +// Fixed: Routines to provide endian-correctness +// ---------------------------------------------------------------- + +// returns true if system is big endian +inline bool SystemIsBigEndian(void) { + const uint16_t one = 0x0001; + return ((*(char*) &one) == 0 ); +} + +// swaps endianness of 16-bit value 'in place' +inline void SwapEndian_16(int16_t& x) { + x = ((x >> 8) | (x << 8)); +} + +inline void SwapEndian_16(uint16_t& x) { + x = ((x >> 8) | (x << 8)); +} + +// swaps endianness of 32-bit value 'in-place' +inline void SwapEndian_32(int32_t& x) { + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | + (x << 24) + ); +} + +inline void SwapEndian_32(uint32_t& x) { + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | + (x << 24) + ); +} + +// swaps endianness of 64-bit value 'in-place' +inline void SwapEndian_64(int64_t& x) { + x = ( (x >> 56) | + ((x << 40) & 0x00FF000000000000ll) | + ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | + ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | + ((x >> 40) & 0x000000000000FF00ll) | + (x << 56) + ); +} + +inline void SwapEndian_64(uint64_t& x) { + x = ( (x >> 56) | + ((x << 40) & 0x00FF000000000000ll) | + ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | + ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | + ((x >> 40) & 0x000000000000FF00ll) | + (x << 56) + ); +} + +// swaps endianness of 'next 2 bytes' in a char buffer (in-place) +inline void SwapEndian_16p(char* data) { + uint16_t& value = (uint16_t&)*data; + SwapEndian_16(value); +} + +// swaps endianness of 'next 4 bytes' in a char buffer (in-place) +inline void SwapEndian_32p(char* data) { + uint32_t& value = (uint32_t&)*data; + SwapEndian_32(value); +} + +// swaps endianness of 'next 8 bytes' in a char buffer (in-place) +inline void SwapEndian_64p(char* data) { + uint64_t& value = (uint64_t&)*data; + SwapEndian_64(value); +} + } // namespace BamTools #endif // BAMAUX_H diff --git a/src/utils/BamTools/BamReader.cpp b/src/utils/BamTools/BamReader.cpp index ab8d044f..a2f975f9 100644 --- a/src/utils/BamTools/BamReader.cpp +++ b/src/utils/BamTools/BamReader.cpp @@ -1,9 +1,9 @@ // *************************************************************************** -// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg +// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 January 2010(DB) +// Last modified: 14 April 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -23,13 +23,23 @@ using namespace BamTools; using namespace std; +namespace BamTools { + struct BamAlignmentSupportData { + string AllCharData; + uint32_t BlockLength; + uint32_t NumCigarOperations; + uint32_t QueryNameLength; + uint32_t QuerySequenceLength; + }; +} // namespace BamTools + struct BamReader::BamReaderPrivate { // ------------------------------- // data members // ------------------------------- - // general data + // general file data BgzfData mBGZF; string HeaderText; BamIndex Index; @@ -38,6 +48,9 @@ struct BamReader::BamReaderPrivate { int64_t AlignmentsBeginOffset; string Filename; string IndexFilename; + + // system data + bool IsBigEndian; // user-specified region values bool IsRegionSpecified; @@ -68,10 +81,7 @@ struct BamReader::BamReaderPrivate { bool GetNextAlignment(BamAlignment& bAlignment); // access auxiliary data - const string GetHeaderText(void) const; - const int GetReferenceCount(void) const; - const RefVector GetReferenceData(void) const; - const int GetReferenceID(const string& refName) const; + int GetReferenceID(const string& refName) const; // index operations bool CreateIndex(void); @@ -84,8 +94,8 @@ struct BamReader::BamReaderPrivate { // calculate bins that overlap region ( left to reference end for now ) int BinsFromRegion(int refID, int left, uint16_t[MAX_BIN]); - // calculates alignment end position based on starting position and provided CIGAR operations - int CalculateAlignmentEnd(const int& position, const std::vector<CigarOp>& cigarData); + // fills out character data for BamAlignment data + bool BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData); // calculate file offset for first alignment chunk overlapping 'left' int64_t GetOffset(int refID, int left); // checks to see if alignment overlaps current region @@ -93,7 +103,7 @@ struct BamReader::BamReaderPrivate { // retrieves header text from BAM file void LoadHeaderData(void); // retrieves BAM alignment under file pointer - bool LoadNextAlignment(BamAlignment& bAlignment); + bool LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData); // builds reference data structure from BAM file void LoadReferenceData(void); @@ -111,8 +121,6 @@ struct BamReader::BamReaderPrivate { bool LoadIndex(void); // simplifies index by merging 'chunks' void MergeChunks(void); - // round-up 32-bit integer to next power-of-2 - void Roundup32(int& value); // saves index to BAM index file bool WriteIndex(void); }; @@ -142,10 +150,10 @@ bool BamReader::Rewind(void) { return d->Rewind(); } bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); } // access auxiliary data -const string BamReader::GetHeaderText(void) const { return d->HeaderText; } -const int BamReader::GetReferenceCount(void) const { return d->References.size(); } +const string BamReader::GetHeaderText(void) const { return d->HeaderText; } +int BamReader::GetReferenceCount(void) const { return d->References.size(); } const RefVector BamReader::GetReferenceData(void) const { return d->References; } -const int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } +int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } // index operations bool BamReader::CreateIndex(void) { return d->CreateIndex(); } @@ -163,7 +171,9 @@ BamReader::BamReaderPrivate::BamReaderPrivate(void) , CurrentLeft(0) , DNA_LOOKUP("=ACMGRSVTWYHKDBN") , CIGAR_LOOKUP("MIDNSHP") -{ } +{ + IsBigEndian = SystemIsBigEndian(); +} // destructor BamReader::BamReaderPrivate::~BamReaderPrivate(void) { @@ -193,6 +203,137 @@ int BamReader::BamReaderPrivate::BinsFromRegion(int refID, int left, uint16_t li return i; } +bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData) { + + // calculate character lengths/offsets + const unsigned int dataLength = supportData.BlockLength - BAM_CORE_SIZE; + const unsigned int seqDataOffset = supportData.QueryNameLength + (supportData.NumCigarOperations * 4); + const unsigned int qualDataOffset = seqDataOffset + (supportData.QuerySequenceLength+1)/2; + const unsigned int tagDataOffset = qualDataOffset + supportData.QuerySequenceLength; + const unsigned int tagDataLength = dataLength - tagDataOffset; + + // set up char buffers + const char* allCharData = supportData.AllCharData.data(); + const char* seqData = ((const char*)allCharData) + seqDataOffset; + const char* qualData = ((const char*)allCharData) + qualDataOffset; + char* tagData = ((char*)allCharData) + tagDataOffset; + + // save query sequence + bAlignment.QueryBases.clear(); + bAlignment.QueryBases.reserve(supportData.QuerySequenceLength); + for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) { + char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; + bAlignment.QueryBases.append(1, singleBase); + } + + // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character + bAlignment.Qualities.clear(); + bAlignment.Qualities.reserve(supportData.QuerySequenceLength); + for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) { + char singleQuality = (char)(qualData[i]+33); + bAlignment.Qualities.append(1, singleQuality); + } + + // parse CIGAR to build 'AlignedBases' + bAlignment.AlignedBases.clear(); + bAlignment.AlignedBases.reserve(supportData.QuerySequenceLength); + + int k = 0; + vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin(); + vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + + const CigarOp& op = (*cigarIter); + switch(op.Type) { + + case ('M') : + case ('I') : + bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases + // fall through + + case ('S') : + k += op.Length; // for 'S' - soft clip, skip over query bases + break; + + case ('D') : + bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character + break; + + case ('P') : + bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character + break; + + case ('N') : + bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence + // k+=op.Length; + break; + + case ('H') : + break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op + + default: + printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here + exit(1); + } + } + + // ----------------------- + // Added: 3-25-2010 DWB + // Fixed: endian-correctness for tag data + // ----------------------- + if ( IsBigEndian ) { + int i = 0; + while ( (unsigned int)i < tagDataLength ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i+=2; // sizeof(uint16_t) + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i+=4; // sizeof(uint32_t) + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i+=8; // sizeof(uint64_t) + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + printf("ERROR: Invalid tag value type\n"); // shouldn't get here + exit(1); + } + } + } + + // store TagData + bAlignment.TagData.clear(); + bAlignment.TagData.resize(tagDataLength); + memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); + + // return success + return true; +} + // populates BAM index data structure from BAM file data bool BamReader::BamReaderPrivate::BuildIndex(void) { @@ -323,24 +464,6 @@ bool BamReader::BamReaderPrivate::BuildIndex(void) { return Rewind(); } -// calculates alignment end position based on starting position and provided CIGAR operations -int BamReader::BamReaderPrivate::CalculateAlignmentEnd(const int& position, const vector<CigarOp>& cigarData) { - - // initialize alignment end to starting position - int alignEnd = position; - - // iterate over cigar operations - vector<CigarOp>::const_iterator cigarIter = cigarData.begin(); - vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - char cigarType = (*cigarIter).Type; - if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { - alignEnd += (*cigarIter).Length; - } - } - return alignEnd; -} - // clear index data structure void BamReader::BamReaderPrivate::ClearIndex(void) { @@ -361,17 +484,17 @@ bool BamReader::BamReaderPrivate::CreateIndex(void) { // clear out index ClearIndex(); - // build (& save) index from BAM file + // build (& save) index from BAM file bool ok = true; ok &= BuildIndex(); ok &= WriteIndex(); - // return success/fail + // return success/fail return ok; } // returns RefID for given RefName (returns References.size() if not found) -const int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const { +int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const { // retrieve names from reference data vector<string> refNames; @@ -388,20 +511,26 @@ const int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) con // get next alignment (from specified region, if given) bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { + BamAlignmentSupportData supportData; + // if valid alignment available - if ( LoadNextAlignment(bAlignment) ) { + if ( LoadNextAlignment(bAlignment, supportData) ) { // if region not specified, return success - if ( !IsRegionSpecified ) { return true; } + if ( !IsRegionSpecified ) { + bool ok = BuildCharData(bAlignment, supportData); + return ok; + } // load next alignment until region overlap is found while ( !IsOverlap(bAlignment) ) { // if no valid alignment available (likely EOF) return failure - if ( !LoadNextAlignment(bAlignment) ) { return false; } + if ( !LoadNextAlignment(bAlignment, supportData) ) { return false; } } // return success (alignment found that overlaps region) - return true; + bool ok = BuildCharData(bAlignment, supportData); + return ok; } // no valid alignment @@ -485,16 +614,13 @@ void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets const uint64_t& lastOffset) { // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; - int endOffset = ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) - 1) >> BAM_LIDX_SHIFT; + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; // resize vector if necessary int oldSize = offsets.size(); int newSize = endOffset + 1; - if ( oldSize < newSize ) { - Roundup32(newSize); - offsets.resize(newSize, 0); - } + if ( oldSize < newSize ) { offsets.resize(newSize, 0); } // store offset for(int i = beginOffset + 1; i <= endOffset ; ++i) { @@ -513,8 +639,8 @@ bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { // read starts after left boundary if ( bAlignment.Position >= CurrentLeft) { return true; } - // return whether alignment end overlaps left boundary - return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= CurrentLeft ); + // return whether alignment end overlaps left boundary + return ( bAlignment.GetEndPosition() >= CurrentLeft ); } // jumps to specified region(refID, leftBound) in BAM file, returns success/fail @@ -523,22 +649,22 @@ bool BamReader::BamReaderPrivate::Jump(int refID, int position) { // if data exists for this reference and position is valid if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { - // set current region + // set current region CurrentRefID = refID; CurrentLeft = position; IsRegionSpecified = true; - // calculate offset + // calculate offset int64_t offset = GetOffset(CurrentRefID, CurrentLeft); - // if in valid offset, return failure + // if in valid offset, return failure if ( offset == -1 ) { return false; } - // otherwise return success of seek operation + // otherwise return success of seek operation else { return mBGZF.Seek(offset); } } - // invalid jump request parameters, return failure + // invalid jump request parameters, return failure return false; } @@ -559,8 +685,9 @@ void BamReader::BamReaderPrivate::LoadHeaderData(void) { // get BAM header text length mBGZF.Read(buffer, 4); - const unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); - + unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(headerTextLength); } + // get BAM header text char* headerText = (char*)calloc(headerTextLength + 1, 1); mBGZF.Read(headerText, headerTextLength); @@ -586,8 +713,8 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { return false; } - size_t elementsRead = 0; - + size_t elementsRead = 0; + // see if index is valid BAM index char magic[4]; elementsRead = fread(magic, 1, 4, indexStream); @@ -600,7 +727,8 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { // get number of reference sequences uint32_t numRefSeqs; elementsRead = fread(&numRefSeqs, 4, 1, indexStream); - + if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); } + // intialize space for BamIndex data structure Index.reserve(numRefSeqs); @@ -610,6 +738,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { // get number of bins for this reference sequence int32_t numBins; elementsRead = fread(&numBins, 4, 1, indexStream); + if ( IsBigEndian ) { SwapEndian_32(numBins); } if (numBins > 0) { RefData& refEntry = References[i]; @@ -630,6 +759,11 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { uint32_t numChunks; elementsRead = fread(&numChunks, 4, 1, indexStream); + if ( IsBigEndian ) { + SwapEndian_32(binID); + SwapEndian_32(numChunks); + } + // intialize ChunkVector ChunkVector regionChunks; regionChunks.reserve(numChunks); @@ -643,6 +777,11 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { elementsRead = fread(&left, 8, 1, indexStream); elementsRead = fread(&right, 8, 1, indexStream); + if ( IsBigEndian ) { + SwapEndian_64(left); + SwapEndian_64(right); + } + // save ChunkPair regionChunks.push_back( Chunk(left, right) ); } @@ -659,6 +798,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { // get number of linear offsets int32_t numLinearOffsets; elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); + if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); } // intialize LinearOffsetVector LinearOffsetVector offsets; @@ -669,6 +809,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { for (int j = 0; j < numLinearOffsets; ++j) { // read a linear offset & store elementsRead = fread(&linearOffset, 8, 1, indexStream); + if ( IsBigEndian ) { SwapEndian_64(linearOffset); } offsets.push_back(linearOffset); } @@ -685,131 +826,76 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { } // populates BamAlignment with alignment data under file pointer, returns success/fail -bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { +bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData) { // read in the 'block length' value, make sure it's not zero char buffer[4]; mBGZF.Read(buffer, 4); - const unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer); - if ( blockLength == 0 ) { return false; } - - // keep track of bytes read as method progresses - int bytesRead = 4; + supportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(supportData.BlockLength); } + if ( supportData.BlockLength == 0 ) { return false; } // read in core alignment data, make sure the right size of data was read char x[BAM_CORE_SIZE]; if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } - bytesRead += BAM_CORE_SIZE; - // set BamAlignment 'core' data and character data lengths - unsigned int tempValue; - unsigned int queryNameLength; - unsigned int numCigarOperations; - unsigned int querySequenceLength; - - bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); + if ( IsBigEndian ) { + for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { + SwapEndian_32p(&x[i]); + } + } + + // set BamAlignment 'core' and 'support' data + bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); - - tempValue = BgzfData::UnpackUnsignedInt(&x[8]); + + unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); bAlignment.Bin = tempValue >> 16; bAlignment.MapQuality = tempValue >> 8 & 0xff; - queryNameLength = tempValue & 0xff; + supportData.QueryNameLength = tempValue & 0xff; - tempValue = BgzfData::UnpackUnsignedInt(&x[12]); + tempValue = BgzfData::UnpackUnsignedInt(&x[12]); bAlignment.AlignmentFlag = tempValue >> 16; - numCigarOperations = tempValue & 0xffff; + supportData.NumCigarOperations = tempValue & 0xffff; - querySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); + supportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); - - // calculate lengths/offsets - const unsigned int dataLength = blockLength - BAM_CORE_SIZE; - const unsigned int cigarDataOffset = queryNameLength; - const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4); - const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2; - const unsigned int tagDataOffset = qualDataOffset + querySequenceLength; - const unsigned int tagDataLen = dataLength - tagDataOffset; - - // set up destination buffers for character data - char* allCharData = (char*)calloc(sizeof(char), dataLength); - uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); - char* seqData = ((char*)allCharData) + seqDataOffset; - char* qualData = ((char*)allCharData) + qualDataOffset; - char* tagData = ((char*)allCharData) + tagDataOffset; - - // get character data - make sure proper data size was read + + // store 'all char data' and cigar ops + const unsigned int dataLength = supportData.BlockLength - BAM_CORE_SIZE; + const unsigned int cigarDataOffset = supportData.QueryNameLength; + + char* allCharData = (char*)calloc(sizeof(char), dataLength); + uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); + + // read in character data - make sure proper data size was read if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; } else { - - bytesRead += dataLength; - - // clear out any previous string data - bAlignment.Name.clear(); - bAlignment.QueryBases.clear(); - bAlignment.Qualities.clear(); - bAlignment.AlignedBases.clear(); + + // store alignment name and length + bAlignment.Name.assign((const char*)(allCharData)); + bAlignment.Length = supportData.QuerySequenceLength; + + // store remaining 'allCharData' in supportData structure + supportData.AllCharData.assign((const char*)allCharData, dataLength); + + // save CigarOps for BamAlignment bAlignment.CigarData.clear(); - bAlignment.TagData.clear(); - - // save name - bAlignment.Name = (string)((const char*)(allCharData)); - - // save query sequence - for (unsigned int i = 0; i < querySequenceLength; ++i) { - char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; - bAlignment.QueryBases.append( 1, singleBase ); - } - - // save sequence length - bAlignment.Length = bAlignment.QueryBases.length(); - - // save qualities, convert from numeric QV to FASTQ character - for (unsigned int i = 0; i < querySequenceLength; ++i) { - char singleQuality = (char)(qualData[i]+33); - bAlignment.Qualities.append( 1, singleQuality ); - } + for (unsigned int i = 0; i < supportData.NumCigarOperations; ++i) { - // save CIGAR-related data; - int k = 0; - for (unsigned int i = 0; i < numCigarOperations; ++i) { - - // build CigarOp struct + // swap if necessary + if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); } + + // build CigarOp structure CigarOp op; op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; // save CigarOp bAlignment.CigarData.push_back(op); - - // build AlignedBases string - switch (op.Type) { - case ('M') : - case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases - case ('S') : k += op.Length; // for 'S' - skip over query bases - break; - - case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character - break; - - case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character; - break; - - case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence - //k += op.Length; - break; - - case ('H') : break; // for 'H' - do nothing, move to next op - - default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); - } } - - // read in the tag data - bAlignment.TagData.resize(tagDataLen); - memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen); } free(allCharData); @@ -822,7 +908,8 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) { // get number of reference sequences char buffer[4]; mBGZF.Read(buffer, 4); - const unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); + unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); } if (numberRefSeqs == 0) { return; } References.reserve((int)numberRefSeqs); @@ -831,13 +918,15 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) { // get length of reference name mBGZF.Read(buffer, 4); - const unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); + unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(refNameLength); } char* refName = (char*)calloc(refNameLength, 1); // get reference name and reference sequence length mBGZF.Read(refName, refNameLength); mBGZF.Read(buffer, 4); - const int refLength = BgzfData::UnpackSignedInt(buffer); + int refLength = BgzfData::UnpackSignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(refLength); } // store data for reference RefData aReference; @@ -946,17 +1035,6 @@ bool BamReader::BamReaderPrivate::Rewind(void) { return mBGZF.Seek(AlignmentsBeginOffset); } -// rounds value up to next power-of-2 (used in index building) -void BamReader::BamReaderPrivate::Roundup32(int& value) { - --value; - value |= value >> 1; - value |= value >> 2; - value |= value >> 4; - value |= value >> 8; - value |= value >> 16; - ++value; -} - // saves index data to BAM index file (".bai"), returns success/fail bool BamReader::BamReaderPrivate::WriteIndex(void) { @@ -972,6 +1050,7 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { // write number of reference sequences int32_t numReferenceSeqs = Index.size(); + if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); } fwrite(&numReferenceSeqs, 4, 1, indexStream); // iterate over reference sequences @@ -986,6 +1065,7 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { // write number of bins int32_t binCount = binMap.size(); + if ( IsBigEndian ) { SwapEndian_32(binCount); } fwrite(&binCount, 4, 1, indexStream); // iterate over bins @@ -994,14 +1074,16 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { for ( ; binIter != binEnd; ++binIter ) { // get bin data (key and chunk vector) - const uint32_t& binKey = (*binIter).first; + uint32_t binKey = (*binIter).first; const ChunkVector& binChunks = (*binIter).second; // save BAM bin key + if ( IsBigEndian ) { SwapEndian_32(binKey); } fwrite(&binKey, 4, 1, indexStream); // save chunk count int32_t chunkCount = binChunks.size(); + if ( IsBigEndian ) { SwapEndian_32(chunkCount); } fwrite(&chunkCount, 4, 1, indexStream); // iterate over chunks @@ -1011,9 +1093,14 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { // get current chunk data const Chunk& chunk = (*chunkIter); - const uint64_t& start = chunk.Start; - const uint64_t& stop = chunk.Stop; + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + if ( IsBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + // save chunk offsets fwrite(&start, 8, 1, indexStream); fwrite(&stop, 8, 1, indexStream); @@ -1022,6 +1109,7 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { // write linear offsets size int32_t offsetSize = offsets.size(); + if ( IsBigEndian ) { SwapEndian_32(offsetSize); } fwrite(&offsetSize, 4, 1, indexStream); // iterate over linear offsets @@ -1030,7 +1118,8 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { for ( ; offsetIter != offsetEnd; ++offsetIter ) { // write linear offset value - const uint64_t& linearOffset = (*offsetIter); + uint64_t linearOffset = (*offsetIter); + if ( IsBigEndian ) { SwapEndian_64(linearOffset); } fwrite(&linearOffset, 8, 1, indexStream); } } diff --git a/src/utils/BamTools/BamReader.h b/src/utils/BamTools/BamReader.h index 6332cde7..8047d7a7 100644 --- a/src/utils/BamTools/BamReader.h +++ b/src/utils/BamTools/BamReader.h @@ -1,9 +1,9 @@ // *************************************************************************** -// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg +// BamReader.h (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 8 December 2009 (DB) +// Last modified: 30 March 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -59,11 +59,11 @@ class BamReader { // returns SAM header text const std::string GetHeaderText(void) const; // returns number of reference sequences - const int GetReferenceCount(void) const; + int GetReferenceCount(void) const; // returns vector of reference objects const BamTools::RefVector GetReferenceData(void) const; // returns reference id (used for BamReader::Jump()) for the given reference name - const int GetReferenceID(const std::string& refName) const; + int GetReferenceID(const std::string& refName) const; // ---------------------- // BAM index operations diff --git a/src/utils/BamTools/BamWriter.cpp b/src/utils/BamTools/BamWriter.cpp index d3363807..2cd2742d 100644 --- a/src/utils/BamTools/BamWriter.cpp +++ b/src/utils/BamTools/BamWriter.cpp @@ -1,9 +1,9 @@ // *************************************************************************** -// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett +// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 8 December 2009 (DB) +// Last modified: 30 March 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -21,9 +21,14 @@ struct BamWriter::BamWriterPrivate { // data members BgzfData mBGZF; - + bool IsBigEndian; + + // constructor / destructor - BamWriterPrivate(void) { } + BamWriterPrivate(void) { + IsBigEndian = SystemIsBigEndian(); + } + ~BamWriterPrivate(void) { mBGZF.Close(); } @@ -139,27 +144,34 @@ void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, strin while(*pQuery) { switch(*pQuery) { + case '=': - nucleotideCode = 0; - break; + nucleotideCode = 0; + break; + case 'A': - nucleotideCode = 1; - break; + nucleotideCode = 1; + break; + case 'C': - nucleotideCode = 2; - break; + nucleotideCode = 2; + break; + case 'G': - nucleotideCode = 4; - break; + nucleotideCode = 4; + break; + case 'T': - nucleotideCode = 8; - break; + nucleotideCode = 8; + break; + case 'N': - nucleotideCode = 15; - break; + nucleotideCode = 15; + break; + default: - printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); + printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + exit(1); } // pack the nucleotide code @@ -193,7 +205,8 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); // write the SAM header text length - const unsigned int samHeaderLen = samHeader.size(); + uint32_t samHeaderLen = samHeader.size(); + if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); } mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); // write the SAM header text @@ -202,7 +215,8 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam } // write the number of reference sequences - const unsigned int numReferenceSequences = referenceSequences.size(); + uint32_t numReferenceSequences = referenceSequences.size(); + if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); } mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); // ============================= @@ -213,14 +227,17 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { // write the reference sequence name length - const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1; + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); } mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); // write the reference sequence name mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); // write the reference sequence length - mBGZF.Write((char*)&rsIter->RefLength, BT_SIZEOF_INT); + int32_t referenceLength = rsIter->RefLength; + if ( IsBigEndian ) { SwapEndian_32(referenceLength); } + mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); } } @@ -243,10 +260,17 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { const unsigned int encodedQueryLen = encodedQuery.size(); // store the tag data length + // ------------------------------------------- + // Modified: 3-25-2010 DWB + // Contributed: ARQ + // Fixed: "off by one" error when parsing tags in files produced by BamWriter const unsigned int tagDataLength = al.TagData.size(); - + // original line: + // const unsigned int tagDataLength = al.TagData.size() + 1; + // ------------------------------------------- + // assign the BAM core data - unsigned int buffer[8]; + uint32_t buffer[8]; buffer[0] = al.RefID; buffer[1] = al.Position; buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen; @@ -257,18 +281,40 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { buffer[7] = al.InsertSize; // write the block size - const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength; - const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength; + unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + if ( IsBigEndian ) { SwapEndian_32(blockSize); } mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); // write the BAM core + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) { + SwapEndian_32(buffer[i]); + } + } mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); // write the query name mBGZF.Write(al.Name.c_str(), nameLen); // write the packed cigar - mBGZF.Write(packedCigar.data(), packedCigarLen); + if ( IsBigEndian ) { + + char* cigarData = (char*)calloc(sizeof(char), packedCigarLen); + memcpy(cigarData, packedCigar.data(), packedCigarLen); + + for (unsigned int i = 0; i < packedCigarLen; ++i) { + if ( IsBigEndian ) { + SwapEndian_32p(&cigarData[i]); + } + } + + mBGZF.Write(cigarData, packedCigarLen); + free(cigarData); + + } else { + mBGZF.Write(packedCigar.data(), packedCigarLen); + } // write the encoded query sequence mBGZF.Write(encodedQuery.data(), encodedQueryLen); @@ -280,5 +326,57 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { mBGZF.Write(pBaseQualities, queryLen); // write the read group tag - mBGZF.Write(al.TagData.data(), tagDataLength); + if ( IsBigEndian ) { + + char* tagData = (char*)calloc(sizeof(char), tagDataLength); + memcpy(tagData, al.TagData.data(), tagDataLength); + + int i = 0; + while ( (unsigned int)i < tagDataLength ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i+=2; // sizeof(uint16_t) + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i+=4; // sizeof(uint32_t) + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i+=8; // sizeof(uint64_t) + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + printf("ERROR: Invalid tag value type\n"); // shouldn't get here + free(tagData); + exit(1); + } + } + + mBGZF.Write(tagData, tagDataLength); + free(tagData); + } else { + mBGZF.Write(al.TagData.data(), tagDataLength); + } } -- GitLab