From b9ab28d05ce8051f4631f6469fd2c9578978840c Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Wed, 18 Aug 2010 17:09:55 -0400 Subject: [PATCH] Big commit. 1. Added new version of BamTools supporting uncompressed BAM. 2. Added -ubam option to intersectBed. 3. First functional release of annotateBed. 4. Updated GetEditDistance calls to GetTag(NM) per the new API change in BamTools. --- Makefile | 3 +- src/annotateBed/Makefile | 13 +- src/annotateBed/annotateBed.cpp | 263 +++++--- src/annotateBed/annotateBed.h | 54 +- src/annotateBed/annotateMain.cpp | 107 ++- src/bamToBed/bamToBed.cpp | 16 +- src/intersectBed/intersectBed.cpp | 7 +- src/intersectBed/intersectBed.h | 3 +- src/intersectBed/intersectMain.cpp | 7 +- src/pairToBed/pairToBed.h | 6 +- src/utils/BamTools/BGZF.cpp | 109 +-- src/utils/BamTools/BGZF.h | 134 ++-- src/utils/BamTools/BamAux.h | 930 ++++++++++++++++---------- src/utils/BamTools/BamIndex.cpp | 8 +- src/utils/BamTools/BamIndex.h | 5 +- src/utils/BamTools/BamMultiReader.cpp | 2 +- src/utils/BamTools/BamReader.cpp | 4 +- src/utils/BamTools/BamWriter.cpp | 18 +- src/utils/BamTools/BamWriter.h | 7 +- src/utils/bedFile/bedFile.cpp | 68 ++ src/utils/bedFile/bedFile.h | 43 +- 21 files changed, 1158 insertions(+), 649 deletions(-) diff --git a/Makefile b/Makefile index 363fbd1b..9393e69f 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,8 @@ export CXX = g++ export CXXFLAGS = -Wall -O2 export LIBS = -lz -SUBDIRS = $(SRC_DIR)/bamToBed \ +SUBDIRS = $(SRC_DIR)/annotateBed \ + $(SRC_DIR)/bamToBed \ $(SRC_DIR)/bedToBam \ $(SRC_DIR)/bedToIgv \ $(SRC_DIR)/bed12ToBed6 \ diff --git a/src/annotateBed/Makefile b/src/annotateBed/Makefile index e15bf8e4..e87944c0 100644 --- a/src/annotateBed/Makefile +++ b/src/annotateBed/Makefile @@ -5,17 +5,17 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= coverageMain.cpp coverageBed.cpp +SOURCES= annotateMain.cpp annotateBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o lineFileUtilities.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o BamReader.o BamAncillary.o BGZF.o BamIndex.o gzstream.o fileType.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) -PROGRAM= coverageBed +PROGRAM= annotateBed all: $(PROGRAM) @@ -33,7 +33,10 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/annotateBed/annotateBed.cpp b/src/annotateBed/annotateBed.cpp index b270c8e3..ad46f3c7 100644 --- a/src/annotateBed/annotateBed.cpp +++ b/src/annotateBed/annotateBed.cpp @@ -12,125 +12,190 @@ #include "lineFileUtilities.h" #include "annotateBed.h" +// build +BedAnnotate::BedAnnotate(const string &mainFile, const vector<string> &annoFileNames, + const vector<string> &annoTitles, bool forceStrand, bool reportCounts, bool reportBoth) : + + _mainFile(mainFile), + _annoFileNames(annoFileNames), + _annoTitles(annoTitles), + _forceStrand(_forceStrand), + _reportCounts(reportCounts), + _reportBoth(reportBoth) +{ + _bed = new BedFile(_mainFile); +} + + +// destroy and delete the open file pointers +BedAnnotate::~BedAnnotate(void) { + delete _bed; + CloseAnnoFiles(); +} -BedAnnotate::BedAnnotate (const string &bedFile, const vector<string> &annotationFiles, bool &forceStrand) { - this->bedFile = bedFile; - this->annotationFiles = annotationFiles; - - this->bed = new BedFile(bedFile); - this->forceStrand = forceStrand; +void BedAnnotate::OpenAnnoFiles() { + for (size_t i=0; i < _annoFileNames.size(); ++i) { + BedFile *file = new BedFile(_annoFileNames[i]); + file->Open(); + _annoFiles.push_back(file); + } +} + + +void BedAnnotate::CloseAnnoFiles() { + for (size_t i=0; i < _annoFiles.size(); ++i) { + BedFile *file = _annoFiles[i]; + delete file; + _annoFiles[i] = NULL; + } } +void BedAnnotate::PrintHeader() { + // print a hash to indicate header and then write a tab + // for each field in the main file. + printf("#"); + for (size_t i = 0; i < _bed->bedType; ++i) + printf("\t"); -BedAnnotate::~BedAnnotate (void) { + // now print the label for each file. + for (size_t i = 0; i < _annoTitles.size(); ++i) + printf("%s\t", _annoTitles[i].c_str()); + printf("\n"); } -void BedAnnotate::ProcessAnnotations (istream &bedInput) { - - // loop through each of the annotation files and compute the coverage - // of each with respect to the input BED file. - vector<string>::const_iterator annotItr = this->annotationFiles.begin(); - vector<string>::const_iterator annotEnd = this->annotationFiles.end(); - for (; annotItr != annotEnd; ++annotItr) { - // create a BED file of the current annotation file. - BedFile annotation = new BedFile(*annotItr); - // compute the coverage of the annotation file with respect to the input file. - GetCoverage(annotation, bedInput); - } +void BedAnnotate::InitializeMainFile() { + // process each chromosome + masterBedCovListMap::iterator chromItr = _bed->bedCovListMap.begin(); + masterBedCovListMap::iterator chromEnd = _bed->bedCovListMap.end(); + for (; chromItr != chromEnd; ++chromItr) { + // for each chrom, process each bin + binsToBedCovLists::iterator binItr = chromItr->second.begin(); + binsToBedCovLists::iterator binEnd = chromItr->second.end(); + for (; binItr != binEnd; ++binItr) { + // initialize BEDCOVLIST in this chrom/bin + vector<BEDCOVLIST>::iterator bedItr = binItr->second.begin(); + vector<BEDCOVLIST>::iterator bedEnd = binItr->second.end(); + for (; bedItr != bedEnd; ++bedItr) { + // initialize the depthMaps, counts, etc. for each anno file. + for (size_t i = 0; i < _annoFiles.size(); ++i) { + map<unsigned int, DEPTH> dummy; + bedItr->depthMapList.push_back(dummy); + bedItr->counts.push_back(0); + bedItr->minOverlapStarts.push_back(INT_MAX); + } + } + } + } } - -void BedAnnotate::GetCoverage (const BedFile &annotation, istream &bedInput) { + +void BedAnnotate::AnnotateBed() { - // load the annotation bed file into a map so - // that we can easily compare "A" to it for overlaps - annotation->loadBedFileIntoMap(); - - string bedLine; - int lineNum = 0; // current input line number - vector<string> bedFields; // vector for a BED entry - bedFields.reserve(12); - - // process each entry in A - while (getline(bedInput, bedLine)) { - - lineNum++; - Tokenize(bedLine,bedFields); - BED a; - - if (bedA->parseLine(a, bedFields, lineNum)) { - // count a as a hit with all the relevant features in B - bedB->countHits(a, this->forceStrand); - } - // reset for the next input line - bedFields.clear(); + // load the "main" bed file into a map so + // that we can easily compare each annoFile to it for overlaps + _bed->loadBedCovListFileIntoMap(); + // open the annotations files for processing; + OpenAnnoFiles(); + // initialize counters, depths, etc. for the main file + InitializeMainFile(); + + // annotate the main file with the coverage from the annotation files. + for (size_t annoIndex = 0; annoIndex < _annoFiles.size(); ++annoIndex) { + // grab the current annotation file. + BedFile *anno = _annoFiles[annoIndex]; + int lineNum = 0; + BED a, nullBed; + BedLineStatus bedStatus; + + // process each entry in the current anno file + while ((bedStatus = anno->GetNextBed(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + _bed->countListHits(a, annoIndex, _forceStrand); + a = nullBed; + } + } } - - // now, report the count of hits for each feature in B. - masterBedMap::const_iterator chromItr = bedB->bedMap.begin(); - masterBedMap::const_iterator chromEnd = bedB->bedMap.end(); - for (; chromItr != chromEnd; ++chromItr) { - binsToBeds::const_iterator binItr = chromItr->second.begin(); - binsToBeds::const_iterator binEnd = chromItr->second.end(); - for (; binItr != binEnd; ++binItr) { + // report the annotations of the main file from the anno file. + ReportAnnotations(); + // close the annotations files; + CloseAnnoFiles(); +} - vector<BED>::const_iterator bedItr = binItr->second.begin(); - vector<BED>::const_iterator bedEnd = binItr->second.end(); - for (; bedItr != bedEnd; ++bedItr) { - - int zeroDepthCount = 0; - int depth = 0; - int start = min(bedItr->minOverlapStart, bedItr->start); - - for (int pos = start+1; pos <= bedItr->end; pos++) { - - if (bedItr->depthMap.find(pos) != bedItr->depthMap.end()) { - - map<unsigned int, DEPTH> dMap = bedItr->depthMap; - depth += dMap[pos].starts; - - if ((depth == 0) && (pos > bedItr->start) && (pos <= bedItr->end)) { - zeroDepthCount++; - } - - depth = depth - dMap[pos].ends; - } - else { - if ((depth == 0) && (pos > bedItr->start) && (pos <= bedItr->end)) { - zeroDepthCount++; - } - } - } - // Report the coverage for the current interval. - int length = bedItr->end - bedItr->start; - int nonZeroBases = (length-zeroDepthCount); - float fractCovered = (float) nonZeroBases /length; +void BedAnnotate::ReportAnnotations() { + + if (_annoTitles.size() > 0) { + PrintHeader(); + } + + // process each chromosome + masterBedCovListMap::const_iterator chromItr = _bed->bedCovListMap.begin(); + masterBedCovListMap::const_iterator chromEnd = _bed->bedCovListMap.end(); + for (; chromItr != chromEnd; ++chromItr) { + // for each chrom, process each bin + binsToBedCovLists::const_iterator binItr = chromItr->second.begin(); + binsToBedCovLists::const_iterator binEnd = chromItr->second.end(); + for (; binItr != binEnd; ++binItr) { + // for each chrom & bin, compute and report + // the observed coverage for each feature + vector<BEDCOVLIST>::const_iterator bedItr = binItr->second.begin(); + vector<BEDCOVLIST>::const_iterator bedEnd = binItr->second.end(); + for (; bedItr != bedEnd; ++bedItr) { + // print the main BED entry. + _bed->reportBedTab(*bedItr); + + // now report the coverage from each annotation file. + for (size_t i = 0; i < _annoFiles.size(); ++i) { + unsigned int totalLength = 0; + int zeroDepthCount = 0; // number of bases with zero depth + int depth = 0; // tracks the depth at the current base - bedB->reportBedTab(*bedItr); - printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, nonZeroBases, length, fractCovered); + // the start is either the first base in the feature OR + // the leftmost position of an overlapping feature. e.g. (s = start): + // A ---------- + // B s ------------ + int start = min(bedItr->minOverlapStarts[i], bedItr->start); + + map<unsigned int, DEPTH>::const_iterator depthItr; + map<unsigned int, DEPTH>::const_iterator depthEnd; + + // compute the coverage observed at each base in the feature marching from start to end. + for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) { + // map pointer grabbing the starts and ends observed at this position + depthItr = bedItr->depthMapList[i].find(pos); + depthEnd = bedItr->depthMapList[i].end(); + + // increment coverage if starts observed at this position. + if (depthItr != depthEnd) + depth += depthItr->second.starts; + // update zero depth + if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0)) + zeroDepthCount++; + // decrement coverage if ends observed at this position. + if (depthItr != depthEnd) + depth = depth - depthItr->second.ends; + } + // Summarize the coverage for the current interval, + CHRPOS length = bedItr->end - bedItr->start; + totalLength += length; + int nonZeroBases = (length - zeroDepthCount); + float fractCovered = (float) nonZeroBases / length; + if (_reportCounts == false && _reportBoth == false) + printf("%f\t", fractCovered); + else if (_reportCounts == true && _reportBoth == false) + printf("%d\t", bedItr->counts[i]); + else if (_reportCounts == false && _reportBoth == true) + printf("%d,%f\t", bedItr->counts[i], fractCovered); + } + // print newline for next feature. + printf("\n"); } } } } -void BedAnnotate::DetermineBedInput () { - if (bedA->bedFile != "stdin") { // process a file - ifstream beds(bedA->bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - GetCoverage(beds); - } - else { // process stdin - GetCoverage(cin); - } -} - - diff --git a/src/annotateBed/annotateBed.h b/src/annotateBed/annotateBed.h index 25d340a0..0d948905 100644 --- a/src/annotateBed/annotateBed.h +++ b/src/annotateBed/annotateBed.h @@ -1,5 +1,5 @@ /***************************************************************************** - coverageBed.h + annotateBed.h (c) 2009 - Aaron Quinlan Hall Laboratory @@ -9,10 +9,16 @@ Licenced under the GNU General Public License 2.0+ license. ******************************************************************************/ -#ifndef COVERAGEBED_H -#define COVERAGEBED_H +#ifndef ANNOTATEBED_H +#define ANNOTATEBED_H #include "bedFile.h" + +#include "BamReader.h" +#include "BamAux.h" +#include "BamAncillary.h" +using namespace BamTools; + #include <vector> #include <algorithm> #include <iostream> @@ -25,29 +31,45 @@ using namespace std; //************************************************ // Class methods and elements //************************************************ -class BedCoverage { +class BedAnnotate { public: // constructor - BedCoverage(string &, string &, bool &); + BedAnnotate(const string &mainFile, const vector<string> &annoFileNames, + const vector<string> &annoTitles, bool forceStrand, bool reportCounts, bool reportBoth); // destructor - ~BedCoverage(void); - - void GetCoverage(istream &bedInput); + ~BedAnnotate(void); - void DetermineBedInput(); + // annotate the master file with all of the annotation files. + void AnnotateBed(); private: - string bedAFile; - string bedBFile; - + // input files. + string _mainFile; + vector<string> _annoFileNames; + vector<string> _annoTitles; + // instance of a bed file class. - BedFile *bedA, *bedB; + BedFile *_bed; + vector<BedFile*> _annoFiles; - bool forceStrand; - + // do we care about strandedness when counting coverage? + bool _forceStrand; + bool _reportCounts; + bool _reportBoth; + + // private function for reporting coverage information + void ReportAnnotations(); + + void OpenAnnoFiles(); + + void CloseAnnoFiles(); + + void PrintHeader(); + + void InitializeMainFile(); }; -#endif /* COVERAGEBED_H */ +#endif /* ANNOTATEBED_H */ diff --git a/src/annotateBed/annotateMain.cpp b/src/annotateBed/annotateMain.cpp index db41fc28..3bdd1e4e 100644 --- a/src/annotateBed/annotateMain.cpp +++ b/src/annotateBed/annotateMain.cpp @@ -1,5 +1,5 @@ /***************************************************************************** - coverageMain.cpp + annotateMain.cpp (c) 2009 - Aaron Quinlan Hall Laboratory @@ -9,13 +9,13 @@ Licenced under the GNU General Public License 2.0+ license. ******************************************************************************/ -#include "coverageBed.h" +#include "annotateBed.h" #include "version.h" using namespace std; // define the version -#define PROGRAM_NAME "coverageBed" +#define PROGRAM_NAME "annotateBed" // define our parameter checking macro #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) @@ -28,15 +28,21 @@ int main(int argc, char* argv[]) { // our configuration variables bool showHelp = false; - // input files - string bedAFile; - string bedBFile; - bool forceStrand = false; - - bool haveBedA = false; - bool haveBedB = false; - - + // input file + string mainFile; + + // parm flags + bool forceStrand = false; + bool haveBed = false; + bool haveFiles = false; + bool haveTitles = false; + bool reportCounts = false; + bool reportBoth = false; + + // list of annotation files / names + vector<string> inputFiles; + vector<string> inputTitles; + // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -56,20 +62,47 @@ int main(int argc, char* argv[]) { int parameterLength = (int)strlen(argv[i]); - if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if(PARAMETER_CHECK("-i", 2, parameterLength)) { if ((i+1) < argc) { - haveBedA = true; - bedAFile = argv[i + 1]; + haveBed = true; + mainFile = argv[i + 1]; i++; } } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + else if(PARAMETER_CHECK("-files", 6, parameterLength)) { if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; + haveFiles = true; + i = i+1; + string file = argv[i]; + while (file[0] != '-' && i < argc) { + inputFiles.push_back(file); + i++; + if (i < argc) + file = argv[i]; + } + i--; + } + } + else if(PARAMETER_CHECK("-names", 6, parameterLength)) { + if ((i+1) < argc) { + haveTitles = true; + i = i+1; + string title = argv[i]; + while (title[0] != '-' && i < argc) { + inputTitles.push_back(title); + i++; + if (i < argc) + title = argv[i]; + } + i--; } } + else if(PARAMETER_CHECK("-counts", 7, parameterLength)) { + reportCounts = true; + } + else if(PARAMETER_CHECK("-both", 5, parameterLength)) { + reportBoth = true; + } else if (PARAMETER_CHECK("-s", 2, parameterLength)) { forceStrand = true; } @@ -80,15 +113,15 @@ int main(int argc, char* argv[]) { } // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + if (!haveBed || !haveFiles) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i and -files files. " << endl << "*****" << endl; showHelp = true; } if (!showHelp) { - - BedCoverage *bg = new BedCoverage(bedAFile, bedBFile, forceStrand); - bg->DetermineBedInput(); + BedAnnotate *ba = new BedAnnotate(mainFile, inputFiles, inputTitles, forceStrand, reportCounts, reportBoth); + ba->AnnotateBed(); + delete ba; return 0; } else { @@ -102,23 +135,25 @@ void ShowHelp(void) { cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "Summary: Returns the depth and breadth of coverage of features from A" << endl; - cerr << "\t on the intervals in B." << endl << endl; + cerr << "Summary: Annotates the depth & breadth of coverage of features from multiple files" << endl; + cerr << "\t on the intervals in -i." << endl << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -files FILE1 FILE2 .. FILEn" << endl << endl; cerr << "Options: " << endl; - cerr << "\t-s\t" << "Force strandedness. That is, only include hits in A that" << endl; + + cerr << "\t-names\t" << "A list of names (one / file) to describe each file in -i." << endl; + cerr << "\t\tThese names will be printed as a header line." << endl << endl; + + cerr << "\t-counts\t" << "Report the count of features in each file that overlap -i." << endl; + cerr << "\t\t- Default is to report the fraction of -i covered by each file." << endl << endl; + + cerr << "\t-both\t" << "Report the counts and % coverage." << endl; + cerr << "\t\t- Default is to report the fraction of -i covered by each file." << endl << endl; + + cerr << "\t-s\t" << "Force strandedness. That is, only include hits in A that" << endl; cerr << "\t\toverlap B on the same strand." << endl; cerr << "\t\t- By default, hits are included without respect to strand." << endl << endl; - - cerr << "Output: " << endl; - cerr << "\t" << " After each entry in B, reports: " << endl; - cerr << "\t 1) The number of features in A that overlapped the B interval." << endl; - cerr << "\t 2) The number of bases in B that had non-zero coverage." << endl; - cerr << "\t 3) The length of the entry in B." << endl; - cerr << "\t 4) The fraction of bases in B that had non-zero coverage." << endl; exit(1); - } diff --git a/src/bamToBed/bamToBed.cpp b/src/bamToBed/bamToBed.cpp index d5b29dfb..154cbfc3 100644 --- a/src/bamToBed/bamToBed.cpp +++ b/src/bamToBed/bamToBed.cpp @@ -322,9 +322,9 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); } - else if (useEditDistance == true && bamTag != "") { - uint8_t editDistance; - if (bam.GetEditDistance(editDistance)) { + else if (useEditDistance == true && bamTag == "") { + uint32_t editDistance; + if (bam.GetTag("NM", editDistance)) { printf("%s\t%d\t%d\t\%s\t%u\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, alignmentEnd, name.c_str(), editDistance, strand.c_str()); } @@ -390,8 +390,8 @@ void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDist alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); } else if (useEditDistance == true && bamTag != "") { - uint8_t editDistance; - if (bam.GetEditDistance(editDistance)) { + uint32_t editDistance; + if (bam.GetTag("NM", editDistance)) { printf("%s\t%d\t%d\t\%s\t%u\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, alignmentEnd, name.c_str(), editDistance, strand.c_str()); } @@ -435,7 +435,7 @@ void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVec // initialize BEDPE variables string chrom1, chrom2, strand1, strand2; int start1, start2, end1, end2; - uint8_t editDistance1, editDistance2; + uint32_t editDistance1, editDistance2; start1 = start2 = end1 = end2 = -1; chrom1 = chrom2 = strand1 = strand2 = "."; editDistance1 = editDistance2 = 0; @@ -451,7 +451,7 @@ void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVec // extract the edit distance from the NM tag // if possible. otherwise, complain. - if (useEditDistance == true && bam1.GetEditDistance(editDistance1) == false) { + if (useEditDistance == true && bam1.GetTag("NM", editDistance1) == false) { cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; exit(1); } @@ -467,7 +467,7 @@ void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVec // extract the edit distance from the NM tag // if possible. otherwise, complain. - if (useEditDistance == true && bam2.GetEditDistance(editDistance2) == false) { + if (useEditDistance == true && bam2.GetTag("NM", editDistance2) == false) { cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; exit(1); } diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index 9452bafd..1535703d 100644 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -66,7 +66,7 @@ bool BedIntersect::processHits(const BED &a, const vector<BED> &hits, bool print BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, float overlapFraction, bool noHit, bool writeCount, bool forceStrand, - bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) { + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam) { _bedAFile = bedAFile; _bedBFile = bedBFile; @@ -83,6 +83,7 @@ BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, _obeySplits = obeySplits; _bamInput = bamInput; _bamOutput = bamOutput; + _isUncompressedBam = isUncompressedBam; // create new BED file objects for A and B _bedA = new BedFile(bedAFile); @@ -254,10 +255,10 @@ void BedIntersect::IntersectBam(string bamFile) { // open a BAM output to stdout if we are writing BAM if (_bamOutput == true) { // open our BAM writer - writer.Open("stdout", header, refs); + writer.Open("stdout", header, refs, _isUncompressedBam); } - vector<BED> hits; // vector of potential hits + vector<BED> hits; // reserve some space hits.reserve(100); diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index f4d2556a..ecfb5c57 100644 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -36,7 +36,7 @@ public: BedIntersect(string bedAFile, string bedBFile, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, float overlapFraction, bool noHit, bool writeCount, bool forceStrand, - bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput); + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam); // destructor ~BedIntersect(void); @@ -64,6 +64,7 @@ private: bool _obeySplits; bool _bamInput; bool _bamOutput; + bool _isUncompressedBam; // instance of a bed file class. BedFile *_bedA, *_bedB; diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp index 84d4d3f5..1e7e2617 100644 --- a/src/intersectBed/intersectMain.cpp +++ b/src/intersectBed/intersectMain.cpp @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { bool obeySplits = false; bool inputIsBam = false; bool outputIsBam = true; - + bool uncompressedBam = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -134,6 +134,9 @@ int main(int argc, char* argv[]) { } else if (PARAMETER_CHECK("-split", 6, parameterLength)) { obeySplits = true; + } + else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { + uncompressedBam = true; } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; @@ -197,7 +200,7 @@ int main(int argc, char* argv[]) { BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, - reciprocalFraction, obeySplits, inputIsBam, outputIsBam); + reciprocalFraction, obeySplits, inputIsBam, outputIsBam, uncompressedBam); delete bi; return 0; } diff --git a/src/pairToBed/pairToBed.h b/src/pairToBed/pairToBed.h index 420426f8..68063c34 100644 --- a/src/pairToBed/pairToBed.h +++ b/src/pairToBed/pairToBed.h @@ -84,7 +84,7 @@ private: a.start1 = a.start2 = a.end1 = a.end2 = -1; a.chrom1 = a.chrom2 = "."; a.strand1 = a.strand2 = '.'; - uint8_t editDistance1, editDistance2; + uint32_t editDistance1, editDistance2; editDistance1 = editDistance2 = 0; // take the qname from end 1. @@ -101,7 +101,7 @@ private: // extract the edit distance from the NM tag // if possible. otherwise, complain. if (_useEditDistance == true) { - if (bam1.GetEditDistance(editDistance1) == false) { + if (bam1.GetTag("NM", editDistance1) == false) { cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; exit(1); } @@ -119,7 +119,7 @@ private: // extract the edit distance from the NM tag // if possible. otherwise, complain. if (_useEditDistance == true) { - if (bam2.GetEditDistance(editDistance2) == false) { + if (bam2.GetTag("NM", editDistance2) == false) { cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; exit(1); } diff --git a/src/utils/BamTools/BGZF.cpp b/src/utils/BamTools/BGZF.cpp index 9bc922a7..92afb96f 100644 --- a/src/utils/BamTools/BGZF.cpp +++ b/src/utils/BamTools/BGZF.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 July 2010 (DB) +// Last modified: 16 August 2010 (DB) // --------------------------------------------------------------------------- // BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -25,6 +25,7 @@ BgzfData::BgzfData(void) , BlockAddress(0) , IsOpen(false) , IsWriteOnly(false) + , IsWriteUncompressed(false) , Stream(NULL) , UncompressedBlock(NULL) , CompressedBlock(NULL) @@ -40,8 +41,8 @@ BgzfData::BgzfData(void) // destructor BgzfData::~BgzfData(void) { - if( CompressedBlock ) { delete[] CompressedBlock; } - if( UncompressedBlock ) { delete[] UncompressedBlock; } + if( CompressedBlock ) delete[] CompressedBlock; + if( UncompressedBlock ) delete[] UncompressedBlock; } // closes BGZF file @@ -61,6 +62,7 @@ void BgzfData::Close(void) { // flush and close fflush(Stream); fclose(Stream); + IsWriteUncompressed = false; IsOpen = false; } @@ -80,12 +82,15 @@ int BgzfData::DeflateBlock(void) { buffer[13] = BGZF_ID2; buffer[14] = BGZF_LEN; + // set compression level + const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION ); + // loop to retry for blocks that do not compress enough int inputLength = BlockOffset; int compressedLength = 0; unsigned int bufferSize = CompressedBlockSize; - while(true) { + while ( true ) { // initialize zstream values z_stream zs; @@ -97,21 +102,21 @@ int BgzfData::DeflateBlock(void) { zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; // initialize the zlib compression algorithm - if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) { + if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) { printf("BGZF ERROR: zlib deflate initialization failed.\n"); exit(1); } // compress the data int status = deflate(&zs, Z_FINISH); - if(status != Z_STREAM_END) { + if ( status != Z_STREAM_END ) { deflateEnd(&zs); // reduce the input length and try again - if(status == Z_OK) { + if ( status == Z_OK ) { inputLength -= 1024; - if(inputLength < 0) { + if( inputLength < 0 ) { printf("BGZF ERROR: input reduction failed.\n"); exit(1); } @@ -123,14 +128,14 @@ int BgzfData::DeflateBlock(void) { } // finalize the compression routine - if(deflateEnd(&zs) != Z_OK) { + if ( deflateEnd(&zs) != Z_OK ) { printf("BGZF ERROR: zlib::deflateEnd() failed.\n"); exit(1); } compressedLength = zs.total_out; compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - if(compressedLength > MAX_BLOCK_SIZE) { + if ( compressedLength > MAX_BLOCK_SIZE ) { printf("BGZF ERROR: deflate overflow.\n"); exit(1); } @@ -149,8 +154,8 @@ int BgzfData::DeflateBlock(void) { // ensure that we have less than a block of data left int remaining = BlockOffset - inputLength; - if(remaining > 0) { - if(remaining > inputLength) { + if ( remaining > 0 ) { + if ( remaining > inputLength ) { printf("BGZF ERROR: after deflate, remainder too large.\n"); exit(1); } @@ -165,7 +170,7 @@ int BgzfData::DeflateBlock(void) { void BgzfData::FlushBlock(void) { // flush all of the remaining blocks - while(BlockOffset > 0) { + while ( BlockOffset > 0 ) { // compress the data block int blockLength = DeflateBlock(); @@ -173,7 +178,7 @@ void BgzfData::FlushBlock(void) { // flush the data to our output stream int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); - if(numBytesWritten != blockLength) { + if ( numBytesWritten != blockLength ) { printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); exit(1); } @@ -195,20 +200,20 @@ int BgzfData::InflateBlock(const int& blockLength) { zs.avail_out = UncompressedBlockSize; int status = inflateInit2(&zs, GZIP_WINDOW_BITS); - if (status != Z_OK) { + if ( status != Z_OK ) { printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n"); return -1; } status = inflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { + if ( status != Z_STREAM_END ) { inflateEnd(&zs); printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n"); return -1; } status = inflateEnd(&zs); - if (status != Z_OK) { + if ( status != Z_OK ) { printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n"); return -1; } @@ -217,60 +222,59 @@ int BgzfData::InflateBlock(const int& blockLength) { } // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) -bool BgzfData::Open(const string& filename, const char* mode) { +bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) { - // determine open mode - if ( strcmp(mode, "rb") == 0 ) { + // determine open mode + if ( strcmp(mode, "rb") == 0 ) IsWriteOnly = false; - } else if ( strcmp(mode, "wb") == 0) { + else if ( strcmp(mode, "wb") == 0) IsWriteOnly = true; - } else { + else { printf("BGZF ERROR: unknown file mode: %s\n", mode); return false; } + // ---------------------------------------------------------------- // open Stream to read to/write from file, stdin, or stdout // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03) - if ( (filename != "stdin") && (filename != "stdout") ) { - // read/write BGZF data to/from a file -// Stream = fopen64(filename.c_str(), mode); + + // read/write BGZF data to/from a file + if ( (filename != "stdin") && (filename != "stdout") ) Stream = fopen(filename.c_str(), mode); - } - else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) { - // read BGZF data from stdin -// Stream = freopen64(NULL, mode, stdin); + + // read BGZF data from stdin + else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) Stream = freopen(NULL, mode, stdin); - } - else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) { - // write BGZF data to stdout -// Stream = freopen64(NULL, mode, stdout); + + // write BGZF data to stdout + else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) Stream = freopen(NULL, mode, stdout); - } - if(!Stream) { + if ( !Stream ) { printf("BGZF ERROR: unable to open file %s\n", filename.c_str() ); return false; } - // set flag, return success + // set flags, return success IsOpen = true; + IsWriteUncompressed = isWriteUncompressed; return true; } // reads BGZF data into a byte buffer int BgzfData::Read(char* data, const unsigned int dataLength) { - if (dataLength == 0) return 0; + if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0; char* output = data; unsigned int numBytesRead = 0; - while (numBytesRead < dataLength) { + while ( numBytesRead < dataLength ) { int bytesAvailable = BlockLength - BlockOffset; if ( bytesAvailable <= 0 ) { - if (!ReadBlock()) return -1; + if ( !ReadBlock() ) return -1; bytesAvailable = BlockLength - BlockOffset; - if (bytesAvailable <= 0) break; + if ( bytesAvailable <= 0 ) break; } char* buffer = UncompressedBlock; @@ -298,17 +302,17 @@ bool BgzfData::ReadBlock(void) { int64_t blockAddress = ftell64(Stream); int count = fread(header, 1, sizeof(header), Stream); - if (count == 0) { + if ( count == 0 ) { BlockLength = 0; return true; } - if (count != sizeof(header)) { + if ( count != sizeof(header) ) { printf("BGZF ERROR: read block failed - could not read block header\n"); return false; } - if (!BgzfData::CheckBlockHeader(header)) { + if ( !BgzfData::CheckBlockHeader(header) ) { printf("BGZF ERROR: read block failed - invalid block header\n"); return false; } @@ -319,13 +323,13 @@ bool BgzfData::ReadBlock(void) { int remaining = blockLength - BLOCK_HEADER_LENGTH; count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream); - if (count != remaining) { + if ( count != remaining ) { printf("BGZF ERROR: read block failed - could not read data from block\n"); return false; } count = InflateBlock(blockLength); - if (count < 0) { + if ( count < 0 ) { printf("BGZF ERROR: read block failed - could not decompress block data\n"); return false; } @@ -341,10 +345,12 @@ bool BgzfData::ReadBlock(void) { // seek to position in BGZF file bool BgzfData::Seek(int64_t position) { + if ( !IsOpen ) return false; + int blockOffset = (position & 0xFFFF); int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; - if (fseek64(Stream, blockAddress, SEEK_SET) != 0) { + if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { printf("BGZF ERROR: unable to seek in file\n"); return false; } @@ -357,19 +363,24 @@ bool BgzfData::Seek(int64_t position) { // get file position in BGZF file int64_t BgzfData::Tell(void) { - return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); + if ( !IsOpen ) + return false; + else + return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); } // writes the supplied data into the BGZF buffer unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { + if ( !IsOpen || !IsWriteOnly ) return false; + // initialize unsigned int numBytesWritten = 0; const char* input = data; unsigned int blockLength = UncompressedBlockSize; // copy the data to the buffer - while(numBytesWritten < dataLen) { + while ( numBytesWritten < dataLen ) { unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); char* buffer = UncompressedBlock; @@ -379,7 +390,7 @@ unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { input += copyLength; numBytesWritten += copyLength; - if(BlockOffset == blockLength) + if ( BlockOffset == blockLength ) FlushBlock(); } diff --git a/src/utils/BamTools/BGZF.h b/src/utils/BamTools/BGZF.h index 303684e1..37bcff75 100644 --- a/src/utils/BamTools/BGZF.h +++ b/src/utils/BamTools/BGZF.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 July 2010 (DB) +// Last modified: 16 August 2010 (DB) // --------------------------------------------------------------------------- // BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -77,81 +77,77 @@ const int DEFAULT_BLOCK_SIZE = 65536; struct BgzfData { - // --------------------------------- // data members - - unsigned int UncompressedBlockSize; - unsigned int CompressedBlockSize; - unsigned int BlockLength; - unsigned int BlockOffset; - uint64_t BlockAddress; - bool IsOpen; - bool IsWriteOnly; - FILE* Stream; - char* UncompressedBlock; - char* CompressedBlock; - - // --------------------------------- + public: + unsigned int UncompressedBlockSize; + unsigned int CompressedBlockSize; + unsigned int BlockLength; + unsigned int BlockOffset; + uint64_t BlockAddress; + bool IsOpen; + bool IsWriteOnly; + bool IsWriteUncompressed; + FILE* Stream; + char* UncompressedBlock; + char* CompressedBlock; + // constructor & destructor - - BgzfData(void); - ~BgzfData(void); + public: + BgzfData(void); + ~BgzfData(void); - // --------------------------------- // main interface methods - - // closes BGZF file - void Close(void); - // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) - bool Open(const std::string& filename, const char* mode); - // reads BGZF data into a byte buffer - int Read(char* data, const unsigned int dataLength); - // seek to position in BGZF file - bool Seek(int64_t position); - // get file position in BGZF file - int64_t Tell(void); - // writes the supplied data into the BGZF buffer - unsigned int Write(const char* data, const unsigned int dataLen); - - // --------------------------------- + public: + // closes BGZF file + void Close(void); + // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) + bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false); + // reads BGZF data into a byte buffer + int Read(char* data, const unsigned int dataLength); + // seek to position in BGZF file + bool Seek(int64_t position); + // get file position in BGZF file + int64_t Tell(void); + // writes the supplied data into the BGZF buffer + unsigned int Write(const char* data, const unsigned int dataLen); + // internal methods + private: + // compresses the current block + int DeflateBlock(void); + // flushes the data in the BGZF block + void FlushBlock(void); + // de-compresses the current block + int InflateBlock(const int& blockLength); + // reads a BGZF block + bool ReadBlock(void); - // compresses the current block - int DeflateBlock(void); - // flushes the data in the BGZF block - void FlushBlock(void); - // de-compresses the current block - int InflateBlock(const int& blockLength); - // reads a BGZF block - bool ReadBlock(void); - - // --------------------------------- // static 'utility' methods - - // checks BGZF block header - static inline bool CheckBlockHeader(char* header); - // packs an unsigned integer into the specified buffer - static inline void PackUnsignedInt(char* buffer, unsigned int value); - // packs an unsigned short into the specified buffer - static inline void PackUnsignedShort(char* buffer, unsigned short value); - // unpacks a buffer into a double - static inline double UnpackDouble(char* buffer); - static inline double UnpackDouble(const char* buffer); - // unpacks a buffer into a float - static inline float UnpackFloat(char* buffer); - static inline float UnpackFloat(const char* buffer); - // unpacks a buffer into a signed int - static inline signed int UnpackSignedInt(char* buffer); - static inline signed int UnpackSignedInt(const char* buffer); - // unpacks a buffer into a signed short - static inline signed short UnpackSignedShort(char* buffer); - static inline signed short UnpackSignedShort(const char* buffer); - // unpacks a buffer into an unsigned int - static inline unsigned int UnpackUnsignedInt(char* buffer); - static inline unsigned int UnpackUnsignedInt(const char* buffer); - // unpacks a buffer into an unsigned short - static inline unsigned short UnpackUnsignedShort(char* buffer); - static inline unsigned short UnpackUnsignedShort(const char* buffer); + public: + // checks BGZF block header + static inline bool CheckBlockHeader(char* header); + // packs an unsigned integer into the specified buffer + static inline void PackUnsignedInt(char* buffer, unsigned int value); + // packs an unsigned short into the specified buffer + static inline void PackUnsignedShort(char* buffer, unsigned short value); + // unpacks a buffer into a double + static inline double UnpackDouble(char* buffer); + static inline double UnpackDouble(const char* buffer); + // unpacks a buffer into a float + static inline float UnpackFloat(char* buffer); + static inline float UnpackFloat(const char* buffer); + // unpacks a buffer into a signed int + static inline signed int UnpackSignedInt(char* buffer); + static inline signed int UnpackSignedInt(const char* buffer); + // unpacks a buffer into a signed short + static inline signed short UnpackSignedShort(char* buffer); + static inline signed short UnpackSignedShort(const char* buffer); + // unpacks a buffer into an unsigned int + static inline unsigned int UnpackUnsignedInt(char* buffer); + static inline unsigned int UnpackUnsignedInt(const char* buffer); + // unpacks a buffer into an unsigned short + static inline unsigned short UnpackUnsignedShort(char* buffer); + static inline unsigned short UnpackUnsignedShort(const char* buffer); }; // ------------------------------------------------------------- diff --git a/src/utils/BamTools/BamAux.h b/src/utils/BamTools/BamAux.h index 6387dd33..73e88381 100644 --- a/src/utils/BamTools/BamAux.h +++ b/src/utils/BamTools/BamAux.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 July 2010 (DB) +// Last modified: 27 July 2010 (DB) // --------------------------------------------------------------------------- // Provides the basic constants, data structures, etc. for using BAM files // *************************************************************************** @@ -24,9 +24,6 @@ #include <utility> #include <vector> -#include <iostream> -#include <typeinfo> - // Platform-specific type definitions #ifndef BAMTOOLS_TYPES #define BAMTOOLS_TYPES @@ -106,44 +103,73 @@ struct BamAlignment { // Tag data access methods public: + // ------------------------------------------------------------------------------------- + // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched + // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in + // error message (to keep output clean) but will ALWAYS return false. Only user- + // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid. + + // add tag data (create new TAG entry with TYPE and VALUE) + // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details + // returns true if new data added, false if error or TAG already exists + // N.B. - will NOT modify existing tag. Use EditTag() instead + bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H + bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i + bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i + bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f + + // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present) + // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details + // returns true if edit was successfaul, false if error + bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H + bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i + bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i + bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f + + // specific tag data access methods - these only remain for legacy support + bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance)) + bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup)) + // generic tag data access methods bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data bool GetTag(const std::string& tag, float& destination) const; // access floating point data - // specific tag data access methods - only remain here for legacy support - bool GetEditDistance(uint8_t& editDistance) const; // get "NM" tag data - contributed by Aaron Quinlan - bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data - + // remove tag data + // returns true if removal was successful, false if error + // N.B. - returns false if TAG does not exist (no removal can occur) + bool RemoveTag(const std::string& tag); // Additional data access methods public: - int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations + int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations // 'internal' utility methods private: + static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed); static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed); // Data members public: - std::string Name; // Read name - int32_t Length; // Query length - std::string QueryBases; // 'Original' sequence (as reported from sequencing machine) - std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping) - std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) - std::string TagData; // Tag data (accessor methods will pull the requested information out) - int32_t RefID; // ID number for reference sequence - int32_t Position; // Position (0-based) where alignment starts - uint16_t Bin; // Bin in BAM file where this alignment resides - uint16_t MapQuality; // Mapping quality score - uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate + std::string Name; // Read name + int32_t Length; // Query length + std::string QueryBases; // 'Original' sequence (as reported from sequencing machine) + std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping) + std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) + std::string TagData; // Tag data (accessor methods will pull the requested information out) + int32_t RefID; // ID number for reference sequence + int32_t Position; // Position (0-based) where alignment starts + uint16_t Bin; // Bin in BAM file where this alignment resides + uint16_t MapQuality; // Mapping quality score + uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate std::vector<CigarOp> CigarData; // CIGAR operations for this alignment - int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned - int32_t MatePosition; // Position (0-based) where alignment's mate starts - int32_t InsertSize; // Mate-pair insert size - + int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned + int32_t MatePosition; // Position (0-based) where alignment's mate starts + int32_t InsertSize; // Mate-pair insert size + // internal data + private: struct BamAlignmentSupportData { // data members @@ -164,7 +190,13 @@ struct BamAlignment { { } }; - BamAlignmentSupportData SupportData; // Contains raw character data & lengths + // contains raw character data & lengths + BamAlignmentSupportData SupportData; + + // allow these classes access to BamAlignment private members (SupportData) + // but client code should not need to touch this data + friend class BamReader; + friend class BamWriter; // Alignment flag query constants // Use the get/set methods above instead @@ -238,15 +270,93 @@ struct BamRegion { { } }; +// ---------------------------------------------------------------- +// Added: 3-35-2010 DWB +// Fixed: Routines to provide endian-correctness +// ---------------------------------------------------------------- + +// returns true if system is big endian +inline bool SystemIsBigEndian(void) { + const uint16_t one = 0x0001; + return ((*(char*) &one) == 0 ); +} + +// swaps endianness of 16-bit value 'in place' +inline void SwapEndian_16(int16_t& x) { + x = ((x >> 8) | (x << 8)); +} + +inline void SwapEndian_16(uint16_t& x) { + x = ((x >> 8) | (x << 8)); +} + +// swaps endianness of 32-bit value 'in-place' +inline void SwapEndian_32(int32_t& x) { + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | + (x << 24) + ); +} + +inline void SwapEndian_32(uint32_t& x) { + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | + (x << 24) + ); +} + +// swaps endianness of 64-bit value 'in-place' +inline void SwapEndian_64(int64_t& x) { + x = ( (x >> 56) | + ((x << 40) & 0x00FF000000000000ll) | + ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | + ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | + ((x >> 40) & 0x000000000000FF00ll) | + (x << 56) + ); +} + +inline void SwapEndian_64(uint64_t& x) { + x = ( (x >> 56) | + ((x << 40) & 0x00FF000000000000ll) | + ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | + ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | + ((x >> 40) & 0x000000000000FF00ll) | + (x << 56) + ); +} + +// swaps endianness of 'next 2 bytes' in a char buffer (in-place) +inline void SwapEndian_16p(char* data) { + uint16_t& value = (uint16_t&)*data; + SwapEndian_16(value); +} + +// swaps endianness of 'next 4 bytes' in a char buffer (in-place) +inline void SwapEndian_32p(char* data) { + uint32_t& value = (uint32_t&)*data; + SwapEndian_32(value); +} + +// swaps endianness of 'next 8 bytes' in a char buffer (in-place) +inline void SwapEndian_64p(char* data) { + uint64_t& value = (uint64_t&)*data; + SwapEndian_64(value); +} + // ---------------------------------------------------------------- // BamAlignment member methods // constructors & destructor -inline -BamAlignment::BamAlignment(void) { } +inline BamAlignment::BamAlignment(void) { } -inline -BamAlignment::BamAlignment(const BamAlignment& other) +inline BamAlignment::BamAlignment(const BamAlignment& other) : Name(other.Name) , Length(other.Length) , QueryBases(other.QueryBases) @@ -265,8 +375,7 @@ BamAlignment::BamAlignment(const BamAlignment& other) , SupportData(other.SupportData) { } -inline -BamAlignment::~BamAlignment(void) { } +inline BamAlignment::~BamAlignment(void) { } // Queries against alignment flags inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } @@ -316,208 +425,381 @@ int BamAlignment::GetEndPosition(bool usePadded) const { return alignEnd; } -// get "NM" tag data - contributed by Aaron Quinlan -// stores data in 'editDistance', returns success/fail -inline -bool BamAlignment::GetEditDistance(uint8_t& editDistance) const { - - // make sure tag data exists - if ( TagData.empty() ) return false; - +inline +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type != "Z" && type != "H" ) return false; + // localize the tag data char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLen = TagData.size(); + const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, copy tag data to temp buffer + std::string newTag = tag + type + value; + const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} - bool foundEditDistanceTag = false; - while ( numBytesParsed < tagDataLen ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag - if ( strncmp(pTagType, "NM", 2) == 0 ) { - foundEditDistanceTag = true; - break; - } +inline +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "f" || type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, convert value to string + union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; + un.value = value; + + // copy original tag data to temp buffer + std::string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; - } - // return if the edit distance tag was not present - if ( !foundEditDistanceTag ) return false; +inline +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) { + return AddTag(tag, type, (const uint32_t&)value); +} - // assign the editDistance value - std::memcpy(&editDistance, pTagData, 1); +inline +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, convert value to string + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + + // copy original tag data to temp buffer + std::string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success return true; } -// get "RG" tag data -// stores data in 'readGroup', returns success/fail -inline -bool BamAlignment::GetReadGroup(std::string& readGroup) const { +inline +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type != "Z" && type != "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + value.size()]; - // make sure tag data exists - if ( TagData.empty() ) return false; + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + const unsigned int dataLength = strlen(value.c_str()); + memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLen = TagData.size(); +inline +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "f" || type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + sizeof(value)]; - bool foundReadGroupTag = false; - while( numBytesParsed < tagDataLen ) { + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; + un.value = value; + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; +inline +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) { + return EditTag(tag, type, (const uint32_t&)value); +} - // check the current tag - if ( std::strncmp(pTagType, "RG", 2) == 0 ) { - foundReadGroupTag = true; - break; - } +inline +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + sizeof(value)]; - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} - // return if the read group tag was not present - if ( !foundReadGroupTag ) return false; +// get "NM" tag data - originally contributed by Aaron Quinlan +// stores data in 'editDistance', returns success/fail +inline +bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { + return GetTag("NM", (uint32_t&)editDistance); +} - // assign the read group - const unsigned int readGroupLen = std::strlen(pTagData); - readGroup.resize(readGroupLen); - std::memcpy( (char*)readGroup.data(), pTagData, readGroupLen ); - return true; +// get "RG" tag data +// stores data in 'readGroup', returns success/fail +inline +bool BamAlignment::GetReadGroup(std::string& readGroup) const { + return GetTag("RG", readGroup); } inline bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { - + // make sure tag data exists - if ( TagData.empty() ) return false; + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; // localize the tag data char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLen = TagData.size(); + const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - - bool foundReadGroupTag = false; - while ( numBytesParsed < tagDataLen ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag - if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) { - foundReadGroupTag = true; - break; - } - - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + const unsigned int dataLength = strlen(pTagData); + destination.clear(); + destination.resize(dataLength); + memcpy( (char*)destination.data(), pTagData, dataLength ); + return true; } - - // return if the read group tag was not present - if ( !foundReadGroupTag ) return false; - - // assign the read group - const unsigned int dataLen = std::strlen(pTagData); - destination.resize(dataLen); - std::memcpy( (char*)destination.data(), pTagData, dataLen ); - return true; + + // tag not found, return failure + return false; } inline bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { - // make sure data exists - if ( TagData.empty() ) return false; - - // clear out destination - destination = 0; + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; // localize the tag data char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLen = TagData.size(); + const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - - int destinationLength = 0; - bool foundDesiredTag = false; - while ( numBytesParsed < tagDataLen ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag - if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) { - - // determine actual length of data depending on tag type - // this is necessary because some tags may be of variable byte-lengths (i.e. char or short) - const char type = *pTagStorageType; - switch(type) { - - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type for integer destination (float & var-length strings) - case 'f': - case 'Z': - case 'H': - printf("ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData); - return false; - } - - foundDesiredTag = true; - break; + + // if tag found, determine data byte-length, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch (type) { + // 1 byte data + case 'A': + case 'c': + case 'C': + destinationLength = 1; + break; + + // 2 byte data + case 's': + case 'S': + destinationLength = 2; + break; + + // 4 byte data + case 'i': + case 'I': + destinationLength = 4; + break; + + // unsupported type for integer destination (float or var-length strings) + case 'f': + case 'Z': + case 'H': + printf("ERROR: Cannot store tag of type %c in integer destination\n", type); + return false; + + // unknown tag type + default: + printf("ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; } - - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; + + // store in destination + destination = 0; + memcpy(&destination, pTagData, destinationLength); + return true; } - // return if the edit distance tag was not present - if ( !foundDesiredTag ) return false; - - // assign the editDistance value - std::memcpy(&destination, pTagData, destinationLength); - return true; + + // tag not found, return failure + return false; } inline @@ -528,81 +810,134 @@ bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { inline bool BamAlignment::GetTag(const std::string& tag, float& destination) const { - // make sure data exists - if ( TagData.empty() ) return false; - - // clear out destination - destination = 0.0; + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; // localize the tag data char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLen = TagData.size(); + const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; + + // if tag found, determine data byte-length, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + //pTagData += numBytesParsed; + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch(type) { + + // 1 byte data + case 'A': + case 'c': + case 'C': + destinationLength = 1; + break; + + // 2 byte data + case 's': + case 'S': + destinationLength = 2; + break; + + // 4 byte data + case 'f': + case 'i': + case 'I': + destinationLength = 4; + break; + + // unsupported type (var-length strings) + case 'Z': + case 'H': + printf("ERROR: Cannot store tag of type %c in integer destination\n", type); + return false; + + // unknown tag type + default: + printf("ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; + } + + // store in destination + destination = 0.0; + memcpy(&destination, pTagData, destinationLength); + return true; + } + + // tag not found, return failure + return false; +} - int destinationLength = 0; - bool foundDesiredTag = false; - while( numBytesParsed < tagDataLen ) { +inline +bool BamAlignment::RemoveTag(const std::string& tag) { + + // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed + // also, return false if no data present to remove + if ( SupportData.HasCoreOnly || TagData.empty() ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + char newTagData[originalTagDataLength]; + + // copy original tag data up til desired tag + pTagData -= 3; + numBytesParsed -= 3; + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); + + // save new tag data + TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); + return true; + } + + // tag not found, no removal - return failure + return false; +} - const char* pTagType = pTagData; +inline +bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) { + + while ( numBytesParsed < tagDataLength ) { + + const char* pTagType = pTagData; const char* pTagStorageType = pTagData + 2; pTagData += 3; numBytesParsed += 3; - // check the current tag - if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) { - - // determine actual length of data depending on tag type - // this is necessary because some tags may be of variable byte-lengths (i.e. char or short) - const char type = *pTagStorageType; - switch(type) { - - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'f': - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type (var-length strings) - case 'Z': - case 'H': - printf("ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData); - return false; - } - - foundDesiredTag = true; - break; - } + // check the current tag, return true on match + if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) + return true; // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; + if ( *pTagStorageType == '\0' ) return false; if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; if ( *pTagData == '\0' ) return false; } - // return if the edit distance tag was not present - if ( !foundDesiredTag ) return false; - - // assign the editDistance value - std::memcpy(&destination, pTagData, destinationLength); - return true; + + // checked all tags, none match + return false; } inline @@ -636,17 +971,14 @@ bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsign ++numBytesParsed; ++pTagData; } - // --------------------------- - // Added: 3-25-2010 DWB - // Contributed: ARQ - // Fixed: error parsing variable length tag data + // increment for null-terminator + ++numBytesParsed; ++pTagData; - // --------------------------- break; default: // error case - printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData); + printf("ERROR: Unknown tag storage class encountered: [%c]\n", storageType); return false; } @@ -654,86 +986,6 @@ bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsign return true; } -// ---------------------------------------------------------------- -// Added: 3-35-2010 DWB -// Fixed: Routines to provide endian-correctness -// ---------------------------------------------------------------- - -// returns true if system is big endian -inline bool SystemIsBigEndian(void) { - const uint16_t one = 0x0001; - return ((*(char*) &one) == 0 ); -} - -// swaps endianness of 16-bit value 'in place' -inline void SwapEndian_16(int16_t& x) { - x = ((x >> 8) | (x << 8)); -} - -inline void SwapEndian_16(uint16_t& x) { - x = ((x >> 8) | (x << 8)); -} - -// swaps endianness of 32-bit value 'in-place' -inline void SwapEndian_32(int32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | - (x << 24) - ); -} - -inline void SwapEndian_32(uint32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | - (x << 24) - ); -} - -// swaps endianness of 64-bit value 'in-place' -inline void SwapEndian_64(int64_t& x) { - x = ( (x >> 56) | - ((x << 40) & 0x00FF000000000000ll) | - ((x << 24) & 0x0000FF0000000000ll) | - ((x << 8) & 0x000000FF00000000ll) | - ((x >> 8) & 0x00000000FF000000ll) | - ((x >> 24) & 0x0000000000FF0000ll) | - ((x >> 40) & 0x000000000000FF00ll) | - (x << 56) - ); -} - -inline void SwapEndian_64(uint64_t& x) { - x = ( (x >> 56) | - ((x << 40) & 0x00FF000000000000ll) | - ((x << 24) & 0x0000FF0000000000ll) | - ((x << 8) & 0x000000FF00000000ll) | - ((x >> 8) & 0x00000000FF000000ll) | - ((x >> 24) & 0x0000000000FF0000ll) | - ((x >> 40) & 0x000000000000FF00ll) | - (x << 56) - ); -} - -// swaps endianness of 'next 2 bytes' in a char buffer (in-place) -inline void SwapEndian_16p(char* data) { - uint16_t& value = (uint16_t&)*data; - SwapEndian_16(value); -} - -// swaps endianness of 'next 4 bytes' in a char buffer (in-place) -inline void SwapEndian_32p(char* data) { - uint32_t& value = (uint32_t&)*data; - SwapEndian_32(value); -} - -// swaps endianness of 'next 8 bytes' in a char buffer (in-place) -inline void SwapEndian_64p(char* data) { - uint64_t& value = (uint64_t&)*data; - SwapEndian_64(value); -} - } // namespace BamTools #endif // BAMAUX_H diff --git a/src/utils/BamTools/BamIndex.cpp b/src/utils/BamTools/BamIndex.cpp index 787995b7..d74e751c 100644 --- a/src/utils/BamTools/BamIndex.cpp +++ b/src/utils/BamTools/BamIndex.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 13 July 2010 (DB) +// Last modified: 17 August 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index @@ -13,7 +13,7 @@ #include <cstdio> #include <cstdlib> #include <algorithm> -#include <iostream> +// #include <iostream> #include <map> #include "BamIndex.h" #include "BamReader.h" @@ -52,6 +52,8 @@ bool BamIndex::HasAlignments(const int& referenceID) { namespace BamTools { +// -------------------------------------------------- +// BamDefaultIndex data structures & typedefs struct Chunk { // data members @@ -88,7 +90,7 @@ struct ReferenceIndex { { } }; -typedef vector<ReferenceIndex> BamDefaultIndexData; +typedef vector<ReferenceIndex> BamDefaultIndexData; } // namespace BamTools diff --git a/src/utils/BamTools/BamIndex.h b/src/utils/BamTools/BamIndex.h index aeecefb7..b9ce7d03 100644 --- a/src/utils/BamTools/BamIndex.h +++ b/src/utils/BamTools/BamIndex.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 13 July 2010 (DB) +// Last modified: 17 August 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index @@ -22,6 +22,7 @@ namespace BamTools { class BamReader; class BgzfData; +// -------------------------------------------------- // BamIndex base class class BamIndex { @@ -51,6 +52,7 @@ class BamIndex { bool m_isBigEndian; }; +// -------------------------------------------------- // BamDefaultIndex class // // implements default (per SAM/BAM spec) index file ops @@ -82,6 +84,7 @@ class BamDefaultIndex : public BamIndex { BamDefaultIndexPrivate* d; }; +// -------------------------------------------------- // BamToolsIndex class // // implements BamTools-specific index file ops diff --git a/src/utils/BamTools/BamMultiReader.cpp b/src/utils/BamTools/BamMultiReader.cpp index 001df075..005b0b0a 100644 --- a/src/utils/BamTools/BamMultiReader.cpp +++ b/src/utils/BamTools/BamMultiReader.cpp @@ -286,7 +286,7 @@ bool BamMultiReader::Rewind(void) { return result; } -// saves index data to BAM index files (".bai") where necessary, returns success/fail +// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail bool BamMultiReader::CreateIndexes(bool useDefaultIndex) { bool result = true; for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { diff --git a/src/utils/BamTools/BamReader.cpp b/src/utils/BamTools/BamReader.cpp index be6b72b5..b1b1b5de 100644 --- a/src/utils/BamTools/BamReader.cpp +++ b/src/utils/BamTools/BamReader.cpp @@ -540,8 +540,10 @@ bool BamReader::BamReaderPrivate::Jump(int refID, int position) { // if this alignment corresponds to desired position // return success of seeking back to 'current offset' - if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) + if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) { + if ( o != offsets.begin() ) --o; return mBGZF.Seek(*o); + } } return result; diff --git a/src/utils/BamTools/BamWriter.cpp b/src/utils/BamTools/BamWriter.cpp index a794dff5..f83ff1c3 100644 --- a/src/utils/BamTools/BamWriter.cpp +++ b/src/utils/BamTools/BamWriter.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 15 July 2010 (DB) +// Last modified: 17 August 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -35,7 +35,7 @@ struct BamWriter::BamWriterPrivate { // "public" interface void Close(void); - void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences); + bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed); void SaveAlignment(const BamAlignment& al); // internal methods @@ -65,8 +65,8 @@ void BamWriter::Close(void) { } // opens the alignment archive -void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) { - d->Open(filename, samHeader, referenceSequences); +bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) { + return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed); } // saves the alignment to the alignment archive @@ -202,10 +202,11 @@ void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, strin } // opens the alignment archive -void BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) { +bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) { - // open the BGZF file for writing - mBGZF.Open(filename, "wb"); + // open the BGZF file for writing, return failure if error + if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) + return false; // ================ // write the header @@ -250,6 +251,9 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam if (IsBigEndian) SwapEndian_32(referenceLength); mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); } + + // return success + return true; } // saves the alignment to the alignment archive diff --git a/src/utils/BamTools/BamWriter.h b/src/utils/BamTools/BamWriter.h index 2608ddbe..b0cb6ceb 100644 --- a/src/utils/BamTools/BamWriter.h +++ b/src/utils/BamTools/BamWriter.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 8 December 2009 (DB) +// Last modified: 17 August 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -34,7 +34,10 @@ class BamWriter { // closes the alignment archive void Close(void); // opens the alignment archive - void Open(const std::string& filename, const std::string& samHeader, const BamTools::RefVector& referenceSequences); + bool Open(const std::string& filename, + const std::string& samHeader, + const BamTools::RefVector& referenceSequences, + bool writeUncompressed = false); // saves the alignment to the alignment archive void SaveAlignment(const BamTools::BamAlignment& al); diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 7610a15a..3560628b 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -426,6 +426,47 @@ void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool forceStrand) { } +void BedFile::countListHits(const BED &a, int index, bool forceStrand) { + + BIN startBin, endBin; + startBin = (a.start >> _binFirstShift); + endBin = ((a.end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BEDCOVLIST>::iterator bedItr = bedCovListMap[a.chrom][j].begin(); + vector<BEDCOVLIST>::iterator bedEnd = bedCovListMap[a.chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + + // skip the hit if not on the same strand (and we care) + if (forceStrand && (a.strand != bedItr->strand)) { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, a.start, a.end) > 0) { + bedItr->counts[index]++; + bedItr->depthMapList[index][a.start+1].starts++; + bedItr->depthMapList[index][a.end].ends++; + + if (a.start < bedItr->minOverlapStarts[index]) { + bedItr->minOverlapStarts[index] = a.start; + } + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } +} + + void BedFile::setGff (bool gff) { if (gff == true) this->_isGff = true; else this->_isGff = false; @@ -496,6 +537,33 @@ void BedFile::loadBedCovFileIntoMap() { Close(); } +void BedFile::loadBedCovListFileIntoMap() { + + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + Open(); + while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + BIN bin = getBin(bedEntry.start, bedEntry.end); + + BEDCOVLIST bedCovList; + bedCovList.chrom = bedEntry.chrom; + bedCovList.start = bedEntry.start; + bedCovList.end = bedEntry.end; + bedCovList.name = bedEntry.name; + bedCovList.score = bedEntry.score; + bedCovList.strand = bedEntry.strand; + bedCovList.otherFields = bedEntry.otherFields; + + bedCovListMap[bedEntry.chrom][bin].push_back(bedCovList); + bedEntry = nullBed; + } + } + Close(); +} + void BedFile::loadBedFileIntoMapNoBin() { diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 4d2db0fb..d9cc9c15 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -169,6 +169,31 @@ struct BEDCOV { }; +/* + Structure for BED COVERAGE records having lists of + multiple coverages +*/ +struct BEDCOVLIST { + + // Regular BED fields + CHRPOS start; + CHRPOS end; + + string chrom; + string name; + string score; + string strand; + + // Add'l fields for BED12 and/or custom BED annotations + vector<string> otherFields; + + // Additional fields specific to computing coverage + vector< map<unsigned int, DEPTH> > depthMapList; + vector<unsigned int> counts; + vector<CHRPOS> minOverlapStarts; +}; + + // enum to flag the state of a given line in a BED file. enum BedLineStatus { @@ -191,12 +216,15 @@ enum FileType //************************************************* typedef vector<BED> bedVector; typedef vector<BEDCOV> bedCovVector; +typedef vector<BEDCOVLIST> bedCovListVector; typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; +typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists; typedef map<string, binsToBeds, std::less<string> > masterBedMap; typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; +typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap; typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; @@ -291,6 +319,9 @@ public: // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs void loadBedCovFileIntoMap(); + + // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs + void loadBedCovListFileIntoMap(); // load a BED file into a map keyed by chrom. value is vector of BEDs void loadBedFileIntoMapNoBin(); @@ -320,6 +351,11 @@ public: // if one read has four block, we only want to count the coverage as // coming from one read, not four. void countSplitHits(const vector<BED> &bedBlock, bool forceStrand); + + // Given a chrom, start, end and strand for a single feature, + // increment a the number of hits for each feature in B file + // that the feature overlaps + void countListHits(const BED &a, int index, bool forceStrand); // the bedfile with which this instance is associated string bedFile; @@ -327,9 +363,10 @@ public: // 9 for GFF // Main data structires used by BEDTools - masterBedCovMap bedCovMap; - masterBedMap bedMap; - masterBedMapNoBin bedMapNoBin; + masterBedCovMap bedCovMap; + masterBedCovListMap bedCovListMap; + masterBedMap bedMap; + masterBedMapNoBin bedMapNoBin; private: -- GitLab