From ff07e83c7a0b5f25fa9dd7d846f4b92f718214f9 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Tue, 29 Jun 2010 10:02:39 -0400 Subject: [PATCH] Large commit 1. Reset BED.strand to be string instead of char in the interest of generality. 2. Fixed bug in genomeCoverageBed where NULL entries where genome summaries were err. reported with -bga. 3. Fixed bug in closestBed where NULL entries were err. reported when BED > 6. 4. Standardized the Open, Get, Close idiom per improvements from Gordon Assaf. 5. Add "split" logic to bamToBed. --- .cproject | 0 .project | 0 LICENSE | 0 RELEASE_HISTORY | 0 data/knownGene.hg18.chr21.bed | 0 genomes/human.hg18.genome | 0 genomes/human.hg19.genome | 0 genomes/mouse.mm8.genome | 0 genomes/mouse.mm9.genome | 0 src/bamFillMateSeq/Makefile | 45 --- src/bamFillMateSeq/bamFillMateSeq.cpp | 294 -------------------- src/bamToBed/Makefile | 4 +- src/bamToBed/bamToBed.cpp | 76 +++-- src/bedToBam/Makefile | 4 +- src/bedToBam/bedToBam.cpp | 11 +- src/closestBed/closestBed.cpp | 36 +-- src/complementBed/complementBed.cpp | 27 +- src/fastaFromBed/fastaFromBed.cpp | 2 +- src/genomeCoverageBed/genomeCoverageBed.cpp | 2 +- src/linksBed/linksBed.cpp | 5 +- src/mergeBed/mergeBed.cpp | 13 +- src/pairToBed/pairToBed.cpp | 10 +- src/pairToBed/pairToBed.h | 12 +- src/pairToPair/pairToPair.cpp | 47 ++-- src/pairToPair/pairToPair.h | 12 +- src/shuffleBed/shuffleBed.cpp | 132 ++++----- src/slopBed/slopBed.cpp | 2 +- src/subtractBed/subtractBed.cpp | 10 +- src/utils/BamTools/BGZF.cpp | 0 src/utils/BamTools/BGZF.h | 0 src/utils/BamTools/BamAncillary.cpp | 65 +++++ src/utils/BamTools/BamAncillary.h | 18 ++ src/utils/BamTools/BamAux.h | 0 src/utils/BamTools/BamReader.cpp | 6 +- src/utils/BamTools/BamReader.h | 0 src/utils/BamTools/BamWriter.cpp | 0 src/utils/BamTools/BamWriter.h | 0 src/utils/BamTools/Makefile | 5 +- src/utils/bedFile/bedFile.cpp | 96 +++---- src/utils/bedFile/bedFile.h | 222 +++++++++++---- src/utils/bedFilePE/bedFilePE.cpp | 16 +- src/utils/bedFilePE/bedFilePE.h | 18 +- src/utils/gzstream/COPYING.LIB | 0 src/utils/gzstream/Makefile | 0 src/utils/gzstream/README | 0 src/utils/gzstream/gzstream.C | 0 src/utils/gzstream/gzstream.h | 0 src/utils/gzstream/gzstream.o | Bin src/utils/gzstream/test_gunzip.o | Bin src/utils/gzstream/test_gzip.o | Bin src/utils/gzstream/version | 0 src/utils/version/version.h | 0 src/windowBed/windowBed.cpp | 20 +- src/windowBed/windowBed.h | 2 +- 54 files changed, 532 insertions(+), 680 deletions(-) mode change 100644 => 100755 .cproject mode change 100644 => 100755 .project mode change 100644 => 100755 LICENSE mode change 100644 => 100755 RELEASE_HISTORY mode change 100644 => 100755 data/knownGene.hg18.chr21.bed mode change 100644 => 100755 genomes/human.hg18.genome mode change 100644 => 100755 genomes/human.hg19.genome mode change 100644 => 100755 genomes/mouse.mm8.genome mode change 100644 => 100755 genomes/mouse.mm9.genome delete mode 100755 src/bamFillMateSeq/Makefile delete mode 100755 src/bamFillMateSeq/bamFillMateSeq.cpp mode change 100644 => 100755 src/utils/BamTools/BGZF.cpp mode change 100644 => 100755 src/utils/BamTools/BGZF.h create mode 100755 src/utils/BamTools/BamAncillary.cpp create mode 100755 src/utils/BamTools/BamAncillary.h mode change 100644 => 100755 src/utils/BamTools/BamAux.h mode change 100644 => 100755 src/utils/BamTools/BamReader.cpp mode change 100644 => 100755 src/utils/BamTools/BamReader.h mode change 100644 => 100755 src/utils/BamTools/BamWriter.cpp mode change 100644 => 100755 src/utils/BamTools/BamWriter.h mode change 100644 => 100755 src/utils/BamTools/Makefile mode change 100644 => 100755 src/utils/gzstream/COPYING.LIB mode change 100644 => 100755 src/utils/gzstream/Makefile mode change 100644 => 100755 src/utils/gzstream/README mode change 100644 => 100755 src/utils/gzstream/gzstream.C mode change 100644 => 100755 src/utils/gzstream/gzstream.h mode change 100644 => 100755 src/utils/gzstream/gzstream.o mode change 100644 => 100755 src/utils/gzstream/test_gunzip.o mode change 100644 => 100755 src/utils/gzstream/test_gzip.o mode change 100644 => 100755 src/utils/gzstream/version mode change 100644 => 100755 src/utils/version/version.h diff --git a/.cproject b/.cproject old mode 100644 new mode 100755 diff --git a/.project b/.project old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/RELEASE_HISTORY b/RELEASE_HISTORY old mode 100644 new mode 100755 diff --git a/data/knownGene.hg18.chr21.bed b/data/knownGene.hg18.chr21.bed old mode 100644 new mode 100755 diff --git a/genomes/human.hg18.genome b/genomes/human.hg18.genome old mode 100644 new mode 100755 diff --git a/genomes/human.hg19.genome b/genomes/human.hg19.genome old mode 100644 new mode 100755 diff --git a/genomes/mouse.mm8.genome b/genomes/mouse.mm8.genome old mode 100644 new mode 100755 diff --git a/genomes/mouse.mm9.genome b/genomes/mouse.mm9.genome old mode 100644 new mode 100755 diff --git a/src/bamFillMateSeq/Makefile b/src/bamFillMateSeq/Makefile deleted file mode 100755 index da902162..00000000 --- a/src/bamFillMateSeq/Makefile +++ /dev/null @@ -1,45 +0,0 @@ -CXX= g++ -CXXFLAGS= -Wall -g -LIBS= -lz -UTILITIES_DIR = ../utils/ -OBJ_DIR = ../../obj/ -BIN_DIR = ../../bin/ - -# ------------------- -# define our includes -# ------------------- -INCLUDES = -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/sequenceUtilities/ -I$(UTILITIES_DIR)/version/ - -# ---------------------------------- -# define our source and object files -# ---------------------------------- -SOURCES= bamFillMateSeq.cpp -OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=BamReader.o BamWriter.o sequenceUtils.o BGZF.o -EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) -BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) -PROGRAM= bamFillMateSeq - - -all: $(PROGRAM) - -.PHONY: all - - -$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) - @echo " * linking $(PROGRAM)" - @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) - -$(BUILT_OBJECTS): $(SOURCES) - @echo " * compiling" $(*F).cpp - @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) - -$(EXT_OBJECTS): - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/sequenceUtilities/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ - -clean: - @echo "Cleaning up." - @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* - -.PHONY: clean diff --git a/src/bamFillMateSeq/bamFillMateSeq.cpp b/src/bamFillMateSeq/bamFillMateSeq.cpp deleted file mode 100755 index f691ecf2..00000000 --- a/src/bamFillMateSeq/bamFillMateSeq.cpp +++ /dev/null @@ -1,294 +0,0 @@ -/***************************************************************************** - bamToBed.cpp - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0+ license. -******************************************************************************/ -#include "version.h" -#include "sequenceUtils.h" -#include "BamReader.h" -#include "BamWriter.h" -#include "BamAux.h" -using namespace BamTools; - -#include <vector> -#include <iostream> -#include <fstream> -#include <stdlib.h> - -using namespace std; - - -// define our program name -#define PROGRAM_NAME "bamFillMateSeq" - -// define our parameter checking macro -#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) - -// function declarations -void ShowHelp(void); -void GetSeqsAndQuals(const vector<BamAlignment> &alignments, - string &seq1, - string &qual1, - string &seq, - string &qual2); -void FillMateSeqForReadBlock (const vector<BamAlignment> &alignments, - const RefVector &refs, - BamWriter &writer, - const string &seq1, - const string &qual1, - const string &seq, - const string &qual2); -int strnum_cmp(const char *a, const char *b); - - -int main(int argc, char* argv[]) { - - // our configuration variables - bool showHelp = false; - bool haveBam = false; - - // input files - string bamFile; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - haveBam = true; - bamFile = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have an input files - if (!haveBam ) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - - // open the BAM file - BamReader reader; - reader.Open(bamFile); - BamWriter writer; - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // open our BAM writer - writer.Open("stdout", header, refs); - - // track the previous and current sequence - // names so that we can identify blocks of - // alignments for a given read ID. - string prevName = ""; - string currName = ""; - - vector<BamAlignment> alignments; // vector of BAM alignments for a given ID in a BAM file. - alignments.reserve(100); - - BamAlignment bam; - string seq1, qual1, seq2, qual2; - // get each set of alignments for each pair. - while (reader.GetNextAlignment(bam)) { - - currName = bam.Name; - //printf("%s\t%s\n", currName.c_str(), prevName.c_str()); - if ( (strcmp(currName.c_str(), prevName.c_str()) != 0) && (prevName != "") ) { - //printf("yep\n"); - // block change. enforce expected sort order. - if (strnum_cmp(currName.c_str(), prevName.c_str()) == -1) { - cerr << "Error: The BAM file (" << bamFile << ") is not sorted by sequence name. Exiting!" << endl; - exit (1); - } - else { - // process the current block of alignments - GetSeqsAndQuals(alignments, seq1, qual1, seq2, qual2); - FillMateSeqForReadBlock(alignments, refs, writer, seq1, qual1, seq2, qual2); - - // reset the alignment vector for the next block - // and add the first alignment in the next block - alignments.clear(); - alignments.push_back(bam); - } - } - else { - alignments.push_back(bam); - } - prevName = currName; - } - // process the last block - GetSeqsAndQuals(alignments, seq1, qual1, seq2, qual2); - FillMateSeqForReadBlock(alignments, refs, writer, seq1, qual1, seq2, qual2); - - // close up - reader.Close(); - writer.Close(); - } - else { - ShowHelp(); - } -} - - -void GetSeqsAndQuals(const vector<BamAlignment> &alignments, - string &seq1, - string &qual1, - string &seq2, - string &qual2) { - - bool haveSeq1, haveSeq2, haveQual1, haveQual2; - haveSeq1 = haveSeq2 = haveQual1 = haveQual2 = false; - - // extract the sequences for the first and second mates - - vector<BamAlignment>::const_iterator alItr = alignments.begin(); - vector<BamAlignment>::const_iterator alEnd = alignments.end(); - while ( (alItr != alEnd) && (haveSeq1 == false || haveSeq2 == false || - haveQual1 == false || haveQual2 == false)) { - - if ( alItr->IsFirstMate() ) { - seq1 = alItr->QueryBases; - qual1 = alItr->Qualities; - if ( alItr->IsReverseStrand() ) { - reverseComplement(seq1); - reverseSequence(qual1); - } - haveSeq1 = true; - haveQual1 = true; - } - else if ( alItr->IsSecondMate() ) { - seq2 = alItr->QueryBases; - qual2 = alItr->Qualities; - if ( alItr->IsReverseStrand() ) { - reverseComplement(seq2); - reverseSequence(qual2); - } - haveSeq2 = true; - haveQual2 = true; - } - ++alItr; - } - - //cerr << alignments.size() << endl; - cout << alignments[0].Qualities << endl; - //seq1 = alignments[0].QueryBases; - //qual1 = alignments[0].Qualities; - //seq2 = alignments[1].QueryBases; - //qual2 = alignments[1].Qualities; - seq1 = "seq1\0"; - seq2 = "seq2\0"; - qual1 = "88###########################################################################88"; - qual2 = "77###########################################################################77"; - - // IT MUST have to do with modifying the alignment itself. -} - - -void FillMateSeqForReadBlock (const vector<BamAlignment> &alignments, - const RefVector &refs, - BamWriter &writer, - const string &seq1, - const string &qual1, - const string &seq2, - const string &qual2) { - // now update each alignment with the appropriate - // sequence for it's mate - cerr << "call\n"; - string joe = qual1; - vector<BamAlignment>::const_iterator aItr = alignments.begin(); - vector<BamAlignment>::const_iterator aEnd = alignments.end(); - int i = 0; - for (; aItr != aEnd; ++aItr) { - - cerr << i << " " << aItr->Name << " [" << qual1 << "]" << "\t[" << qual2 << "]" << "\t" << joe << endl; - //if ( aItr->IsFirstMate() ) { - //aItr->AddBamTag("R2", "Z", seq2); - //aItr->AddBamTag("Q2", "Z", qual2); - //} - //else if ( aItr->IsSecondMate() ) { - //aItr->AddBamTag("R2", "Z", seq1); - //aItr->AddBamTag("Q2", "Z", qual1); - //} - //BamAlignment poop = *aItr; - writer.SaveAlignment(*aItr); - cerr << i << " " << aItr->Name << " [" << qual1 << "]" << "\t[" << qual2 << "]" << "\t" << joe << endl << endl; - i++; - } -} - - -// samtools sort comparison function from bam_sort.c -// used for sorting by query name. -int strnum_cmp(const char *a, const char *b) { - char *pa, *pb; - pa = (char*)a; pb = (char*)b; - while (*pa && *pb) { - if (isdigit(*pa) && isdigit(*pb)) { - long ai, bi; - ai = strtol(pa, &pa, 10); - bi = strtol(pb, &pb, 10); - if (ai != bi) return ai<bi? -1 : ai>bi? 1 : 0; - } else { - if (*pa != *pb) break; - ++pa; ++pb; - } - } - if (*pa == *pb) - return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0; - return *pa<*pb? -1 : *pa>*pb? 1 : 0; -} - - -void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Updates the R2 and Q2 BAM tag to include the mate's seq/qual." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bam> " << endl << endl; - -// cerr << "Options: " << endl; - -// cerr << "\t-bedpe\t" << "Write BEDPE format." << endl << endl; - - cerr << "Notes: " << endl; - - cerr << "\tInput BAM must be sorted by read name" << endl << endl; - - // end the program here - exit(1); -} - diff --git a/src/bamToBed/Makefile b/src/bamToBed/Makefile index d9baa942..34096050 100755 --- a/src/bamToBed/Makefile +++ b/src/bamToBed/Makefile @@ -8,14 +8,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= bamToBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=BamReader.o BGZF.o +_EXT_OBJECTS=BamReader.o BamAncillary.o BGZF.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= bamToBed diff --git a/src/bamToBed/bamToBed.cpp b/src/bamToBed/bamToBed.cpp index 684e835e..cb9d09af 100755 --- a/src/bamToBed/bamToBed.cpp +++ b/src/bamToBed/bamToBed.cpp @@ -11,7 +11,9 @@ ******************************************************************************/ #include "version.h" #include "BamReader.h" +#include "BamAncillary.h" #include "BamAux.h" +#include "bedFile.h" using namespace BamTools; #include <vector> @@ -34,10 +36,10 @@ using namespace std; void ShowHelp(void); void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, - const bool &writeBed12, const string &color); + const bool &writeBed12, const bool &obeySplits, const string &color); void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance); -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance); +void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, bool obeySplits); void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, string color = "255,0,0"); void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, bool useEditDistance); @@ -61,6 +63,7 @@ int main(int argc, char* argv[]) { bool writeBedPE = false; bool writeBed12 = false; bool useEditDistance = false; + bool obeySplits = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -93,7 +96,10 @@ int main(int argc, char* argv[]) { } else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { writeBed12 = true; - } + } + else if(PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { useEditDistance = true; } @@ -111,19 +117,23 @@ int main(int argc, char* argv[]) { } // make sure we have an input files - if (!haveBam ) { + if (haveBam == false) { cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; showHelp = true; } - if (haveColor && writeBed12 == false) { + if (haveColor == true && writeBed12 == false) { cerr << endl << "*****" << endl << "*****ERROR: Cannot use color without BED12. " << endl << "*****" << endl; showHelp = true; } + if (useEditDistance == true && obeySplits == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -splits. Edit distances cannot be computed for each \'chunk\'." << endl << "*****" << endl; + showHelp = true; + } // if there are no problems, let's convert BAM to BED or BEDPE if (!showHelp) { if (writeBedPE == false) - ConvertBamToBed(bamFile, useEditDistance, writeBed12, color); // BED or "blocked BED" + ConvertBamToBed(bamFile, useEditDistance, writeBed12, obeySplits, color); // BED or "blocked BED" else ConvertBamToBedpe(bamFile, useEditDistance); // BEDPE } @@ -150,7 +160,9 @@ void ShowHelp(void) { cerr << "\t-bed12\t" << "Write \"blocked\" BED format (aka \"BED12\")." << endl << endl; cerr << "\t\thttp://genome-test.cse.ucsc.edu/FAQ/FAQformat#format1" << endl << endl; - + + cerr << "\t-split\t" << "Report \"split\" BAM alignments as separate BED entries." << endl << endl; + cerr << "\t-ed\t" << "Use BAM edit distance (NM tag) for BED score." << endl; cerr << "\t\t- Default for BED is to use mapping quality." << endl; cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl; @@ -168,7 +180,7 @@ void ShowHelp(void) { void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, - const bool &writeBed12, const string &color) { + const bool &writeBed12, const bool &obeySplits, const string &color) { // open the BAM file BamReader reader; reader.Open(bamFile); @@ -182,7 +194,7 @@ void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, while (reader.GetNextAlignment(bam)) { if (bam.IsMapped() == true) { if (writeBed12 == false) // BED - PrintBed(bam, refs, useEditDistance); + PrintBed(bam, refs, useEditDistance, obeySplits); else //"blocked" BED PrintBed12(bam, refs, useEditDistance, color); } @@ -265,7 +277,7 @@ void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vec } -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance) { +void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, bool obeySplits) { // set the strand string strand = "+"; @@ -279,21 +291,39 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista // get the unpadded (parm = false) end position based on the CIGAR unsigned int alignmentEnd = bam.GetEndPosition(false); - // report the alignment in BED6 format. - if (useEditDistance == false) { - printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); + // report the entire BAM footprint as a single BED entry + if (obeySplits == false) { + // report the alignment in BED6 format. + if (useEditDistance == false) { + printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); + } + else { + uint8_t editDistance; + if (bam.GetEditDistance(editDistance)) { + printf("%s\t%d\t%d\t\%s\t%u\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), editDistance, strand.c_str()); + } + else { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } } + // Report each chunk of the BAM alignment as a discrete BED entry + // For example 10M100N10M would be reported as two seprate BED entries of length 10 else { - uint8_t editDistance; - if (bam.GetEditDistance(editDistance)) { - printf("%s\t%d\t%d\t\%s\t%u\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), editDistance, strand.c_str()); - } - else { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } + vector<BED> bedBlocks; + // Load the alignment blocks in bam into the bedBlocks vector. + // Don't trigger a new block when a "D" (deletion) CIGAR op is found. + getBamBlocks(bam, refs, bedBlocks, false); + + vector<BED>::const_iterator bedItr = bedBlocks.begin(); + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bedItr->start, + bedItr->end, name.c_str(), bam.MapQuality, strand.c_str()); + } } } diff --git a/src/bedToBam/Makefile b/src/bedToBam/Makefile index f5493cda..445b60d6 100755 --- a/src/bedToBam/Makefile +++ b/src/bedToBam/Makefile @@ -1,5 +1,5 @@ CXX= g++ -CXXFLAGS= -Wall -O2 +CXXFLAGS= -Wall -O2 -fopenmp LIBS= -lz UTILITIES_DIR = ../utils/ OBJ_DIR = ../../obj/ @@ -8,7 +8,7 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/genomeFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/genomeFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ # ---------------------------------- # define our source and object files diff --git a/src/bedToBam/bedToBam.cpp b/src/bedToBam/bedToBam.cpp index 1290e3cf..eaab9802 100755 --- a/src/bedToBam/bedToBam.cpp +++ b/src/bedToBam/bedToBam.cpp @@ -13,6 +13,8 @@ #include "bedFile.h" #include "genomeFile.h" #include "version.h" +#include <omp.h> + #include "BamWriter.h" #include "BamAux.h" @@ -155,7 +157,7 @@ void ShowHelp(void) { cerr << "Notes: " << endl; - cerr << "\t(1) BED files must be at least BED4 (needs name field)." << endl << endl; + cerr << "\t(1) BED files must be at least BED4 to be amenable to BAM (needs name field)." << endl << endl; // end the program here @@ -202,8 +204,7 @@ void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED1 BedLineStatus bedStatus; // open the BED file for reading. bed->Open(); - bedStatus = bed->GetNextBed(bedEntry, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { BamAlignment bamEntry; if (bed->bedType >= 4) { @@ -216,8 +217,10 @@ void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED1 } bedEntry = nullBed; } - bedStatus = bed->GetNextBed(bedEntry, lineNum); } + + + // close up bed->Close(); writer->Close(); diff --git a/src/closestBed/closestBed.cpp b/src/closestBed/closestBed.cpp index 07be3583..3dd0995e 100755 --- a/src/closestBed/closestBed.cpp +++ b/src/closestBed/closestBed.cpp @@ -41,30 +41,6 @@ BedClosest::~BedClosest(void) { } -/* - reportNullB - - Writes a NULL B entry for cases where no closest BED was found - Works for BED3 - BED6. -*/ -void BedClosest::reportNullB() { - if (_bedB->bedType == 3) { - printf("none\t-1\t-1\n"); - } - else if (_bedB->bedType == 4) { - printf("none\t-1\t-1\t-1\n"); - } - else if (_bedB->bedType == 5) { - printf("none\t-1\t-1\t-1\t-1\n"); - } - else if (_bedB->bedType == 6) { - printf("none\t-1\t-1\t-1\t-1\t-1\n"); - } -} - - - - void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { int slop = 0; // start out just looking for overlaps @@ -72,12 +48,12 @@ void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { // update the current feature's start and end - int aFudgeStart = 0; - int aFudgeEnd; + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; int numOverlaps = 0; vector<BED> closestB; float maxOverlap = 0; - int minDistance = 999999999; + CHRPOS minDistance = INT_MAX; if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) { @@ -143,7 +119,7 @@ void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { } else { _bedA->reportBedTab(a); - reportNullB(); + _bedB->reportNullBedNewLine(); } if (numOverlaps > 0) { @@ -186,14 +162,12 @@ void BedClosest::FindClosestBed() { _bedA->Open(); // process each entry in A in search of the closest feature in B - bedStatus = _bedA->GetNextBed(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { FindWindowOverlaps(a, hits); hits.clear(); a = nullBed; } - bedStatus = _bedA->GetNextBed(a, lineNum); } _bedA->Close(); } diff --git a/src/complementBed/complementBed.cpp b/src/complementBed/complementBed.cpp index 275455b3..15043c25 100755 --- a/src/complementBed/complementBed.cpp +++ b/src/complementBed/complementBed.cpp @@ -40,10 +40,11 @@ void BedComplement::ComplementBed() { string currChrom; // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - + masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); + masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); + for (; m != mEnd; ++m) { currChrom = m->first; - int currChromSize = _genome->getChromSize(currChrom); + CHRPOS currChromSize = _genome->getChromSize(currChrom); // bedList is already sorted by start position. vector<BED> bedList = m->second; @@ -63,26 +64,22 @@ void BedComplement::ComplementBed() { } // mask all of the positions spanned by this BED entry. - for (int b = bIt->start; b < bIt->end; b++) { + for (CHRPOS b = bIt->start; b < bIt->end; b++) chromMasks[b] = 1; - } } - unsigned int i = 0; - unsigned int start; + CHRPOS i = 0; + CHRPOS start; while (i < chromMasks.size()) { if (chromMasks[i] == 0) { start = i; - while ((chromMasks[i] == 0) && (i < chromMasks.size())) { + while ((chromMasks[i] == 0) && (i < chromMasks.size())) i++; - } - if (start > 0) { - cout << currChrom << "\t" << start << "\t" << i << endl; - } - else { - cout << currChrom << "\t" << 0 << "\t" << i << endl; - } + if (start > 0) + cout << currChrom << "\t" << start << "\t" << i << endl; + else + cout << currChrom << "\t" << 0 << "\t" << i << endl; } i++; } diff --git a/src/fastaFromBed/fastaFromBed.cpp b/src/fastaFromBed/fastaFromBed.cpp index 537a2abf..84af9e3d 100755 --- a/src/fastaFromBed/fastaFromBed.cpp +++ b/src/fastaFromBed/fastaFromBed.cpp @@ -63,7 +63,7 @@ void Bed2Fa::ReportDNA(const BED &bed, const string &currDNA, const string &curr string dna = currDNA.substr(bed.start, ((bed.end - bed.start))); // revcomp if necessary. Thanks to Thomas Doktor. - if ((_useStrand == true) && (bed.strand == '-')) + if ((_useStrand == true) && (bed.strand == "-")) reverseComplement(dna); if (!(_useName)) { diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp index 6059f351..db59b51c 100755 --- a/src/genomeCoverageBed/genomeCoverageBed.cpp +++ b/src/genomeCoverageBed/genomeCoverageBed.cpp @@ -214,7 +214,7 @@ void BedGenomeCoverage::CoverageBam(string bamFile) { // process the results of the last chromosome. ReportChromCoverage(chromCov, currChromSize, currChrom, chromDepthHist); - if (_eachBase == false && _bedGraph == false) { + if (_eachBase == false && _bedGraph == false && _bedGraphAll == false) { ReportGenomeCoverage(chromDepthHist); } diff --git a/src/linksBed/linksBed.cpp b/src/linksBed/linksBed.cpp index 9e6e8fe4..2e4a9116 100755 --- a/src/linksBed/linksBed.cpp +++ b/src/linksBed/linksBed.cpp @@ -106,14 +106,11 @@ void BedLinks::CreateLinks() { BedLineStatus bedStatus; _bed->Open(); - bedStatus = _bed->GetNextBed(bedEntry, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { - bedEntry.count = 0; WriteURL(bedEntry, base); bedEntry = nullBed; } - bedStatus = _bed->GetNextBed(bedEntry, lineNum); } _bed->Close(); diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp index 6720bf14..870c099e 100755 --- a/src/mergeBed/mergeBed.cpp +++ b/src/mergeBed/mergeBed.cpp @@ -66,8 +66,8 @@ void BedMerge::MergeBed() { // bedList is already sorted by start position. vector<BED> bedList = m->second; - int minStart = INT_MAX; - int maxEnd = 0; + CHRPOS minStart = INT_MAX; + CHRPOS maxEnd = 0; bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. int prev = -1; unsigned int curr = 0; @@ -180,8 +180,9 @@ void BedMerge::MergeBedStranded() { _bed->loadBedFileIntoMapNoBin(); // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - + masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); + masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); + for (; m != mEnd; ++m) { // bedList is already sorted by start position. vector<BED> bedList = m->second; @@ -193,8 +194,8 @@ void BedMerge::MergeBedStranded() { // do two passes, one for each strand. for (unsigned int s = 0; s < strands.size(); s++) { - int minStart = INT_MAX; - int maxEnd = 0; + CHRPOS minStart = INT_MAX; + CHRPOS maxEnd = 0; bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. int prev = -1; unsigned int curr = 0; diff --git a/src/pairToBed/pairToBed.cpp b/src/pairToBed/pairToBed.cpp index e8248c66..5485f96c 100755 --- a/src/pairToBed/pairToBed.cpp +++ b/src/pairToBed/pairToBed.cpp @@ -257,9 +257,9 @@ void BedIntersectPE::FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, con // count of hits on _between_ end of BEDPE // that exceed the requested overlap fraction int numOverlaps = 0; - int spanStart = 0; - int spanEnd = 0; - int spanLength = 0; + CHRPOS spanStart = 0; + CHRPOS spanEnd = 0; + CHRPOS spanLength = 0; if ((type == "ispan") || (type == "notispan")) { spanStart = a.end1; @@ -357,8 +357,7 @@ void BedIntersectPE::IntersectBedPE() { BedLineStatus bedStatus; _bedA->Open(); - bedStatus = _bedA->GetNextBedPE(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { if ( (_searchType == "ispan") || (_searchType == "ospan") || (_searchType == "notispan") || (_searchType == "notospan") ) { @@ -374,7 +373,6 @@ void BedIntersectPE::IntersectBedPE() { } a = nullBedPE; } - bedStatus = _bedA->GetNextBedPE(a, lineNum); } _bedA->Close(); } diff --git a/src/pairToBed/pairToBed.h b/src/pairToBed/pairToBed.h index 479c7bfe..66a26eb8 100755 --- a/src/pairToBed/pairToBed.h +++ b/src/pairToBed/pairToBed.h @@ -82,8 +82,8 @@ private: // initialize BEDPE variables a.start1 = a.start2 = a.end1 = a.end2 = -1; - a.chrom1 = a.chrom2 = a.strand1 = a.strand2 = "."; - + a.chrom1 = a.chrom2 = "."; + a.strand1 = a.strand2 = '.'; uint8_t editDistance1, editDistance2; editDistance1 = editDistance2 = 0; @@ -95,8 +95,8 @@ private: a.chrom1 = refs.at(bam1.RefID).RefName; a.start1 = bam1.Position; a.end1 = bam1.GetEndPosition(); - a.strand1 = "+"; - if (bam1.IsReverseStrand()) a.strand1 = "-"; + a.strand1 = '+'; + if (bam1.IsReverseStrand()) a.strand1 = '-'; // extract the edit distance from the NM tag // if possible. otherwise, complain. @@ -113,8 +113,8 @@ private: a.chrom2 = refs.at(bam2.RefID).RefName; a.start2 = bam2.Position; a.end2 = bam2.GetEndPosition(); - a.strand2 = "+"; - if (bam2.IsReverseStrand()) a.strand2 = "-"; + a.strand2 = '+'; + if (bam2.IsReverseStrand()) a.strand2 = '-'; // extract the edit distance from the NM tag // if possible. otherwise, complain. diff --git a/src/pairToPair/pairToPair.cpp b/src/pairToPair/pairToPair.cpp index 095b62f1..45b9d5a6 100755 --- a/src/pairToPair/pairToPair.cpp +++ b/src/pairToPair/pairToPair.cpp @@ -47,7 +47,7 @@ void PairToPair::IntersectPairs() { _bedB->loadBedPEFileIntoMap(); int lineNum = 0; - vector<BED> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; + vector<BEDCOV> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; // reserve some space hitsA1B1.reserve(100); hitsA1B2.reserve(100); hitsA2B1.reserve(100); hitsA2B2.reserve(100); @@ -55,16 +55,15 @@ void PairToPair::IntersectPairs() { BEDPE a, nullBedPE; _bedA->Open(); - bedStatus = _bedA->GetNextBedPE(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { + // identify overlaps b/w the pairs FindOverlaps(a, hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2, _searchType); // reset space for next BEDPE hitsA1B1.clear(); hitsA1B2.clear(); hitsA2B1.clear(); hitsA2B2.clear(); a = nullBedPE; } - bedStatus = _bedA->GetNextBedPE(a, lineNum); } _bedA->Close(); } @@ -72,15 +71,15 @@ void PairToPair::IntersectPairs() { -void PairToPair::FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hitsA1B2, - vector<BED> &hitsA2B1, vector<BED> &hitsA2B2, string type) { +void PairToPair::FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<BEDCOV> &hitsA1B2, + vector<BEDCOV> &hitsA2B1, vector<BEDCOV> &hitsA2B2, string type) { // list of hits on each end of BEDPE // that exceed the requested overlap fraction - vector<BED> qualityHitsA1B1; - vector<BED> qualityHitsA1B2; - vector<BED> qualityHitsA2B1; - vector<BED> qualityHitsA2B2; + vector<BEDCOV> qualityHitsA1B1; + vector<BEDCOV> qualityHitsA1B2; + vector<BEDCOV> qualityHitsA2B1; + vector<BEDCOV> qualityHitsA2B2; // count of hits on each end of BEDPE // that exceed the requested overlap fraction @@ -121,13 +120,13 @@ void PairToPair::FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> -void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BED> &hits, - vector<BED> &qualityHits, int &numOverlaps) { +void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BEDCOV> &hits, + vector<BEDCOV> &qualityHits, int &numOverlaps) { if (end == 1) { - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); + vector<BEDCOV>::const_iterator h = hits.begin(); + vector<BEDCOV>::const_iterator hitsEnd = hits.end(); for (; h != hitsEnd; ++h) { int s = max(a.start1, h->start); int e = min(a.end1, h->end); @@ -142,8 +141,8 @@ void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vecto } else if (end == 2) { - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); + vector<BEDCOV>::const_iterator h = hits.begin(); + vector<BEDCOV>::const_iterator hitsEnd = hits.end(); for (; h != hitsEnd; ++h) { int s = max(a.start2, h->start); int e = min(a.end2, h->end); @@ -157,25 +156,25 @@ void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vecto } -void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHitsEnd1, - const vector<BED> &qualityHitsEnd2, int &matchCount) { +void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, + const vector<BEDCOV> &qualityHitsEnd2, int &matchCount) { - map<unsigned int, vector<BED>, less<int> > hitsMap; + map<unsigned int, vector<BEDCOV>, less<int> > hitsMap; - for (vector<BED>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { + for (vector<BEDCOV>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { hitsMap[h->count].push_back(*h); matchCount++; } - for (vector<BED>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { + for (vector<BEDCOV>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { hitsMap[h->count].push_back(*h); matchCount++; } - for (map<unsigned int, vector<BED>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { + for (map<unsigned int, vector<BEDCOV>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { if (m->second.size() == 2) { - BED b1 = m->second[0]; - BED b2 = m->second[1]; + BEDCOV b1 = m->second[0]; + BEDCOV b2 = m->second[1]; if (_searchType == "both") { _bedA->reportBedPETab(a); diff --git a/src/pairToPair/pairToPair.h b/src/pairToPair/pairToPair.h index 3dd057d0..a87ee10b 100755 --- a/src/pairToPair/pairToPair.h +++ b/src/pairToPair/pairToPair.h @@ -35,14 +35,14 @@ public: void IntersectPairs(); - void FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hitsA1B2, - vector<BED> &hitsA2B1, vector<BED> &hitsA2B2, string type); + void FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<BEDCOV> &hitsA1B2, + vector<BEDCOV> &hitsA2B1, vector<BEDCOV> &hitsA2B2, string type); - void FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BED> &hits, - vector<BED> &qualityHits, int &numOverlaps); + void FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BEDCOV> &hits, + vector<BEDCOV> &qualityHits, int &numOverlaps); - void FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHitsEnd1, - const vector<BED> &qualityHitsEnd2, int &matchCount); + void FindHitsOnBothEnds(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, + const vector<BEDCOV> &qualityHitsEnd2, int &matchCount); private: diff --git a/src/shuffleBed/shuffleBed.cpp b/src/shuffleBed/shuffleBed.cpp index b08fb8d6..55446e32 100755 --- a/src/shuffleBed/shuffleBed.cpp +++ b/src/shuffleBed/shuffleBed.cpp @@ -71,14 +71,12 @@ void BedShuffle::Shuffle() { BedLineStatus bedStatus; _bed->Open(); - bedStatus = _bed->GetNextBed(bedEntry, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { ChooseLocus(bedEntry); _bed->reportBedNewLine(bedEntry); bedEntry = nullBed; } - bedStatus = _bed->GetNextBed(bedEntry, lineNum); } _bed->Close(); } @@ -88,72 +86,76 @@ void BedShuffle::Shuffle() { void BedShuffle::ShuffleWithExclusions() { int lineNum = 0; - BED bedEntry; // used to store the current BED line from the BED file. + BED bedEntry, nullBed; // used to store the current BED line from the BED file. + BedLineStatus bedStatus; vector<BED> hits; hits.reserve(100); _bed->Open(); - while (_bed->GetNextBed(bedEntry, lineNum)) { - - // choose a random locus - ChooseLocus(bedEntry); + while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + // choose a random locus + ChooseLocus(bedEntry); - // test to see if the chosen locus overlaps - // with an exclude region - _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, bedEntry.strand, hits, false); + // test to see if the chosen locus overlaps + // with an exclude region + _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, bedEntry.strand, hits, false); - bool haveOverlap = false; - vector<BED>::const_iterator hitsItr = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; hitsItr != hitsEnd; ++hitsItr) { - - int s = max(bedEntry.start, hitsItr->start); - int e = min(bedEntry.end, hitsItr->end); - - if ( (e - s) > 0) { - haveOverlap = true; - break; /* stop looking. one overlap is enough*/ - } - } + bool haveOverlap = false; + vector<BED>::const_iterator hitsItr = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; hitsItr != hitsEnd; ++hitsItr) { + + int s = max(bedEntry.start, hitsItr->start); + int e = min(bedEntry.end, hitsItr->end); + + if ( (e - s) > 0) { + haveOverlap = true; + break; /* stop looking. one overlap is enough*/ + } + } - /* - keep looking as long as the chosen - locus happens to overlap with regions - that the user wishes to exclude. - */ - int tries = 0; - while ((haveOverlap == true) && (tries <= MAX_TRIES)) { - - // choose a new locus - ChooseLocus(bedEntry); - - vector<BED> hits; - _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, - bedEntry.strand, hits, false); - - haveOverlap = false; - vector<BED>::const_iterator hitsItr = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; hitsItr != hitsEnd; ++hitsItr) { - - int s = max(bedEntry.start, hitsItr->start); - int e = min(bedEntry.end, hitsItr->end); - - if ( (e - s) > 0) { - haveOverlap = true; - break; /* stop looking. one overlap is enough*/ - } - } - tries++; - } + /* + keep looking as long as the chosen + locus happens to overlap with regions + that the user wishes to exclude. + */ + int tries = 0; + while ((haveOverlap == true) && (tries <= MAX_TRIES)) { + + // choose a new locus + ChooseLocus(bedEntry); + + vector<BED> hits; + _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, + bedEntry.strand, hits, false); + + haveOverlap = false; + vector<BED>::const_iterator hitsItr = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; hitsItr != hitsEnd; ++hitsItr) { + + int s = max(bedEntry.start, hitsItr->start); + int e = min(bedEntry.end, hitsItr->end); + + if ( (e - s) > 0) { + haveOverlap = true; + break; // stop looking. one overlap is enough + } + } + tries++; + } - if (tries > MAX_TRIES) { - cerr << "Error, line " << lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions. Ignoring entry and moving on." << endl; - } - else { - _bed->reportBedNewLine(bedEntry); - } + if (tries > MAX_TRIES) { + cerr << "Error, line " << lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions. Ignoring entry and moving on." << endl; + } + else { + _bed->reportBedNewLine(bedEntry); + } + } + bedEntry = nullBed; } + _bed->Close(); } @@ -161,13 +163,13 @@ void BedShuffle::ShuffleWithExclusions() { void BedShuffle::ChooseLocus(BED &bedEntry) { string chrom = bedEntry.chrom; - int start = bedEntry.start; - int end = bedEntry.end; - int length = end - start; + CHRPOS start = bedEntry.start; + CHRPOS end = bedEntry.end; + CHRPOS length = end - start; string randomChrom; - int randomStart; - int chromSize; + CHRPOS randomStart; + CHRPOS chromSize; if (_sameChrom == false) { randomChrom = _chroms[rand() % _numChroms]; diff --git a/src/slopBed/slopBed.cpp b/src/slopBed/slopBed.cpp index e70a2711..8da78f56 100755 --- a/src/slopBed/slopBed.cpp +++ b/src/slopBed/slopBed.cpp @@ -58,7 +58,7 @@ void BedSlop::AddSlop(BED &bed) { // special handling if the BED entry is on the negative // strand and the user cares about strandedness. - int chromSize = _genome->getChromSize(bed.chrom); + CHRPOS chromSize = _genome->getChromSize(bed.chrom); if ( (_forceStrand) && (bed.strand == "-") ) { // inspect the start diff --git a/src/subtractBed/subtractBed.cpp b/src/subtractBed/subtractBed.cpp index 94b92ba4..588462ad 100755 --- a/src/subtractBed/subtractBed.cpp +++ b/src/subtractBed/subtractBed.cpp @@ -133,11 +133,11 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { // report the remaining blocks. for (unsigned int i = 0; i < aKeep.size(); ++i) { if (aKeep[i] == true) { - int blockStart = i + a.start; + CHRPOS blockStart = i + a.start; while ((aKeep[i] == true) && (i < aKeep.size())) { i++; } - int blockEnd = i + a.start; + CHRPOS blockEnd = i + a.start; blockEnd = min(a.end, blockEnd); _bedA->reportBedRangeNewLine(a,blockStart,blockEnd); } @@ -162,14 +162,12 @@ void BedSubtract::SubtractBed() { hits.reserve(100); _bedA->Open(); - bedStatus = _bedA->GetNextBed(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { FindAndSubtractOverlaps(a, hits); hits.clear(); a = nullBed; - } - bedStatus = _bedA->GetNextBed(a, lineNum); + } } _bedA->Close(); diff --git a/src/utils/BamTools/BGZF.cpp b/src/utils/BamTools/BGZF.cpp old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BGZF.h b/src/utils/BamTools/BGZF.h old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BamAncillary.cpp b/src/utils/BamTools/BamAncillary.cpp new file mode 100755 index 00000000..b7e149be --- /dev/null +++ b/src/utils/BamTools/BamAncillary.cpp @@ -0,0 +1,65 @@ +/***************************************************************************** + bamAncillary.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0+ license. +******************************************************************************/ +#include "BamAncillary.h" +using namespace std; + +// 10 15 20 25 30 4000 +// acccctttggacct---ataggga.................aaaa +// acccc---ggaccttttataggga.................aaaa +// 5M 3D 6M 2I 7M 20N 4M + +namespace BamTools { + void getBamBlocks(const BamAlignment &bam, const RefVector &refs, + vector<BED> &blocks, bool includeDeletions) { + + CHRPOS currPosition = bam.Position; + CHRPOS blockStart = bam.Position; + string chrom = refs.at(bam.RefID).RefName; + + vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); + vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); + for ( ; cigItr != cigEnd; ++cigItr ) { + switch (cigItr->Type) { + case 'M': + currPosition += cigItr->Length; + blocks.push_back( BED(chrom, blockStart, currPosition) ); + break; + + case 'S': + case 'P': + case 'H': + case 'I': + // Insertion relative to the reference genome. + // Don't advance the current position, since no new nucleotides are covered. + break; + + case 'D': + if (includeDeletions == true) + currPosition += cigItr->Length; + else { + blocks.push_back( BED(chrom, blockStart, currPosition) ); + currPosition += cigItr->Length; + blockStart = currPosition; + } + case 'N': + currPosition += cigItr->Length; + blockStart = currPosition; + break; + + default: + cerr << "Input error: invalid CIGAR type (" << cigItr->Type + << ") for: " << bam.Name << endl; + exit(1); + } + } + } +} \ No newline at end of file diff --git a/src/utils/BamTools/BamAncillary.h b/src/utils/BamTools/BamAncillary.h new file mode 100755 index 00000000..1581efaa --- /dev/null +++ b/src/utils/BamTools/BamAncillary.h @@ -0,0 +1,18 @@ +/***************************************************************************** + bamAncillary.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0+ license. +******************************************************************************/ +#include "bedFile.h" +#include "BamAux.h" + +namespace BamTools { + void getBamBlocks(const BamAlignment &bam, const RefVector &refs, + vector<BED> &blocks, bool includeDeletions = true); +} \ No newline at end of file diff --git a/src/utils/BamTools/BamAux.h b/src/utils/BamTools/BamAux.h old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BamReader.cpp b/src/utils/BamTools/BamReader.cpp old mode 100644 new mode 100755 index a2f975f9..a4b6d0d4 --- a/src/utils/BamTools/BamReader.cpp +++ b/src/utils/BamTools/BamReader.cpp @@ -48,7 +48,7 @@ struct BamReader::BamReaderPrivate { int64_t AlignmentsBeginOffset; string Filename; string IndexFilename; - + // system data bool IsBigEndian; @@ -614,7 +614,7 @@ void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets const uint64_t& lastOffset) { // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; // resize vector if necessary @@ -639,7 +639,7 @@ bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { // read starts after left boundary if ( bAlignment.Position >= CurrentLeft) { return true; } - // return whether alignment end overlaps left boundary + // return whether alignment end overlaps left boundary return ( bAlignment.GetEndPosition() >= CurrentLeft ); } diff --git a/src/utils/BamTools/BamReader.h b/src/utils/BamTools/BamReader.h old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BamWriter.cpp b/src/utils/BamTools/BamWriter.cpp old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BamWriter.h b/src/utils/BamTools/BamWriter.h old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/Makefile b/src/utils/BamTools/Makefile old mode 100644 new mode 100755 index 1bac7b53..d07ad0c8 --- a/src/utils/BamTools/Makefile +++ b/src/utils/BamTools/Makefile @@ -3,11 +3,14 @@ CXXFLAGS = -O2 -Wall LDFLAGS = OBJ_DIR = ../../../obj/ BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../ + +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/gzstream/ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= BamReader.cpp BamWriter.cpp BGZF.cpp +SOURCES= BamReader.cpp BamWriter.cpp BGZF.cpp BamAncillary.cpp OBJECTS= $(SOURCES:.cpp=.o) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index c99dc78f..ec2fe6e1 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -16,7 +16,7 @@ /*********************************************** Sorting comparison functions ************************************************/ -bool sortByChrom(BED const & a, BED const & b) { +bool sortByChrom(BED const &a, BED const &b) { if (a.chrom < b.chrom) return true; else return false; }; @@ -54,8 +54,7 @@ bool sortByScoreDesc(const BED &a, const BED &b) { else return false; }; - -bool byChromThenStart(BED const & a, BED const & b) { +bool byChromThenStart(BED const &a, BED const &b) { if (a.chrom < b.chrom) return true; else if (a.chrom > b.chrom) return false; @@ -67,27 +66,6 @@ bool byChromThenStart(BED const & a, BED const & b) { }; - -/* - NOTE: Tweaked from kent source. -*/ -int getBin(CHRPOS start, CHRPOS end) { - --end; - start >>= _binFirstShift; - end >>= _binFirstShift; - - for (int i = 0; i < _binLevels; ++i) { - if (start == end) return _binOffsetsExtended[i] + start; - start >>= _binNextShift; - end >>= _binNextShift; - } - cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; - return 0; -} - - - - /******************************************* Class methods *******************************************/ @@ -178,18 +156,18 @@ BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum) { } -void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, vector<BED> &hits, bool forceStrand) { +void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand) { - int startBin, endBin; + BIN startBin, endBin; startBin = (start >> _binFirstShift); endBin = ((end-1) >> _binFirstShift); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { + for (BINLEVEL i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to @@ -214,21 +192,21 @@ void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char st } -bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, +bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, bool forceStrand, float overlapFraction) { - int startBin, endBin; - startBin = (start >> _binFirstShift); - endBin = ((end-1) >> _binFirstShift); + BIN startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); - int aLength = (end - start); + CHRPOS aLength = (end - start); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { + for (BINLEVEL i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to @@ -236,10 +214,10 @@ bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); for (; bedItr != bedEnd; ++bedItr) { - int s = max(start, bedItr->start); - int e = min(end, bedItr->end); + CHRPOS s = max(start, bedItr->start); + CHRPOS e = min(end, bedItr->end); // the number of overlapping bases b/w a and b - int overlapBases = (e - s); + CHRPOS overlapBases = (e - s); // do we have sufficient overlap? if ( (float) overlapBases / (float) aLength >= overlapFraction) { @@ -258,21 +236,21 @@ bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end } -bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, +bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, bool forceStrand, float overlapFraction) { - int startBin, endBin; + BIN startBin, endBin; startBin = (start >> _binFirstShift); endBin = ((end-1) >> _binFirstShift); - int aLength = (end - start); + CHRPOS aLength = (end - start); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { + for (BINLEVEL i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to @@ -280,14 +258,14 @@ bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); for (; bedItr != bedEnd; ++bedItr) { - int s = max(start, bedItr->start); - int e = min(end, bedItr->end); + CHRPOS s = max(start, bedItr->start); + CHRPOS e = min(end, bedItr->end); // the number of overlapping bases b/w a and b - int overlapBases = (e - s); + CHRPOS overlapBases = (e - s); // do we have sufficient overlap? if ( (float) overlapBases / (float) aLength >= overlapFraction) { - int bLength = (bedItr->end - bedItr->start); + CHRPOS bLength = (bedItr->end - bedItr->start); float bOverlap = ( (float) overlapBases / (float) bLength ); if ((forceStrand == false) && (bOverlap >= overlapFraction)) { return true; @@ -307,16 +285,16 @@ bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, void BedFile::countHits(const BED &a, bool forceStrand) { - int startBin, endBin; + BIN startBin, endBin; startBin = (a.start >> _binFirstShift); endBin = ((a.end-1) >> _binFirstShift); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { + for (BINLEVEL i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to @@ -353,6 +331,12 @@ void BedFile::setGff (bool gff) { } +void BedFile::setVcf (bool vcf) { + if (vcf == true) this->_isVcf = true; + else this->_isVcf = false; +} + + void BedFile::loadBedFileIntoMap() { BED bedEntry, nullBed; @@ -362,7 +346,7 @@ void BedFile::loadBedFileIntoMap() { Open(); while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { - int bin = getBin(bedEntry.start, bedEntry.end); + BIN bin = getBin(bedEntry.start, bedEntry.end); bedMap[bedEntry.chrom][bin].push_back(bedEntry); bedEntry = nullBed; } @@ -380,7 +364,7 @@ void BedFile::loadBedCovFileIntoMap() { Open(); while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { - int bin = getBin(bedEntry.start, bedEntry.end); + BIN bin = getBin(bedEntry.start, bedEntry.end); BEDCOV bedCov; bedCov.chrom = bedEntry.chrom; diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 36e048c1..5c67005c 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -32,14 +32,17 @@ using namespace std; // Data type tydedef //************************************************* typedef uint32_t CHRPOS; - +typedef uint16_t BINLEVEL; +typedef uint32_t BIN; +typedef uint16_t USHORT; +typedef uint16_t UINT; //************************************************* // Genome binning constants //************************************************* -const int _numBins = 37450; -const int _binLevels = 7; +const BIN _numBins = 37450; +const BINLEVEL _binLevels = 7; // bins range in size from 16kb to 512Mb // Bin 0 spans 512Mbp, # Level 1 @@ -48,11 +51,10 @@ const int _binLevels = 7; // Bins 73-584 span 1Mbp # Level 4 // Bins 585-4680 span 128Kbp # Level 5 // Bins 4681-37449 span 16Kbp # Level 6 -const int _binOffsetsExtended[] = - {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; +const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; -const int _binFirstShift = 14; /* How much to shift to get to finest bin. */ -const int _binNextShift = 3; /* How much to shift to get to next larger bin. */ +const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ +const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ //************************************************* @@ -60,29 +62,57 @@ const int _binNextShift = 3; /* How much to shift to get to next larger bin. * //************************************************* struct DEPTH { - unsigned int starts; - unsigned int ends; + UINT starts; + UINT ends; }; - /* Structure for regular BED records */ struct BED { // Regular BED fields + string chrom; CHRPOS start; CHRPOS end; - - string chrom; string name; string score; - char strand; + string strand; // Add'l fields for BED12 and/or custom BED annotations vector<string> otherFields; -}; + +public: + // constructors + + // Null + BED() + : chrom(""), + start(0), + end(0), + name(""), + score(""), + strand(""), + otherFields() + {} + + // BED3 + BED(string chrom, CHRPOS start, CHRPOS end) + : chrom(chrom), + start(start), + end(end) + {} + + // BED4 + BED(string chrom, CHRPOS start, CHRPOS end, string strand) + : chrom(chrom), + start(start), + end(end), + strand(strand) + {} +}; // BED + /* @@ -97,7 +127,7 @@ struct BEDCOV { string chrom; string name; string score; - char strand; + string strand; // Add'l fields for BED12 and/or custom BED annotations vector<string> otherFields; @@ -121,7 +151,19 @@ enum BedLineStatus // return the genome "bin" for a feature with this start and end inline -int getBin(CHRPOS start, CHRPOS end); +BIN getBin(CHRPOS start, CHRPOS end) { + --end; + start >>= _binFirstShift; + end >>= _binFirstShift; + + for (register short i = 0; i < _binLevels; ++i) { + if (start == end) return _binOffsetsExtended[i] + start; + start >>= _binNextShift; + end >>= _binNextShift; + } + cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; + return 0; +} // return the amount of overlap between two features. Negative if none and the the @@ -163,8 +205,8 @@ bool byChromThenStart(BED const &a, BED const &b); typedef vector<BED> bedVector; typedef vector<BEDCOV> bedCovVector; -typedef map<int, bedVector, std::less<int> > binsToBeds; -typedef map<int, bedCovVector, std::less<int> > binsToBedCovs; +typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; +typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; typedef map<string, binsToBeds, std::less<string> > masterBedMap; typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; @@ -217,14 +259,14 @@ public: // search for all overlapping features in another BED file. // Searches through each relevant genome bin on the same chromosome // as the single feature. Note: Adapted from kent source "binKeeperFind" - void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, vector<BED> &hits, bool forceStrand); + void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand); // return true if at least one overlap was found. otherwise, return false. - bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, + bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, bool forceStrand, float overlapFraction = 0.0); // return true if at least one __reciprocal__ overlap was found. otherwise, return false. - bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, + bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, bool forceStrand, float overlapFraction = 0.0); // Given a chrom, start, end and strand for a single feature, @@ -246,9 +288,11 @@ private: // data bool _isGff; + bool _isVcf; istream *_bedStream; void setGff (bool isGff); + void setVcf (bool isVcf); /****************************************************** @@ -284,6 +328,12 @@ private: if (parseBedLine(bed, lineVector, lineNum) == true) return BED_VALID; } + else if (p2End != lineVector[1].c_str()) { + setGff(false); + setVcf(true); + if (parseVcfLine(bed, lineVector, lineNum) == true) + return BED_VALID; + } else if (lineVector.size() == 9) { // otherwise test if columns 4 and 5 are integers. If so, assume GFF. l4 = strtol(lineVector[3].c_str(), &p4End, 10); @@ -339,12 +389,12 @@ private: else if (this->bedType == 6) { bed.name = lineVector[3]; bed.score = lineVector[4]; - bed.strand = lineVector[5][0]; + bed.strand = lineVector[5]; } else if (this->bedType > 6) { bed.name = lineVector[3]; bed.score = lineVector[4]; - bed.strand = lineVector[5][0]; + bed.strand = lineVector[5]; for (unsigned int i = 6; i < lineVector.size(); ++i) { bed.otherFields.push_back(lineVector[i]); } @@ -383,12 +433,12 @@ private: else if (this->bedType == 6) { bed.name = lineVector[3]; bed.score = lineVector[4]; - bed.strand = lineVector[5][0]; + bed.strand = lineVector[5]; } else if (this->bedType > 6) { bed.name = lineVector[3]; bed.score = lineVector[4]; - bed.strand = lineVector[5][0]; + bed.strand = lineVector[5]; for (unsigned int i = 6; i < lineVector.size(); ++i) { bed.otherFields.push_back(lineVector[i]); } @@ -425,6 +475,80 @@ private: } + /* + parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) + */ + template <typename T> + inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum) { + + if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { + + bed.chrom = lineVector[0]; + bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based + bed.end = atoi(lineVector[1].c_str()); // TO-DO: make this the size of the variant. + bed.strand = "+"; + + if (this->bedType > 2) { + for (unsigned int i = 2; i < lineVector.size(); ++i) { + bed.otherFields.push_back(lineVector[i]); + } + } + + if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { + return true; + } + else if (bed.start > bed.end) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + } + else if ((lineNum == 1) && (lineVector.size() >= 6)) { + this->bedType = lineVector.size(); + + bed.chrom = lineVector[0]; + bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based + bed.end = atoi(lineVector[1].c_str()); // TO-DO: make this the size of the variant. + bed.strand = "+"; + + if (this->bedType > 2) { + for (unsigned int i = 2; i < lineVector.size(); ++i) { + bed.otherFields.push_back(lineVector[i]); + } + } + + if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { + return true; + } + else if (bed.start > bed.end) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + } + else if (lineVector.size() == 1) { + cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { + cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() < 2) && (lineVector.size() != 0)) { + cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; + } + + + /* parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) */ @@ -552,12 +676,12 @@ public: bed.score.c_str()); } else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); } else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); vector<string>::const_iterator othIt = bed.otherFields.begin(); vector<string>::const_iterator othEnd = bed.otherFields.end(); @@ -567,9 +691,9 @@ public: } } else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), bed.name.c_str(), bed.start+1, bed.end, - bed.score.c_str(), bed.strand, + bed.score.c_str(), bed.strand.c_str(), bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); } } @@ -597,12 +721,12 @@ public: bed.score.c_str()); } else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); } else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); vector<string>::const_iterator othIt = bed.otherFields.begin(); vector<string>::const_iterator othEnd = bed.otherFields.end(); @@ -613,9 +737,9 @@ public: } } else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), bed.name.c_str(), bed.start+1, bed.end, - bed.score.c_str(), bed.strand, + bed.score.c_str(), bed.strand.c_str(), bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); } } @@ -644,12 +768,12 @@ public: bed.score.c_str()); } else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); } else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); vector<string>::const_iterator othIt = bed.otherFields.begin(); vector<string>::const_iterator othEnd = bed.otherFields.end(); @@ -659,9 +783,9 @@ public: } } else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand, + bed.score.c_str(), bed.strand.c_str(), bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); } } @@ -690,12 +814,12 @@ public: bed.score.c_str()); } else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); } else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); vector<string>::const_iterator othIt = bed.otherFields.begin(); vector<string>::const_iterator othEnd = bed.otherFields.end(); @@ -706,9 +830,9 @@ public: } } else if (this->bedType == 9) { // add 1 to the start for GFF - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand, + bed.score.c_str(), bed.strand.c_str(), bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); } } diff --git a/src/utils/bedFilePE/bedFilePE.cpp b/src/utils/bedFilePE/bedFilePE.cpp index 27c7a3cb..20a6ad1f 100755 --- a/src/utils/bedFilePE/bedFilePE.cpp +++ b/src/utils/bedFilePE/bedFilePE.cpp @@ -174,7 +174,7 @@ void BedFilePE::reportBedPENewLine(const BEDPE &a) { a.chrom2.c_str(), a.start2, a.end2, a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - vector<string>::const_iterator othIt = a.otherFields.begin(); + vector<string>::const_iterator othIt = a.otherFields.begin(); vector<string>::const_iterator othEnd = a.otherFields.end(); for ( ; othIt != othEnd; ++othIt) { printf("%s\t", othIt->c_str()); @@ -421,24 +421,24 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co /* Adapted from kent source "binKeeperFind" */ -void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, int start, int end, string strand, vector<BED> &hits, bool forceStrand) { +void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, vector<BEDCOV> &hits, bool forceStrand) { int startBin, endBin; startBin = (start >> _binFirstShift); endBin = ((end-1) >> _binFirstShift); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = binOffsetsExtended[i]; + int offset = _binOffsetsExtended[i]; for (int j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to // the list of hits. - vector<BED>::const_iterator bedItr; - vector<BED>::const_iterator bedEnd; + vector<BEDCOV>::const_iterator bedItr; + vector<BEDCOV>::const_iterator bedEnd; if (bEnd == 1) { bedItr = bedMapEnd1[chrom][j].begin(); bedEnd = bedMapEnd1[chrom][j].end(); @@ -480,7 +480,7 @@ void BedFilePE::loadBedPEFileIntoMap() { while (bedStatus != BED_INVALID) { if (bedStatus == BED_VALID) { - BED bedEntry1, bedEntry2; + BEDCOV bedEntry1, bedEntry2; // separate the BEDPE entry into separate // BED entries splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); @@ -501,7 +501,7 @@ void BedFilePE::loadBedPEFileIntoMap() { } -void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, BED &bedEntry1, BED &bedEntry2) { +void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, BEDCOV &bedEntry1, BEDCOV &bedEntry2) { /* Split the BEDPE entry into separate BED entries diff --git a/src/utils/bedFilePE/bedFilePE.h b/src/utils/bedFilePE/bedFilePE.h index 82750752..0e6be51b 100755 --- a/src/utils/bedFilePE/bedFilePE.h +++ b/src/utils/bedFilePE/bedFilePE.h @@ -22,12 +22,12 @@ struct BEDPE { // UCSC BED fields string chrom1; - int start1; - int end1; + CHRPOS start1; + CHRPOS end1; string chrom2; - int start2; - int end2; + CHRPOS start2; + CHRPOS end2; string name; string score; @@ -69,18 +69,18 @@ public: void reportBedPETab(const BEDPE &a); void reportBedPENewLine(const BEDPE &a); void loadBedPEFileIntoMap(); - void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, BED &bedEntry1, BED &bedEntry2); + void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, BEDCOV &bedEntry1, BEDCOV &bedEntry2); - void FindOverlapsPerBin(int bEnd, string chrom, int start, int end, string strand, - vector<BED> &hits, bool forceStrand); + void FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, + vector<BEDCOV> &hits, bool forceStrand); string bedFile; unsigned int bedType; - masterBedMap bedMapEnd1; - masterBedMap bedMapEnd2; + masterBedCovMap bedMapEnd1; + masterBedCovMap bedMapEnd2; private: istream *_bedStream; diff --git a/src/utils/gzstream/COPYING.LIB b/src/utils/gzstream/COPYING.LIB old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/Makefile b/src/utils/gzstream/Makefile old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/README b/src/utils/gzstream/README old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/gzstream.C b/src/utils/gzstream/gzstream.C old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/gzstream.h b/src/utils/gzstream/gzstream.h old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/gzstream.o b/src/utils/gzstream/gzstream.o old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/test_gunzip.o b/src/utils/gzstream/test_gunzip.o old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/test_gzip.o b/src/utils/gzstream/test_gzip.o old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/version b/src/utils/gzstream/version old mode 100644 new mode 100755 diff --git a/src/utils/version/version.h b/src/utils/version/version.h old mode 100644 new mode 100755 diff --git a/src/windowBed/windowBed.cpp b/src/windowBed/windowBed.cpp index 92938c6d..e1809f79 100755 --- a/src/windowBed/windowBed.cpp +++ b/src/windowBed/windowBed.cpp @@ -70,8 +70,8 @@ void BedWindow::FindWindowOverlaps(const BED &a, vector<BED> &hits) { // update the current feature's start and end // according to the slop requested (slop = 0 by default) - int aFudgeStart = 0; - int aFudgeEnd; + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; AddWindow(a, aFudgeStart, aFudgeEnd); /* @@ -118,8 +118,8 @@ bool BedWindow::FindOneOrMoreWindowOverlaps(const BED &a) { // update the current feature's start and end // according to the slop requested (slop = 0 by default) - int aFudgeStart = 0; - int aFudgeEnd; + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; AddWindow(a, aFudgeStart, aFudgeEnd); bool overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, _matchOnStrand); @@ -133,21 +133,19 @@ void BedWindow::WindowIntersectBed() { // that we can easily compare "A" to it for overlaps _bedB->loadBedFileIntoMap(); - BED a; + BED a, nullBed; int lineNum = 0; // current input line number BedLineStatus bedStatus; vector<BED> hits; // vector of potential hits hits.reserve(100); _bedA->Open(); - // process each entry in A - bedStatus = _bedA->GetNextBed(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { FindWindowOverlaps(a, hits); hits.clear(); + a = nullBed; } - bedStatus = _bedA->GetNextBed(a, lineNum); } _bedA->Close(); } @@ -196,7 +194,7 @@ void BedWindow::WindowIntersectBam(string bamFile) { if (bam.IsSecondMate()) a.name += "/2"; a.score = ToString(bam.MapQuality); - a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; + a.strand = '+'; if (bam.IsReverseStrand()) a.strand = '-'; if (_bamOutput == true) { overlapsFound = FindOneOrMoreWindowOverlaps(a); @@ -224,7 +222,7 @@ void BedWindow::WindowIntersectBam(string bamFile) { } -void BedWindow::AddWindow(const BED &a, int &fudgeStart, int &fudgeEnd) { +void BedWindow::AddWindow(const BED &a, CHRPOS &fudgeStart, CHRPOS &fudgeEnd) { // Does the user want to treat the windows based on strand? // If so, // if "+", then left is left and right is right diff --git a/src/windowBed/windowBed.h b/src/windowBed/windowBed.h index 4d9b3bbb..b1af9111 100755 --- a/src/windowBed/windowBed.h +++ b/src/windowBed/windowBed.h @@ -60,7 +60,7 @@ private: void WindowIntersectBam(string bamFile); void FindWindowOverlaps(const BED &a, vector<BED> &hits); bool FindOneOrMoreWindowOverlaps(const BED &a); - void AddWindow(const BED &a, int &fudgeStart, int &fudgeEnd); + void AddWindow(const BED &a, CHRPOS &fudgeStart, CHRPOS &fudgeEnd); }; #endif /* WINDOWBED_H */ -- GitLab