diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp index 4cb56e762780a87a2a40d12ec073787226b40877..66af69c368c89d37f937803c42d41448fc223b8f 100644 --- a/src/mergeBed/mergeBed.cpp +++ b/src/mergeBed/mergeBed.cpp @@ -13,13 +13,19 @@ #include "mergeBed.h" + void ReportMergedNames(const map<string, bool> &names) { unsigned int n = 0; + printf("\t"); map<string, bool>::const_iterator nameItr = names.begin(); map<string, bool>::const_iterator nameEnd = names.end(); for (; nameItr != nameEnd; ++nameItr) { - if (n < (names.size() - 1)) {cout << nameItr->first << ";";} - else {cout << nameItr->first;} + if (n < (names.size() - 1)) { + cout << nameItr->first << ";"; + } + else { + cout << nameItr->first; + } n++; } } @@ -51,6 +57,46 @@ BedMerge::~BedMerge(void) { } +// =============================================== +// Convenience method for reporting merged blocks +// ================================================ +void BedMerge::Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount) { + if (_bed->isZeroBased == false) {start++;} + + printf("%s\t%d\t%d", chrom.c_str(), start, end); + if (_numEntries == false && _reportNames == false) { + printf("\n"); + } + else if (_numEntries) { + printf("\t%d\n", mergeCount); + } + else if (_reportNames) { + ReportMergedNames(names); + printf("\n"); + } +} + + +// ========================================================= +// Convenience method for reporting merged blocks by strand +// ========================================================= +void BedMerge::ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand) { + if (_bed->isZeroBased == false) {start++;} + + printf("%s\t%d\t%d", chrom.c_str(), start, end); + if (_numEntries == false && _reportNames == false) { + printf("\t%s\n", strand.c_str()); + } + else if (_numEntries) { + printf("\t%d\t%s\n", mergeCount, strand.c_str()); + } + else if (_reportNames) { + ReportMergedNames(names); + printf("\t%s\n", strand.c_str()); + } +} + + // ===================================================== // = Merge overlapping BED entries into a single entry = // ===================================================== @@ -61,110 +107,41 @@ void BedMerge::MergeBed() { _bed->loadBedFileIntoMapNoBin(); // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { + masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); + masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); + for (; m != mEnd; ++m) { // bedList is already sorted by start position. + string chrom = m->first; vector<BED> bedList = m->second; - - CHRPOS minStart = INT_MAX; - CHRPOS maxEnd = 0; - bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. - int prev = -1; - unsigned int curr = 0; int mergeCount = 1; map<string, bool> names; - // loop through the BED entries for this chromosome - // and look for overlaps - for (curr = 0; curr < bedList.size(); ++curr) { - - // make sure prev points to an actual element - if (prev < 0) { - prev = curr; - continue; - } - - // Is there an overlap between the current and previous entries? - if ( overlaps(bedList[prev].start, bedList[prev].end, - bedList[curr].start, bedList[curr].end) >= _maxDistance) { - OIP = true; - mergeCount++; - minStart = min(bedList[prev].start, min(minStart, bedList[curr].start)); - maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end)); - - names[bedList[prev].name] = true; - names[bedList[curr].name] = true; - } - else if ( overlaps(minStart, maxEnd, - bedList[curr].start, bedList[curr].end) >= _maxDistance) { - mergeCount++; - minStart = min(minStart, bedList[curr].start); - maxEnd = max(maxEnd, bedList[curr].end); - names[bedList[curr].name] = true; - } - else { - // was there an overlap befor the current entry broke it? - if (OIP) { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; - ReportMergedNames(names); - cout << endl; - } - else { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl; - } - } - else { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl; - } - else { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl; - } + // merge overlapping features for this chromosome. + int start = -1; + int end = -1; + vector<BED>::const_iterator bedItr = bedList.begin(); + vector<BED>::const_iterator bedEnd = bedList.end(); + for (; bedItr != bedEnd; ++bedItr) { + if ((int) bedItr->start > end) { + if (start >= 0) { + Report(chrom, start, end, names, mergeCount); + // reset + mergeCount = 1; + names.clear(); } - - // reset things for the next overlapping "block" - OIP = false; - mergeCount = 1; - minStart = INT_MAX; - maxEnd = 0; - - names.clear(); - names[bedList[curr].name] = true; - } - prev = curr; - } - - // clean up based on the last entry for the current chromosome - if (OIP) { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; - ReportMergedNames(names); - cout << endl; + start = bedItr->start; + end = bedItr->end; + names[bedItr->name] = true; } else { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl; + end = bedItr->end; + mergeCount++; + names[bedItr->name] = true; } } - else { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl; - } - else { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl; - } + if (start >= 0) { + Report(chrom, start, end, names, mergeCount); } } } @@ -183,7 +160,9 @@ void BedMerge::MergeBedStranded() { masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); for (; m != mEnd; ++m) { + // bedList is already sorted by start position. + string chrom = m->first; vector<BED> bedList = m->second; // make a list of the two strands to merge separately. @@ -194,120 +173,41 @@ void BedMerge::MergeBedStranded() { // do two passes, one for each strand. for (unsigned int s = 0; s < strands.size(); s++) { - CHRPOS minStart = INT_MAX; - CHRPOS maxEnd = 0; - bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. - int prev = -1; - unsigned int curr = 0; int mergeCount = 1; int numOnStrand = 0; map<string, bool> names; - // loop through the BED entries for this chromosome - // and look for overlaps - for (curr = 0; curr < bedList.size(); ++curr) { + // merge overlapping features for this chromosome. + int start = -1; + int end = -1; + vector<BED>::const_iterator bedItr = bedList.begin(); + vector<BED>::const_iterator bedEnd = bedList.end(); + for (; bedItr != bedEnd; ++bedItr) { // if forcing strandedness, move on if the hit // is not on the current strand. - - if (bedList[curr].strand != strands[s]) { - continue; // continue force the next iteration of the for loop. - } - else { - numOnStrand++; - } - - // make sure prev points to an actual element on the - // current strand - if (prev < 0) { - if (bedList[curr].strand == strands[s]) { - prev = curr; - } - continue; - } - - if ( overlaps(bedList[prev].start, bedList[prev].end, - bedList[curr].start, bedList[curr].end) >= _maxDistance) { - OIP = true; - mergeCount++; - minStart = min(bedList[prev].start, min(minStart, bedList[curr].start)); - maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end)); - - names[bedList[prev].name] = true; - names[bedList[curr].name] = true; - } - else if ( overlaps(minStart, maxEnd, - bedList[curr].start, bedList[curr].end) >= _maxDistance) { - mergeCount++; - minStart = min(minStart, bedList[curr].start); - maxEnd = max(maxEnd, bedList[curr].end); - names[bedList[curr].name] = true; - } - else { - - // was there an overlap before the current entry broke it? - if (OIP) { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; - ReportMergedNames(names); - cout << "\t" << strands[s] << endl; - } - else { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; - } - } - else { - if ((_numEntries) && (numOnStrand > 0)) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl; - } - else if (numOnStrand > 0) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; - } + if (bedItr->strand != strands[s]) { continue; } + else { numOnStrand++; } + + if ((int) bedItr->start > end) { + if (start >= 0) { + ReportStranded(chrom, start, end, names, mergeCount, strands[s]); + // reset + mergeCount = 1; + names.clear(); } - - // reset things for the next overlapping "block" - OIP = false; - mergeCount = 1; - minStart = INT_MAX; - maxEnd = 0; - names.clear(); - - // add the name of the current element in prep for the next block - names[bedList[curr].name] = true; - } - prev = curr; - } - - // clean up based on the last entry for the current chromosome - if (OIP) { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; - ReportMergedNames(names); - cout << "\t" << strands[s] << endl; + start = bedItr->start; + end = bedItr->end; + names[bedItr->name] = true; } else { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; + end = bedItr->end; + mergeCount++; + names[bedItr->name] = true; } } - else { - if ((_numEntries) && (numOnStrand > 0)) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; - } - else if ((_reportNames) && (numOnStrand > 0)) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl; - } - else if (numOnStrand > 0) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; - } + if (start >= 0) { + ReportStranded(chrom, start, end, names, mergeCount, strands[s]); } } } diff --git a/src/mergeBed/mergeBed.h b/src/mergeBed/mergeBed.h index 37b721565bd39a87a62542f117aea9561e2a9c66..d0cb59f92751d7714f817035a80e5fd1515177da 100644 --- a/src/mergeBed/mergeBed.h +++ b/src/mergeBed/mergeBed.h @@ -47,4 +47,6 @@ private: // instance of a bed file class. BedFile *_bed; + void Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount); + void ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand); }; diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 9f243dc3f1d0169321a9b2f36b2de8724c03aeaf..991d9015b285eeabd552610c8e25a7aca89d2f4d 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -466,17 +466,12 @@ void BedFile::countListHits(const BED &a, int index, bool forceStrand) { } } +void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; } -void BedFile::setGff (bool gff) { - if (gff == true) this->_isGff = true; - else this->_isGff = false; -} +void BedFile::setGff (bool gff) { this->_isGff = gff; } -void BedFile::setVcf (bool vcf) { - if (vcf == true) this->_isVcf = true; - else this->_isVcf = false; -} +void BedFile::setVcf (bool vcf) { this->_isVcf = vcf; } void BedFile::setFileType (FileType type) { diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 141765da1c5d2d4c5e6002024b7c2027d0f4551e..c514f7107677d6b77ee4f109fa78fd4b1fc9676e 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -396,6 +396,7 @@ public: string bedFile; unsigned int bedType; // 3-6, 12 for BED // 9 for GFF + bool isZeroBased; // Main data structires used by BEDTools masterBedCovMap bedCovMap; @@ -412,6 +413,7 @@ private: FileType _fileType; // what is the file type? (BED? GFF? VCF?) istream *_bedStream; + void setZeroBased(bool zeroBased); void setGff (bool isGff); void setVcf (bool isVcf); void setFileType (FileType type); @@ -459,6 +461,7 @@ private: // it's BED format if columns 2 and 3 are integers if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { setGff(false); + setZeroBased(true); setFileType(BED_FILETYPE); setBedType(numFields); // we now expect numFields columns in each line if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; @@ -467,6 +470,7 @@ private: else if (isInteger(lineVector[1]) && numFields >= 8) { setGff(false); setVcf(true); + setZeroBased(false); setFileType(VCF_FILETYPE); setBedType(numFields); // we now expect numFields columns in each line if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; @@ -474,6 +478,7 @@ private: // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { setGff(true); + setZeroBased(false); setFileType(GFF_FILETYPE); setBedType(numFields); // we now expect numFields columns in each line if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;