Skip to content
Snippets Groups Projects
Commit cf64c2f2 authored by Aaron's avatar Aaron
Browse files

Fixed issue #55: mergeBed now reports 1-based starts for GFF files. Also,

greatly simplified the mergeBed algorithm.  MUCH more elegant now.
parent 2507d4b0
No related branches found
No related tags found
No related merge requests found
...@@ -13,13 +13,19 @@ ...@@ -13,13 +13,19 @@
#include "mergeBed.h" #include "mergeBed.h"
void ReportMergedNames(const map<string, bool> &names) { void ReportMergedNames(const map<string, bool> &names) {
unsigned int n = 0; unsigned int n = 0;
printf("\t");
map<string, bool>::const_iterator nameItr = names.begin(); map<string, bool>::const_iterator nameItr = names.begin();
map<string, bool>::const_iterator nameEnd = names.end(); map<string, bool>::const_iterator nameEnd = names.end();
for (; nameItr != nameEnd; ++nameItr) { for (; nameItr != nameEnd; ++nameItr) {
if (n < (names.size() - 1)) {cout << nameItr->first << ";";} if (n < (names.size() - 1)) {
else {cout << nameItr->first;} cout << nameItr->first << ";";
}
else {
cout << nameItr->first;
}
n++; n++;
} }
} }
...@@ -51,6 +57,46 @@ BedMerge::~BedMerge(void) { ...@@ -51,6 +57,46 @@ BedMerge::~BedMerge(void) {
} }
// ===============================================
// Convenience method for reporting merged blocks
// ================================================
void BedMerge::Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount) {
if (_bed->isZeroBased == false) {start++;}
printf("%s\t%d\t%d", chrom.c_str(), start, end);
if (_numEntries == false && _reportNames == false) {
printf("\n");
}
else if (_numEntries) {
printf("\t%d\n", mergeCount);
}
else if (_reportNames) {
ReportMergedNames(names);
printf("\n");
}
}
// =========================================================
// Convenience method for reporting merged blocks by strand
// =========================================================
void BedMerge::ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand) {
if (_bed->isZeroBased == false) {start++;}
printf("%s\t%d\t%d", chrom.c_str(), start, end);
if (_numEntries == false && _reportNames == false) {
printf("\t%s\n", strand.c_str());
}
else if (_numEntries) {
printf("\t%d\t%s\n", mergeCount, strand.c_str());
}
else if (_reportNames) {
ReportMergedNames(names);
printf("\t%s\n", strand.c_str());
}
}
// ===================================================== // =====================================================
// = Merge overlapping BED entries into a single entry = // = Merge overlapping BED entries into a single entry =
// ===================================================== // =====================================================
...@@ -61,110 +107,41 @@ void BedMerge::MergeBed() { ...@@ -61,110 +107,41 @@ void BedMerge::MergeBed() {
_bed->loadBedFileIntoMapNoBin(); _bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries // loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position. // bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second; vector<BED> bedList = m->second;
CHRPOS minStart = INT_MAX;
CHRPOS maxEnd = 0;
bool OIP = false; // OIP = Overlap In Progress. Lame, I realize.
int prev = -1;
unsigned int curr = 0;
int mergeCount = 1; int mergeCount = 1;
map<string, bool> names; map<string, bool> names;
// loop through the BED entries for this chromosome // merge overlapping features for this chromosome.
// and look for overlaps int start = -1;
for (curr = 0; curr < bedList.size(); ++curr) { int end = -1;
vector<BED>::const_iterator bedItr = bedList.begin();
// make sure prev points to an actual element vector<BED>::const_iterator bedEnd = bedList.end();
if (prev < 0) { for (; bedItr != bedEnd; ++bedItr) {
prev = curr; if ((int) bedItr->start > end) {
continue; if (start >= 0) {
} Report(chrom, start, end, names, mergeCount);
// reset
// Is there an overlap between the current and previous entries? mergeCount = 1;
if ( overlaps(bedList[prev].start, bedList[prev].end, names.clear();
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
OIP = true;
mergeCount++;
minStart = min(bedList[prev].start, min(minStart, bedList[curr].start));
maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end));
names[bedList[prev].name] = true;
names[bedList[curr].name] = true;
}
else if ( overlaps(minStart, maxEnd,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
mergeCount++;
minStart = min(minStart, bedList[curr].start);
maxEnd = max(maxEnd, bedList[curr].end);
names[bedList[curr].name] = true;
}
else {
// was there an overlap befor the current entry broke it?
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << endl;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl;
}
}
else {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl;
}
else {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl;
}
} }
start = bedItr->start;
// reset things for the next overlapping "block" end = bedItr->end;
OIP = false; names[bedItr->name] = true;
mergeCount = 1;
minStart = INT_MAX;
maxEnd = 0;
names.clear();
names[bedList[curr].name] = true;
}
prev = curr;
}
// clean up based on the last entry for the current chromosome
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << endl;
} }
else { else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl; end = bedItr->end;
mergeCount++;
names[bedItr->name] = true;
} }
} }
else { if (start >= 0) {
if (_numEntries) { Report(chrom, start, end, names, mergeCount);
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl;
}
else {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl;
}
} }
} }
} }
...@@ -183,7 +160,9 @@ void BedMerge::MergeBedStranded() { ...@@ -183,7 +160,9 @@ void BedMerge::MergeBedStranded() {
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) { for (; m != mEnd; ++m) {
// bedList is already sorted by start position. // bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second; vector<BED> bedList = m->second;
// make a list of the two strands to merge separately. // make a list of the two strands to merge separately.
...@@ -194,120 +173,41 @@ void BedMerge::MergeBedStranded() { ...@@ -194,120 +173,41 @@ void BedMerge::MergeBedStranded() {
// do two passes, one for each strand. // do two passes, one for each strand.
for (unsigned int s = 0; s < strands.size(); s++) { for (unsigned int s = 0; s < strands.size(); s++) {
CHRPOS minStart = INT_MAX;
CHRPOS maxEnd = 0;
bool OIP = false; // OIP = Overlap In Progress. Lame, I realize.
int prev = -1;
unsigned int curr = 0;
int mergeCount = 1; int mergeCount = 1;
int numOnStrand = 0; int numOnStrand = 0;
map<string, bool> names; map<string, bool> names;
// loop through the BED entries for this chromosome // merge overlapping features for this chromosome.
// and look for overlaps int start = -1;
for (curr = 0; curr < bedList.size(); ++curr) { int end = -1;
vector<BED>::const_iterator bedItr = bedList.begin();
vector<BED>::const_iterator bedEnd = bedList.end();
for (; bedItr != bedEnd; ++bedItr) {
// if forcing strandedness, move on if the hit // if forcing strandedness, move on if the hit
// is not on the current strand. // is not on the current strand.
if (bedItr->strand != strands[s]) { continue; }
if (bedList[curr].strand != strands[s]) { else { numOnStrand++; }
continue; // continue force the next iteration of the for loop.
} if ((int) bedItr->start > end) {
else { if (start >= 0) {
numOnStrand++; ReportStranded(chrom, start, end, names, mergeCount, strands[s]);
} // reset
mergeCount = 1;
// make sure prev points to an actual element on the names.clear();
// current strand
if (prev < 0) {
if (bedList[curr].strand == strands[s]) {
prev = curr;
}
continue;
}
if ( overlaps(bedList[prev].start, bedList[prev].end,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
OIP = true;
mergeCount++;
minStart = min(bedList[prev].start, min(minStart, bedList[curr].start));
maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end));
names[bedList[prev].name] = true;
names[bedList[curr].name] = true;
}
else if ( overlaps(minStart, maxEnd,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
mergeCount++;
minStart = min(minStart, bedList[curr].start);
maxEnd = max(maxEnd, bedList[curr].end);
names[bedList[curr].name] = true;
}
else {
// was there an overlap before the current entry broke it?
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << "\t" << strands[s] << endl;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl;
}
}
else {
if ((_numEntries) && (numOnStrand > 0)) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl;
}
else if (numOnStrand > 0) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl;
}
} }
start = bedItr->start;
// reset things for the next overlapping "block" end = bedItr->end;
OIP = false; names[bedItr->name] = true;
mergeCount = 1;
minStart = INT_MAX;
maxEnd = 0;
names.clear();
// add the name of the current element in prep for the next block
names[bedList[curr].name] = true;
}
prev = curr;
}
// clean up based on the last entry for the current chromosome
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << "\t" << strands[s] << endl;
} }
else { else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; end = bedItr->end;
mergeCount++;
names[bedItr->name] = true;
} }
} }
else { if (start >= 0) {
if ((_numEntries) && (numOnStrand > 0)) { ReportStranded(chrom, start, end, names, mergeCount, strands[s]);
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if ((_reportNames) && (numOnStrand > 0)) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl;
}
else if (numOnStrand > 0) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl;
}
} }
} }
} }
......
...@@ -47,4 +47,6 @@ private: ...@@ -47,4 +47,6 @@ private:
// instance of a bed file class. // instance of a bed file class.
BedFile *_bed; BedFile *_bed;
void Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount);
void ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand);
}; };
...@@ -466,17 +466,12 @@ void BedFile::countListHits(const BED &a, int index, bool forceStrand) { ...@@ -466,17 +466,12 @@ void BedFile::countListHits(const BED &a, int index, bool forceStrand) {
} }
} }
void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; }
void BedFile::setGff (bool gff) { void BedFile::setGff (bool gff) { this->_isGff = gff; }
if (gff == true) this->_isGff = true;
else this->_isGff = false;
}
void BedFile::setVcf (bool vcf) { void BedFile::setVcf (bool vcf) { this->_isVcf = vcf; }
if (vcf == true) this->_isVcf = true;
else this->_isVcf = false;
}
void BedFile::setFileType (FileType type) { void BedFile::setFileType (FileType type) {
......
...@@ -396,6 +396,7 @@ public: ...@@ -396,6 +396,7 @@ public:
string bedFile; string bedFile;
unsigned int bedType; // 3-6, 12 for BED unsigned int bedType; // 3-6, 12 for BED
// 9 for GFF // 9 for GFF
bool isZeroBased;
// Main data structires used by BEDTools // Main data structires used by BEDTools
masterBedCovMap bedCovMap; masterBedCovMap bedCovMap;
...@@ -412,6 +413,7 @@ private: ...@@ -412,6 +413,7 @@ private:
FileType _fileType; // what is the file type? (BED? GFF? VCF?) FileType _fileType; // what is the file type? (BED? GFF? VCF?)
istream *_bedStream; istream *_bedStream;
void setZeroBased(bool zeroBased);
void setGff (bool isGff); void setGff (bool isGff);
void setVcf (bool isVcf); void setVcf (bool isVcf);
void setFileType (FileType type); void setFileType (FileType type);
...@@ -459,6 +461,7 @@ private: ...@@ -459,6 +461,7 @@ private:
// it's BED format if columns 2 and 3 are integers // it's BED format if columns 2 and 3 are integers
if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { if (isInteger(lineVector[1]) && isInteger(lineVector[2])) {
setGff(false); setGff(false);
setZeroBased(true);
setFileType(BED_FILETYPE); setFileType(BED_FILETYPE);
setBedType(numFields); // we now expect numFields columns in each line setBedType(numFields); // we now expect numFields columns in each line
if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
...@@ -467,6 +470,7 @@ private: ...@@ -467,6 +470,7 @@ private:
else if (isInteger(lineVector[1]) && numFields >= 8) { else if (isInteger(lineVector[1]) && numFields >= 8) {
setGff(false); setGff(false);
setVcf(true); setVcf(true);
setZeroBased(false);
setFileType(VCF_FILETYPE); setFileType(VCF_FILETYPE);
setBedType(numFields); // we now expect numFields columns in each line setBedType(numFields); // we now expect numFields columns in each line
if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
...@@ -474,6 +478,7 @@ private: ...@@ -474,6 +478,7 @@ private:
// it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total.
else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) {
setGff(true); setGff(true);
setZeroBased(false);
setFileType(GFF_FILETYPE); setFileType(GFF_FILETYPE);
setBedType(numFields); // we now expect numFields columns in each line setBedType(numFields); // we now expect numFields columns in each line
if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment