Skip to content
Snippets Groups Projects
Commit cf64c2f2 authored by Aaron's avatar Aaron
Browse files

Fixed issue #55: mergeBed now reports 1-based starts for GFF files. Also,

greatly simplified the mergeBed algorithm.  MUCH more elegant now.
parent 2507d4b0
No related branches found
No related tags found
No related merge requests found
......@@ -13,13 +13,19 @@
#include "mergeBed.h"
// Writes a leading tab followed by every key of `names`, joined with ';'
// (no trailing separator), to standard output. The keys come out in the
// map's sorted order. No newline is written; the caller finishes the line.
// An empty map produces only the tab.
void ReportMergedNames(const std::map<std::string, bool> &names) {
    printf("\t");
    std::map<std::string, bool>::const_iterator nameItr = names.begin();
    std::map<std::string, bool>::const_iterator nameEnd = names.end();
    for (; nameItr != nameEnd; ++nameItr) {
        // Emit each name exactly once; the separator precedes every name
        // except the first, so the list never ends with a dangling ';'.
        if (nameItr != names.begin()) {
            std::cout << ";";
        }
        std::cout << nameItr->first;
    }
}
......@@ -51,6 +57,46 @@ BedMerge::~BedMerge(void) {
}
// ==================================================================
// Print one merged block as a tab-delimited line on standard output:
//   chrom, start, end [, extra column]
// The extra column is the merge count when _numEntries is set;
// otherwise it is the ';'-joined feature names when _reportNames is
// set; otherwise there is no extra column.
// When the input file is flagged as not zero-based, the start is
// bumped by one before it is printed.
// ==================================================================
void BedMerge::Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount) {
    const int shownStart = _bed->isZeroBased ? start : start + 1;
    printf("%s\t%d\t%d", chrom.c_str(), shownStart, end);

    // The count takes precedence over the name list when both are requested.
    if (_numEntries) {
        printf("\t%d\n", mergeCount);
        return;
    }
    if (_reportNames) {
        ReportMergedNames(names);
    }
    printf("\n");
}
// ==================================================================
// Strand-aware counterpart of Report. Prints one merged block as a
// tab-delimited line on standard output:
//   chrom, start, end [, extra column], strand
// The extra column is the merge count when _numEntries is set;
// otherwise it is the ';'-joined feature names when _reportNames is
// set; otherwise it is omitted. The strand is always the last column.
// When the input file is flagged as not zero-based, the start is
// bumped by one before it is printed.
// ==================================================================
void BedMerge::ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand) {
    const int shownStart = _bed->isZeroBased ? start : start + 1;
    printf("%s\t%d\t%d", chrom.c_str(), shownStart, end);

    // The count takes precedence over the name list when both are requested.
    if (_numEntries) {
        printf("\t%d", mergeCount);
    }
    else if (_reportNames) {
        ReportMergedNames(names);
    }

    // Every variant of the line ends with the strand column.
    printf("\t%s\n", strand.c_str());
}
// =====================================================
// = Merge overlapping BED entries into a single entry =
// =====================================================
......@@ -61,110 +107,41 @@ void BedMerge::MergeBed() {
_bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second;
CHRPOS minStart = INT_MAX;
CHRPOS maxEnd = 0;
bool OIP = false; // OIP = Overlap In Progress. Lame, I realize.
int prev = -1;
unsigned int curr = 0;
int mergeCount = 1;
map<string, bool> names;
// loop through the BED entries for this chromosome
// and look for overlaps
for (curr = 0; curr < bedList.size(); ++curr) {
// make sure prev points to an actual element
if (prev < 0) {
prev = curr;
continue;
}
// Is there an overlap between the current and previous entries?
if ( overlaps(bedList[prev].start, bedList[prev].end,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
OIP = true;
mergeCount++;
minStart = min(bedList[prev].start, min(minStart, bedList[curr].start));
maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end));
names[bedList[prev].name] = true;
names[bedList[curr].name] = true;
}
else if ( overlaps(minStart, maxEnd,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
mergeCount++;
minStart = min(minStart, bedList[curr].start);
maxEnd = max(maxEnd, bedList[curr].end);
names[bedList[curr].name] = true;
}
else {
// was there an overlap before the current entry broke it?
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << endl;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl;
}
}
else {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl;
}
else {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl;
}
// merge overlapping features for this chromosome.
int start = -1;
int end = -1;
vector<BED>::const_iterator bedItr = bedList.begin();
vector<BED>::const_iterator bedEnd = bedList.end();
for (; bedItr != bedEnd; ++bedItr) {
if ((int) bedItr->start > end) {
if (start >= 0) {
Report(chrom, start, end, names, mergeCount);
// reset
mergeCount = 1;
names.clear();
}
// reset things for the next overlapping "block"
OIP = false;
mergeCount = 1;
minStart = INT_MAX;
maxEnd = 0;
names.clear();
names[bedList[curr].name] = true;
}
prev = curr;
}
// clean up based on the last entry for the current chromosome
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << endl;
start = bedItr->start;
end = bedItr->end;
names[bedItr->name] = true;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl;
end = bedItr->end;
mergeCount++;
names[bedItr->name] = true;
}
}
else {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl;
}
else {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl;
}
if (start >= 0) {
Report(chrom, start, end, names, mergeCount);
}
}
}
......@@ -183,7 +160,9 @@ void BedMerge::MergeBedStranded() {
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second;
// make a list of the two strands to merge separately.
......@@ -194,120 +173,41 @@ void BedMerge::MergeBedStranded() {
// do two passes, one for each strand.
for (unsigned int s = 0; s < strands.size(); s++) {
CHRPOS minStart = INT_MAX;
CHRPOS maxEnd = 0;
bool OIP = false; // OIP = Overlap In Progress. Lame, I realize.
int prev = -1;
unsigned int curr = 0;
int mergeCount = 1;
int numOnStrand = 0;
map<string, bool> names;
// loop through the BED entries for this chromosome
// and look for overlaps
for (curr = 0; curr < bedList.size(); ++curr) {
// merge overlapping features for this chromosome.
int start = -1;
int end = -1;
vector<BED>::const_iterator bedItr = bedList.begin();
vector<BED>::const_iterator bedEnd = bedList.end();
for (; bedItr != bedEnd; ++bedItr) {
// if forcing strandedness, move on if the hit
// is not on the current strand.
if (bedList[curr].strand != strands[s]) {
continue; // continue force the next iteration of the for loop.
}
else {
numOnStrand++;
}
// make sure prev points to an actual element on the
// current strand
if (prev < 0) {
if (bedList[curr].strand == strands[s]) {
prev = curr;
}
continue;
}
if ( overlaps(bedList[prev].start, bedList[prev].end,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
OIP = true;
mergeCount++;
minStart = min(bedList[prev].start, min(minStart, bedList[curr].start));
maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end));
names[bedList[prev].name] = true;
names[bedList[curr].name] = true;
}
else if ( overlaps(minStart, maxEnd,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
mergeCount++;
minStart = min(minStart, bedList[curr].start);
maxEnd = max(maxEnd, bedList[curr].end);
names[bedList[curr].name] = true;
}
else {
// was there an overlap before the current entry broke it?
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << "\t" << strands[s] << endl;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl;
}
}
else {
if ((_numEntries) && (numOnStrand > 0)) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl;
}
else if (numOnStrand > 0) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl;
}
if (bedItr->strand != strands[s]) { continue; }
else { numOnStrand++; }
if ((int) bedItr->start > end) {
if (start >= 0) {
ReportStranded(chrom, start, end, names, mergeCount, strands[s]);
// reset
mergeCount = 1;
names.clear();
}
// reset things for the next overlapping "block"
OIP = false;
mergeCount = 1;
minStart = INT_MAX;
maxEnd = 0;
names.clear();
// add the name of the current element in prep for the next block
names[bedList[curr].name] = true;
}
prev = curr;
}
// clean up based on the last entry for the current chromosome
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << "\t" << strands[s] << endl;
start = bedItr->start;
end = bedItr->end;
names[bedItr->name] = true;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl;
end = bedItr->end;
mergeCount++;
names[bedItr->name] = true;
}
}
else {
if ((_numEntries) && (numOnStrand > 0)) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if ((_reportNames) && (numOnStrand > 0)) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl;
}
else if (numOnStrand > 0) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl;
}
if (start >= 0) {
ReportStranded(chrom, start, end, names, mergeCount, strands[s]);
}
}
}
......
......@@ -47,4 +47,6 @@ private:
// instance of a bed file class.
BedFile *_bed;
void Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount);
void ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand);
};
......@@ -466,17 +466,12 @@ void BedFile::countListHits(const BED &a, int index, bool forceStrand) {
}
}
void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; }
// Record whether the file being read has been identified as GFF.
// Only one definition is kept: the longer if/else form and this direct
// assignment did the same thing, and having both is a redefinition.
void BedFile::setGff (bool gff) { this->_isGff = gff; }
// Record whether the file being read has been identified as VCF.
// Only one definition is kept: the longer if/else form and this direct
// assignment did the same thing, and having both is a redefinition.
void BedFile::setVcf (bool vcf) { this->_isVcf = vcf; }
void BedFile::setFileType (FileType type) {
......
......@@ -396,6 +396,7 @@ public:
string bedFile;
unsigned int bedType; // 3-6, 12 for BED
// 9 for GFF
bool isZeroBased;
// Main data structures used by BEDTools
masterBedCovMap bedCovMap;
......@@ -412,6 +413,7 @@ private:
FileType _fileType; // what is the file type? (BED? GFF? VCF?)
istream *_bedStream;
void setZeroBased(bool zeroBased);
void setGff (bool isGff);
void setVcf (bool isVcf);
void setFileType (FileType type);
......@@ -459,6 +461,7 @@ private:
// it's BED format if columns 2 and 3 are integers
if (isInteger(lineVector[1]) && isInteger(lineVector[2])) {
setGff(false);
setZeroBased(true);
setFileType(BED_FILETYPE);
setBedType(numFields); // we now expect numFields columns in each line
if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
......@@ -467,6 +470,7 @@ private:
else if (isInteger(lineVector[1]) && numFields >= 8) {
setGff(false);
setVcf(true);
setZeroBased(false);
setFileType(VCF_FILETYPE);
setBedType(numFields); // we now expect numFields columns in each line
if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
......@@ -474,6 +478,7 @@ private:
// it's GFF, assuming columns 4 and 5 are numeric and we have at least 8 fields.
else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) {
setGff(true);
setZeroBased(false);
setFileType(GFF_FILETYPE);
setBedType(numFields); // we now expect numFields columns in each line
if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment