Skip to content
Snippets Groups Projects
Commit dc2a3049 authored by Aaron's avatar Aaron
Browse files

Improvements to shuffleBed, including:

1. A -f (overlapFraction) parameter that defines the maximum overlap
   that a randomized feature can have with an -excl feature.  That is, if a chosen locus
   has more than -f overlap with an -excl feature, a new locus is sought.

2. A new -incl option (thanks to Michael Hoffman and Davide Cittaro) that, defines
   intervals in which the randomized features should be placed.  This is used instead
   of placing the features randomly in the genome.  Note that a genome file is still
   required so that a randomized feature does not go beyond the end of a chromosome.
parent fdee9c46
No related branches found
No related tags found
No related merge requests found
......@@ -13,14 +13,19 @@
#include "shuffleBed.h"
BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, bool &haveSeed, bool &haveExclude, bool &sameChrom, int &seed) {
BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, string &includeFile,
bool haveSeed, bool haveExclude, bool haveInclude, bool sameChrom,
float overlapFraction, int seed) {
_bedFile = bedFile;
_genomeFile = genomeFile;
_excludeFile = excludeFile;
_sameChrom = sameChrom;
_haveExclude = haveExclude;
_haveSeed = haveSeed;
_bedFile = bedFile;
_genomeFile = genomeFile;
_excludeFile = excludeFile;
_includeFile = includeFile;
_sameChrom = sameChrom;
_haveExclude = haveExclude;
_haveInclude = haveInclude;
_overlapFraction = overlapFraction;
_haveSeed = haveSeed;
// use the supplied seed for the random
......@@ -44,19 +49,25 @@ BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile,
_exclude = new BedFile(excludeFile);
_exclude->loadBedFileIntoMap();
}
if (_bed->bedFile != "stdin") { // process a file
if (_haveExclude)
ShuffleWithExclusions();
else
Shuffle();
}
else { // process stdin
if (_haveExclude)
ShuffleWithExclusions();
else
Shuffle();
if (_haveInclude) {
_include = new BedFile(includeFile);
_include->loadBedFileIntoMapNoBin();
masterBedMapNoBin::const_iterator it = _include->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator itEnd = _include->bedMapNoBin.end();
for(; it != itEnd; ++it) {
_includeChroms.push_back(it->first);
_numIncludeChroms++;
}
}
if (_haveExclude == true && _haveInclude == false)
ShuffleWithExclusions();
else if (_haveExclude == false && _haveInclude == true)
ShuffleWithInclusions();
else
Shuffle();
}
......@@ -89,63 +100,24 @@ void BedShuffle::ShuffleWithExclusions() {
int lineNum = 0;
BED bedEntry, nullBed; // used to store the current BED line from the BED file.
BedLineStatus bedStatus;
vector<BED> hits;
hits.reserve(100);
_bed->Open();
while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
if (bedStatus == BED_VALID) {
// choose a random locus
ChooseLocus(bedEntry);
// test to see if the chosen locus overlaps
// with an exclude region
_exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, bedEntry.strand, hits, false);
// keep looking as long as the chosen
// locus happens to overlap with regions
// that the user wishes to exclude.
int tries = 0;
bool haveOverlap = false;
vector<BED>::const_iterator hitsItr = hits.begin();
vector<BED>::const_iterator hitsEnd = hits.end();
for (; hitsItr != hitsEnd; ++hitsItr) {
int s = max(bedEntry.start, hitsItr->start);
int e = min(bedEntry.end, hitsItr->end);
if ( (e - s) > 0) {
haveOverlap = true;
break; /* stop looking. one overlap is enough*/
}
}
/*
keep looking as long as the chosen
locus happens to overlap with regions
that the user wishes to exclude.
*/
int tries = 0;
while ((haveOverlap == true) && (tries <= MAX_TRIES)) {
do
{
// choose a new locus
ChooseLocus(bedEntry);
vector<BED> hits;
_exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end,
bedEntry.strand, hits, false);
haveOverlap = false;
vector<BED>::const_iterator hitsItr = hits.begin();
vector<BED>::const_iterator hitsEnd = hits.end();
for (; hitsItr != hitsEnd; ++hitsItr) {
int s = max(bedEntry.start, hitsItr->start);
int e = min(bedEntry.end, hitsItr->end);
if ( (e - s) > 0) {
haveOverlap = true;
break; // stop looking. one overlap is enough
}
}
haveOverlap = _exclude->FindOneOrMoreOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end,
bedEntry.strand, false, _overlapFraction);
tries++;
}
} while ((haveOverlap == true) && (tries <= MAX_TRIES));
if (tries > MAX_TRIES) {
cerr << "Error, line " << lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions. Ignoring entry and moving on." << endl;
......@@ -160,6 +132,24 @@ void BedShuffle::ShuffleWithExclusions() {
}
void BedShuffle::ShuffleWithInclusions() {
int lineNum = 0;
BED bedEntry, nullBed; // used to store the current BED line from the BED file.
BedLineStatus bedStatus;
_bed->Open();
while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
if (bedStatus == BED_VALID) {
// choose a new locus
ChooseLocusFromInclusionFile(bedEntry);
_bed->reportBedNewLine(bedEntry);
}
bedEntry = nullBed;
}
_bed->Close();
}
void BedShuffle::ChooseLocus(BED &bedEntry) {
......@@ -208,3 +198,44 @@ void BedShuffle::ChooseLocus(BED &bedEntry) {
}
}
void BedShuffle::ChooseLocusFromInclusionFile(BED &bedEntry) {
string chrom = bedEntry.chrom;
CHRPOS start = bedEntry.start;
CHRPOS end = bedEntry.end;
CHRPOS length = end - start;
string randomChrom;
CHRPOS randomStart;
BED includeInterval;
if (_sameChrom == false) {
// grab a random chromosome from the inclusion file.
randomChrom = _includeChroms[rand() % _numIncludeChroms];
// get the number of inclusion intervals for that chrom
size_t size = _include->bedMapNoBin[randomChrom].size();
// grab a random interval on the chosen chromosome.
size_t interval = rand() % size;
//cout << interval << endl;
includeInterval = _include->bedMapNoBin[randomChrom][interval];
bedEntry.chrom = randomChrom;
}
else {
// get the number of inclusion intervals for the original chrom
size_t size = _include->bedMapNoBin[chrom].size();
// grab a random interval on the chosen chromosome.
includeInterval = _include->bedMapNoBin[chrom][rand() % size];
}
randomStart = includeInterval.start + rand() % (includeInterval.size());
bedEntry.start = randomStart;
bedEntry.end = randomStart + length;
// ensure that the chosen location doesn't go past
if (bedEntry.end > ((size_t) _genome->getChromSize(chrom))) {
bedEntry.end = _genome->getChromSize(chrom);
}
}
......@@ -33,8 +33,9 @@ class BedShuffle {
public:
// constructor
BedShuffle(string &bedFile, string &genomeFile, string &excludeFile,
bool &haveSeed, bool &haveExclude, bool &sameChrom, int &seed);
BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, string &includeFile,
bool haveSeed, bool haveExclude, bool haveInclude, bool sameChrom,
float overlapFraction, int seed);
// destructor
~BedShuffle(void);
......@@ -44,24 +45,32 @@ private:
string _bedFile;
string _genomeFile;
string _excludeFile;
string _includeFile;
float _overlapFraction;
int _seed;
bool _sameChrom;
bool _haveExclude;
bool _haveInclude;
bool _haveSeed;
// The BED file from which to compute coverage.
BedFile *_bed;
BedFile *_exclude;
BedFile *_include;
GenomeFile *_genome;
vector<string> _chroms;
int _numChroms;
vector<string> _includeChroms;
int _numIncludeChroms;
// methods
void Shuffle();
void ShuffleWithExclusions();
void ShuffleWithInclusions();
void ChooseLocus(BED &);
void ChooseLocusFromInclusionFile(BED &);
};
......@@ -32,14 +32,17 @@ int main(int argc, char* argv[]) {
// input files
string bedFile = "stdin";
string excludeFile;
string includeFile;
string genomeFile;
bool haveBed = true;
bool haveGenome = false;
bool haveExclude = false;
bool haveSeed = false;
int seed = -1;
bool sameChrom = false;
bool haveBed = true;
bool haveGenome = false;
bool haveExclude = false;
bool haveInclude = false;
bool haveSeed = false;
float overlapFraction = 0.0;
int seed = -1;
bool sameChrom = false;
for(int i = 1; i < argc; i++) {
......@@ -78,6 +81,13 @@ int main(int argc, char* argv[]) {
i++;
}
}
else if(PARAMETER_CHECK("-incl", 5, parameterLength)) {
if ((i+1) < argc) {
haveInclude = true;
includeFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-seed", 5, parameterLength)) {
if ((i+1) < argc) {
haveSeed = true;
......@@ -88,6 +98,12 @@ int main(int argc, char* argv[]) {
else if(PARAMETER_CHECK("-chrom", 6, parameterLength)) {
sameChrom = true;
}
else if(PARAMETER_CHECK("-f", 2, parameterLength)) {
if ((i+1) < argc) {
overlapFraction = atof(argv[i + 1]);
i++;
}
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
......@@ -99,9 +115,16 @@ int main(int argc, char* argv[]) {
cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl;
showHelp = true;
}
if (!haveInclude && !haveExclude) {
cerr << endl << "*****" << endl << "*****ERROR: Cannot use -incl and -excl together." << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedShuffle *bc = new BedShuffle(bedFile, genomeFile, excludeFile, haveSeed, haveExclude, sameChrom, seed);
BedShuffle *bc = new BedShuffle(bedFile, genomeFile, excludeFile, includeFile,
haveSeed, haveExclude, haveInclude, sameChrom,
overlapFraction, seed);
delete bc;
return 0;
}
......@@ -124,6 +147,10 @@ void ShowHelp(void) {
cerr << "\t-excl\t" << "A BED/GFF/VCF file of coordinates in which features in -i" << endl;
cerr << "\t\tshould not be placed (e.g. gaps.bed)." << endl << endl;
cerr << "\t-incl\t" << "Instead of randomly placing features in a genome, the -incl" << endl;
cerr << "\t\toptions defines a BED/GFF/VCF file of coordinates in which " << endl;
cerr << "\t\tfeatures in -i should be randomly placed (e.g. genes.bed). " << endl << endl;
cerr << "\t-chrom\t" << "Keep features in -i on the same chromosome."<< endl;
cerr << "\t\t- By default, the chrom and position are randomly chosen." << endl << endl;
......@@ -131,6 +158,13 @@ void ShowHelp(void) {
cerr << "\t\t- By default, the seed is chosen automatically." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-f\t" << "Maximum overlap (as a fraction of the -i feature) with an -excl" << endl;
cerr << "\t\tfeature that is tolerated before searching for a new, " << endl;
cerr << "\t\trandomized locus. For example, -f 0.10 allows up to 10%" << endl;
cerr << "\t\tof a randomized feature to overlap with a given feature" << endl;
cerr << "\t\tin the -excl file. **Cannot be used with -incl file.**" << endl;
cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl;
cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment