Skip to content
Snippets Groups Projects
Commit 956dc0f3 authored by Aaron's avatar Aaron
Browse files

shuffleBed now weights by chrom size by def.

parent 166bee78
No related branches found
No related tags found
No related merge requests found
......@@ -15,7 +15,7 @@
BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, string &includeFile,
bool haveSeed, bool haveExclude, bool haveInclude, bool sameChrom,
float overlapFraction, int seed) {
float overlapFraction, int seed, bool chooseChrom) {
_bedFile = bedFile;
_genomeFile = genomeFile;
......@@ -26,6 +26,7 @@ BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile,
_haveInclude = haveInclude;
_overlapFraction = overlapFraction;
_haveSeed = haveSeed;
_chooseChrom = chooseChrom;
// use the supplied seed for the random
......@@ -45,6 +46,7 @@ BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile,
_genome = new GenomeFile(genomeFile);
_chroms = _genome->getChromList();
_numChroms = _genome->getNumberOfChroms();
_genomeSize = _genome->getGenomeSize();
if (_haveExclude) {
_exclude = new BedFile(excludeFile);
......@@ -146,48 +148,55 @@ void BedShuffle::ShuffleWithInclusions() {
void BedShuffle::ChooseLocus(BED &bedEntry) {
string chrom = bedEntry.chrom;
CHRPOS start = bedEntry.start;
CHRPOS end = bedEntry.end;
CHRPOS length = end - start;
string randomChrom;
CHRPOS randomStart;
CHRPOS chromSize;
string chrom = bedEntry.chrom;
CHRPOS start = bedEntry.start;
CHRPOS end = bedEntry.end;
CHRPOS length = end - start;
if (_sameChrom == false) {
randomChrom = _chroms[rand() % _numChroms];
chromSize = _genome->getChromSize(randomChrom);
randomStart = rand() % chromSize;
bedEntry.chrom = randomChrom;
bedEntry.start = randomStart;
bedEntry.end = randomStart + length;
// choose a position randomly among the _entire_ genome.
if (_chooseChrom == false)
{
do
{
// we need to combine two consective calls to rand()
// because RAND_MAX is 2^31 (2147483648), whereas
// mammalian genomes are obviously much larger.
uint32_t randStart = ((rand() << 31) | rand()) % _genomeSize;
// use the above randomStart (e.g., for human 0..3.1billion)
// to identify the chrom and start on that chrom.
pair<string, int> location = _genome->projectOnGenome(randStart);
bedEntry.chrom = location.first;
bedEntry.start = location.second;
bedEntry.end = bedEntry.start + length;
chromSize = _genome->getChromSize(location.first);
} while (bedEntry.end > chromSize);
// keep looking if we have exceeded the end of the chrom.
}
else {
chromSize = _genome->getChromSize(chrom);
randomStart = rand() % chromSize;
bedEntry.start = randomStart;
bedEntry.end = randomStart + length;
}
// ensure that the chosen location doesn't go past
// the length of the chromosome. if so, keep looking
// for a new spot.
while (bedEntry.end > chromSize) {
if (_sameChrom == false) {
randomChrom = _chroms[rand() % _numChroms];
chromSize = _genome->getChromSize(randomChrom);
randomStart = rand() % chromSize;
bedEntry.chrom = randomChrom;
bedEntry.start = randomStart;
bedEntry.end = randomStart + length;
}
else {
chromSize = _genome->getChromSize(chrom);
randomStart = rand() % chromSize;
bedEntry.start = randomStart;
bedEntry.end = randomStart + length;
}
// OLD, quite arguably flawed, method.
// 1. Choose a chrom randomly (i.e., not weighted by size)
// 2. Choose a position on that chrom randomly
else
{
do
{
if (_sameChrom == false) {
randomChrom = _chroms[rand() % _numChroms];
chromSize = _genome->getChromSize(randomChrom);
randomStart = rand() % chromSize;
bedEntry.chrom = randomChrom;
bedEntry.start = randomStart;
bedEntry.end = randomStart + length;
}
else {
chromSize = _genome->getChromSize(chrom);
randomStart = rand() % chromSize;
bedEntry.start = randomStart;
bedEntry.end = randomStart + length;
}
} while (bedEntry.end > chromSize);
}
}
......@@ -212,7 +221,7 @@ void BedShuffle::ChooseLocusFromInclusionFile(BED &bedEntry) {
// retreive a ranom -incl interval on the selected chrom
includeInterval = _include->bedMapNoBin[randomChrom][interval];
bedEntry.chrom = randomChrom;
bedEntry.chrom = randomChrom;
}
else {
// get the number of inclusion intervals for the original chrom
......
......@@ -21,6 +21,7 @@
#include <sys/time.h>
#include <unistd.h>
#include <sys/types.h>
#include <algorithm> // for binary search
using namespace std;
const int MAX_TRIES = 1000000;
......@@ -35,7 +36,7 @@ public:
// constructor
BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, string &includeFile,
bool haveSeed, bool haveExclude, bool haveInclude, bool sameChrom,
float overlapFraction, int seed);
float overlapFraction, int seed, bool chooseChrom);
// destructor
~BedShuffle(void);
......@@ -52,6 +53,7 @@ private:
bool _haveExclude;
bool _haveInclude;
bool _haveSeed;
bool _chooseChrom;
// The BED file from which to compute coverage.
......@@ -65,6 +67,7 @@ private:
int _numChroms;
vector<string> _includeChroms;
int _numIncludeChroms;
uint32_t _genomeSize;
// methods
void Shuffle();
......
......@@ -42,6 +42,7 @@ int shuffle_main(int argc, char* argv[]) {
float overlapFraction = 0.0;
int seed = -1;
bool sameChrom = false;
bool chooseChrom = false;
for(int i = 1; i < argc; i++) {
......@@ -97,6 +98,9 @@ int shuffle_main(int argc, char* argv[]) {
else if(PARAMETER_CHECK("-chrom", 6, parameterLength)) {
sameChrom = true;
}
else if(PARAMETER_CHECK("-chromFirst", 11, parameterLength)) {
chooseChrom = true;
}
else if(PARAMETER_CHECK("-f", 2, parameterLength)) {
if ((i+1) < argc) {
overlapFraction = atof(argv[i + 1]);
......@@ -119,11 +123,16 @@ int shuffle_main(int argc, char* argv[]) {
cerr << endl << "*****" << endl << "*****ERROR: Cannot use -incl and -excl together." << endl << "*****" << endl;
showHelp = true;
}
if (sameChrom && !chooseChrom) {
cerr << endl << "*****" << endl << "*****ERROR: Must use -chromThenStart with -chrom" << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedShuffle *bc = new BedShuffle(bedFile, genomeFile, excludeFile, includeFile,
haveSeed, haveExclude, haveInclude, sameChrom,
overlapFraction, seed);
overlapFraction, seed, chooseChrom);
delete bc;
return 0;
}
......@@ -147,10 +156,12 @@ void shuffle_help(void) {
cerr << "\t-incl\t" << "Instead of randomly placing features in a genome, the -incl" << endl;
cerr << "\t\toptions defines a BED/GFF/VCF file of coordinates in which " << endl;
cerr << "\t\tfeatures in -i should be randomly placed (e.g. genes.bed). " << endl << endl;
cerr << "\t\tfeatures in -i should be randomly placed (e.g. genes.bed). " << endl;
cerr << "\t\t- NOTE: must use with -chromThenStart." << endl << endl;
cerr << "\t-chrom\t" << "Keep features in -i on the same chromosome."<< endl;
cerr << "\t\t- By default, the chrom and position are randomly chosen." << endl << endl;
cerr << "\t\t- By default, the chrom and position are randomly chosen." << endl;
cerr << "\t\t- NOTE: must use with -chromThenStart." << endl << endl;
cerr << "\t-seed\t" << "Supply an integer seed for the shuffling." << endl;
cerr << "\t\t- By default, the seed is chosen automatically." << endl;
......@@ -164,6 +175,11 @@ void shuffle_help(void) {
cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl;
cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl;
cerr << "\t-chromFirst\t" << "\n\t\tInstead of choosing a position randomly among the entire" << endl;
cerr << "\t\tgenome, first choose a chrom randomly, and then" << endl;
cerr << "\t\tchoose a random start coordinate on that chrom." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl;
cerr << "\t <chromName><TAB><chromSize>" << endl << endl;
......
......@@ -68,6 +68,8 @@ void GenomeFile::loadGenomeFileIntoMap() {
int size = atoi(genomeFields[1].c_str());
_chromSizes[chrom] = size;
_chromList.push_back(chrom);
_startOffsets.push_back(_genomeLength);
_genomeLength += size;
}
}
else {
......@@ -81,8 +83,21 @@ void GenomeFile::loadGenomeFileIntoMap() {
}
}
int GenomeFile::getChromSize(const string &chrom) {
pair<string, uint32_t> GenomeFile::projectOnGenome(uint32_t genome_pos) {
// search for the chrom that the position belongs on.
// add 1 to genome position b/c of zero-based, half open.
vector<uint32_t>::const_iterator low =
lower_bound(_startOffsets.begin(), _startOffsets.end(), genome_pos + 1);
// use the iterator to identify the appropriate index
// into the chrom name and start vectors
int i = int(low-_startOffsets.begin());
string chrom = _chromList[i - 1];
uint32_t start = genome_pos - _startOffsets[i - 1];
return make_pair(chrom, start);
}
uint32_t GenomeFile::getChromSize(const string &chrom) {
chromToSizes::const_iterator chromIt = _chromSizes.find(chrom);
if (chromIt != _chromSizes.end())
return _chromSizes[chrom];
......@@ -90,6 +105,10 @@ int GenomeFile::getChromSize(const string &chrom) {
return -1; // chrom not found.
}
uint32_t GenomeFile::getGenomeSize(void) {
return _genomeLength;
}
vector<string> GenomeFile::getChromList() {
return _chromList;
}
......
......@@ -45,11 +45,15 @@ public:
// load a GENOME file into a map keyed by chrom. value is size of chrom.
void loadGenomeFileIntoMap();
pair<string, uint32_t> projectOnGenome(uint32_t genome_pos);
uint32_t getChromSize(const string &chrom); // return the size of a chromosome
uint32_t getGenomeSize(void); // return the total size of the geonome
vector<string> getChromList(); // return a list of chrom names
int getNumberOfChroms(); // return the number of chroms
string getGenomeFileName(); // return the name of the genome file
int getChromSize(const string &chrom); // return the size of a chromosome
vector<string> getChromList(); // return a list of chrom names
int getNumberOfChroms(); // return the number of chroms
string getGenomeFileName(); // return the name of the genome file
......@@ -57,6 +61,11 @@ private:
string _genomeFile;
chromToSizes _chromSizes;
vector<string> _chromList;
// projecting chroms into a single coordinate system
uint32_t _genomeLength;
vector<uint32_t> _startOffsets;
};
#endif /* GENOMEFILE_H */
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment