Skip to content
Snippets Groups Projects
Commit a3c9136d authored by Aaron's avatar Aaron
Browse files

new tool: bedtools random

parent 0e3768eb
No related branches found
No related tags found
No related merge requests found
......@@ -41,6 +41,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/nucBed \
$(SRC_DIR)/pairToBed \
$(SRC_DIR)/pairToPair \
$(SRC_DIR)/randomBed \
$(SRC_DIR)/shuffleBed \
$(SRC_DIR)/slopBed \
$(SRC_DIR)/sortBed \
......
......@@ -34,6 +34,7 @@ def main():
'overlap': 'getOverlap',
'pairtobed': 'pairToBed',
'pairtopair': 'pairToPair',
'random': 'randomBed',
'shuffle': 'shuffleBed',
'slop': 'slopBed',
'sort': 'sortBed',
......
......@@ -59,6 +59,7 @@ int multiintersect_main(int argc, char* argv[]);//
int nuc_main(int argc, char* argv[]);//
int pairtobed_main(int argc, char* argv[]);//
int pairtopair_main(int argc, char* argv[]);//
int random_main(int argc, char* argv[]); //
int shuffle_main(int argc, char* argv[]); //
int slop_main(int argc, char* argv[]); //
int sort_main(int argc, char* argv[]); //
......@@ -92,6 +93,7 @@ int main(int argc, char *argv[])
else if (sub_cmd == "slop") return slop_main(argc-1, argv+1);
else if (sub_cmd == "flank") return flank_main(argc-1, argv+1);
else if (sub_cmd == "sort") return sort_main(argc-1, argv+1);
else if (sub_cmd == "random") return random_main(argc-1, argv+1);
else if (sub_cmd == "shuffle") return shuffle_main(argc-1, argv+1);
else if (sub_cmd == "annotate") return annotate_main(argc-1, argv+1);
......@@ -184,6 +186,7 @@ int bedtools_help(void)
cout << " slop " << "Adjust the size of intervals.\n";
cout << " flank " << "Create new intervals from the flanks of existing intervals.\n";
cout << " sort " << "Order the intervals in a file.\n";
cout << " random " << "Generate random intervals in a genome.\n";
cout << " shuffle " << "Randomly redistrubute intervals in a genome.\n";
cout << " annotate " << "Annotate coverage of features from multiple files.\n";
......
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/genomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= randomBedMain.cpp randomBed.cpp randomBed.h
OBJECTS= randomBedMain.o randomBed.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/randomBedMain.o $(OBJ_DIR)/randomBed.o
.PHONY: clean
/*****************************************************************************
randomBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "randomBed.h"
BedRandom::BedRandom(string &genomeFile, uint32_t numToGenerate, int seed,
bool haveSeed, uint32_t length) {
_genomeFile = genomeFile;
_numToGenerate = numToGenerate;
_length = length;
_haveSeed = haveSeed;
_seed = seed;
// use the supplied seed for the random
// number generation if given. else,
// roll our own.
if (_haveSeed) {
_seed = seed;
srand(seed);
}
else {
// thanks to Rob Long for the tip.
_seed = (unsigned)time(0)+(unsigned)getpid();
srand(_seed);
}
Generate();
}
BedRandom::~BedRandom(void)
{}
void BedRandom::Generate()
{
_genome = new GenomeFile(_genomeFile);
uint32_t genomeSize = _genome->getGenomeSize();
string chrom;
uint32_t start;
uint32_t end;
char strand;
uint32_t chromSize;
uint32_t numGenerated = 0;
while (numGenerated < _numToGenerate)
{
do
{
// we need to combine two consective calls to rand()
// because RAND_MAX is 2^31 (2147483648), whereas
// mammalian genomes are obviously much larger.
uint32_t randStart = ((rand() << 31) | rand()) % genomeSize;
// use the above randomStart (e.g., for human 0..3.1billion)
// to identify the chrom and start on that chrom.
pair<string, int> location = _genome->projectOnGenome(randStart);
chrom = location.first;
start = location.second;
end = start + _length;
chromSize = _genome->getChromSize(location.first);
// keep looking if we have exceeded the end of the chrom.
} while (end > chromSize);
numGenerated++;
// flip a coin for strand
(rand() / double(RAND_MAX)) > 0.5 ? strand = '+' : strand = '-';
printf("%s\t%d\t%d\t%d\t%d\t%c\n", chrom.c_str(), start, end, numGenerated, end-start, strand);
}
}
/*****************************************************************************
randomBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "genomeFile.h"
#include <vector>
#include <iostream>
#include <fstream>
#include <map>
#include <cstdlib>
#include <ctime>
#include <sys/time.h>
#include <unistd.h>
#include <sys/types.h>
#include <algorithm> // for binary search
using namespace std;
const int MAX_TRIES = 1000000;
//************************************************
// Class methods and elements
//************************************************
class BedRandom {
public:
// constructor
BedRandom(string &genomeFile, uint32_t numToGenerate, int seed,
bool haveSeed, uint32_t length);
// destructor
~BedRandom(void);
private:
string _genomeFile;
int _seed;
bool _haveSeed;
GenomeFile *_genome;
uint32_t _length;
uint32_t _numToGenerate;
// methods
void Generate();
};
/*****************************************************************************
randomBedMain.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "randomBed.h"
#include "version.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "bedtools random"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void random_help(void);
int random_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedFile = "stdin";
string genomeFile;
bool haveGenome = false;
bool haveSeed = false;
int seed = -1;
int length = 100;
int numToGenerate = 1000000;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) random_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-g", 2, parameterLength)) {
if ((i+1) < argc) {
haveGenome = true;
genomeFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-seed", 5, parameterLength)) {
if ((i+1) < argc) {
haveSeed = true;
seed = atoi(argv[i + 1]);
i++;
}
}
else if(PARAMETER_CHECK("-l", 2, parameterLength)) {
if ((i+1) < argc) {
length = atoi(argv[i + 1]);
i++;
}
}
else if(PARAMETER_CHECK("-n", 2, parameterLength)) {
if ((i+1) < argc) {
numToGenerate = atoi(argv[i + 1]);
i++;
}
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveGenome) {
cerr << endl << "*****" << endl << "*****ERROR: Need a genome (-g) file. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedRandom *br = new BedRandom(genomeFile, numToGenerate, seed, haveSeed, length);
delete br;
return 0;
}
else {
random_help();
}
return 0;
}
void random_help(void) {
cerr << "\nTool: bedtools random (aka randomBed)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Generate random intervals among a genome." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -g <genome>" << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-l\t" << "The length of the intervals to generate." << endl;
cerr << "\t\t- Default = 100." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-n\t" << "The number of intervals to generate." << endl;
cerr << "\t\t- Default = 1,000,000." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-n\t" << "Supply an integer seed for the shuffling." << endl;
cerr << "\t\t- By default, the seed is chosen automatically." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-seed\t" << "Supply an integer seed for the shuffling." << endl;
cerr << "\t\t- By default, the seed is chosen automatically." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl;
cerr << "\t <chromName><TAB><chromSize>" << endl << endl;
cerr << "\tFor example, Human (hg19):" << endl;
cerr << "\tchr1\t249250621" << endl;
cerr << "\tchr2\t243199373" << endl;
cerr << "\t..." << endl;
cerr << "\tchr18_gl000207_random\t4262" << endl << endl;
cerr << "Tips: " << endl;
cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl;
cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl;
cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \\" << endl;
cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl;
// end the program here
exit(1);
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment