Skip to content
Snippets Groups Projects
Commit 35ecc09c authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Initial BEDTools Commit

parents
No related branches found
No related tags found
No related merge requests found
Showing
with 598 additions and 0 deletions
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
CXX = g++
CXXFLAGS = -O3
LDFLAGS = -m64
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= complementMain.cpp complementBed.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= complementBed
LIBS=
all: $(PROGRAM)
.PHONY: all
$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS)
@echo " * linking $(PROGRAM)"
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ $^ $(LIBS)
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS)
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
.PHONY: clean
\ No newline at end of file
#include "complementBed.h"
//==========================
// Constructor
//
BedComplement::BedComplement(string &bedFile, string &genomeFile) {
this->bedFile = bedFile;
this->genomeFile = genomeFile;
this->bed = new BedFile(bedFile);
}
//
// Destructor
//
BedComplement::~BedComplement(void) {
}
//
// Merge overlapping BED entries into a single entry
//
void BedComplement::ComplementBed() {
// open the GENOME file for reading
ifstream genome(this->genomeFile.c_str(), ios::in);
if ( !genome ) {
cerr << "Error: The requested genome file (" <<this->genomeFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
string chrom;
unsigned int size;
map<string, int, less<string> > chromSizes;
while (genome >> chrom >> size) {
chromSizes[chrom] = size;
}
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bed->loadBedFileIntoMapNoBin();
string currChrom;
// loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) {
currChrom = m->first;
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
int minStart = INT_MAX;
int maxEnd = 0;
bool OIP = false; // OIP = Overlap In Progress. Lame, I realize.
int i;
int mergeCount = 1;
bool firstSegment = true;
// loop through the BED entries for this chromosome
// and look for overlaps
for (i = 1; i < bedList.size(); ++i) {
// Are the current and previous entries within the maximum distance allowed by the user?
// By default, maxDistance is 0, which allows for the following to be reported:
// OVERLAP BOOK-END
// ******** ***************
// ******* ***************
// Ans. ************** ******************************
if ( overlaps(bedList[i-1].start, bedList[i-1].end,
bedList[i].start, bedList[i].end)
>= 0 ) {
OIP = true;
mergeCount++;
minStart = min(bedList[i-1].start, min(minStart, bedList[i].start));
maxEnd = max(bedList[i-1].end, max(maxEnd, bedList[i].end));
}
else if ( overlaps(minStart, maxEnd,
bedList[i].start, bedList[i].end)
>= 0 ) {
mergeCount++;
minStart = min(minStart, bedList[i].start);
maxEnd = max(maxEnd, bedList[i].end);
}
// either an overlap was broken or we have an
// interstice between two non-overlapping features
else {
// was there an overlap before the current entry broke it?
// if so, report the complement as the position after the end of the
// merged segment to the position before the start of the current segment.
// For example:
// ***************** *******************
// ******************
// Complement: !!!!!!!!!!
if (OIP) {
if (firstSegment) {
cout << currChrom << "\t" << 0 << "\t" << minStart << endl;
firstSegment = false;
}
cout << currChrom << "\t" << maxEnd << "\t" << bedList[i].start << endl;
}
// otherwise report the interstice between the current and previous feature.
// For example:
// ***************** *******************
// Complement: !!!!!!!!!!!!!!!!!!
else {
if (firstSegment) {
cout << currChrom << "\t" << 0 << "\t" << bedList[i-1].start << endl;
firstSegment = false;
}
cout << currChrom << "\t" << bedList[i-1].end << "\t" << bedList[i].start << endl;
}
// reset things for the next overlapping "block"
OIP = false;
mergeCount = 1;
minStart = INT_MAX;
maxEnd = 0;
}
}
// clean up based on the last entry for the current chromosome
// same as above code...
if (OIP) {
if (firstSegment) {
cout << currChrom << "\t" << 0 << "\t" << minStart << endl;
firstSegment = false;
}
cout << currChrom << "\t" << maxEnd << "\t" << chromSizes[currChrom] << endl;
}
else {
if (firstSegment) {
cout << currChrom << "\t" << 0 << "\t" << bedList[i-1].start-1 << endl;
firstSegment = false;
}
cout << currChrom << "\t" << bedList[i-1].end << "\t" << chromSizes[currChrom] << endl;
}
}
}
#include "bedFile.h"
#include <vector>
#include <algorithm>
#include <map>
#include <list>
#include <iostream>
#include <fstream>
#include <limits>
using namespace std;
//***********************************************
// Typedefs
//***********************************************
typedef list<BED> bedList;
//************************************************
// Class methods and elements
//************************************************
class BedComplement {
public:
// constructor
BedComplement(string &, string &);
// destructor
~BedComplement(void);
void ComplementBed();
private:
string bedFile;
string genomeFile;
// instance of a bed file class.
BedFile *bed;
};
#include <iostream>
#include "complementBed.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "complementBed"
// define the version
#define VERSION "1.1.0"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void ShowHelp(void);
int main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedFile;
string genomeFile;
bool haveBed = false;
bool haveGenome = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-h", 2, parameterLength) ||
PARAMETER_CHECK("--help", 5, parameterLength)) {
showHelp = true;
}
}
if(showHelp) ShowHelp();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
haveBed = true;
bedFile = argv[i + 1];
i++;
}
else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
haveGenome = true;
genomeFile = argv[i + 1];
i++;
}
else {
cout << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBed || !haveGenome) {
cout << endl << "*****" << endl << "*****ERROR: Need -i BED file and -g Genome file. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedComplement *bc = new BedComplement(bedFile, genomeFile);
bc->ComplementBed();
return 0;
}
else {
ShowHelp();
}
}
void ShowHelp(void) {
cout << "=======================================================" << endl;
cout << PROGRAM_NAME << " v" << VERSION << endl ;
cout << "Aaron Quinlan, Ph.D." << endl;
cout << "aaronquinlan@gmail.com" << endl ;
cout << "Hall Laboratory" << endl;
cout << "Biochemistry and Molecular Genetics" << endl;
cout << "University of Virginia" << endl;
cout << "=======================================================" << endl << endl;
cout << "Description: Returns the base pair complement of a BED file." << endl << endl;
cout << "***NOTE: Only BED3 - BED6 formats allowed.***"<< endl << endl;
cout << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input.bed>" << endl << endl;
//cout << "OPTIONS: " << endl;
//cout << "\t" << "-n\t\t\t" << "Report the number of BED entries that were merged. (=1 if no merging occured)" << endl;
//cout << "\t" << "-d\t\t\t" << "Maximum distance between features that will be merged. (Default is 0)" << endl;
//cout << "\t\t\t\t" << "For example, \"-d 50\" will merge features that are <= 50 bp apart." << endl << endl;
// end the program here
exit(1);
}
CXX = g++
CXXFLAGS = -O3
LDFLAGS = -m64
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= coverageMain.cpp coverageBed.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= coverageBed
LIBS=
all: $(PROGRAM)
.PHONY: all
$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS)
@echo " * linking $(PROGRAM)"
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ $^ $(LIBS)
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS)
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
.PHONY: clean
#include "graphBed.h"
BedGraph::BedGraph(string &bedAFile, string &bedBFile) {
this->bedAFile = bedAFile;
this->bedBFile = bedBFile;
this->bedA = new BedFile(bedAFile);
this->bedB = new BedFile(bedBFile);
}
BedGraph::~BedGraph(void) {
}
void BedGraph::GraphBed() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bedB->loadBedFileIntoMap();
string bedLine;
BED bedEntry;
int lineNum = 0;
ifstream bed(bedA->bedFile.c_str(), ios::in);
if ( !bed ) {
cerr << "Error: The requested bed file (" <<bedA->bedFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
BED a;
while (getline(bed, bedLine)) {
vector<string> bedFields;
Tokenize(bedLine,bedFields);
lineNum++;
// process the feature in A IFF it's valid.
if (bedA->parseBedLine(a, bedFields, lineNum)) {
// increment the count of overlaps for each feature in B that
// overlaps the current A interval
bedB->countHits(bedB->bedMap[a.chrom], a.start, a.end);
}
}
// now, report the count of hist for each feature in B.
for (masterBedMap::iterator c = bedB->bedMap.begin(); c != bedB->bedMap.end(); ++c) {
map<int, vector<BED> > bin2Beds = c->second;
for (map<int, vector<BED> >::iterator b = bin2Beds.begin(); b != bin2Beds.end(); ++b) {
vector<BED> beds = b->second;
for (int i = 0; i < beds.size(); i++) {
cout << c->first << "\t" << beds[i].start << "\t" << beds[i].end << "\t" << beds[i].count << endl;
}
}
}
}
#ifndef GRAPHBED_H
#define GRAPHBED_H
#include "bedFile.h"
#include <vector>
#include <map>
#include <list>
#include <iostream>
#include <fstream>
using namespace std;
//***********************************************
// Typedefs
//***********************************************
typedef list<BED> bedList;
//************************************************
// Class methods and elements
//************************************************
class BedGraph {
public:
// constructor
BedGraph(string &, string &);
// destructor
~BedGraph(void);
void GraphBed();
private:
string bedAFile;
string bedBFile;
// instance of a bed file class.
BedFile *bedA, *bedB;
};
#endif /* GRAPHBED_H */
#include "graphBed.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "graphBed"
// define the version
#define VERSION "1.1.0"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void ShowHelp(void);
int main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedAFile;
string bedBFile;
bool haveBedA = false;
bool haveBedB = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-h", 2, parameterLength) ||
PARAMETER_CHECK("--help", 5, parameterLength)) {
showHelp = true;
}
}
if(showHelp) ShowHelp();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-a", 2, parameterLength)) {
haveBedA = true;
bedAFile = argv[i + 1];
i++;
}
else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
haveBedB = true;
bedBFile = argv[i + 1];
i++;
}
else {
cout << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBedA || !haveBedB) {
cout << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedGraph *bg = new BedGraph(bedAFile, bedBFile);
bg->GraphBed();
return 0;
}
else {
ShowHelp();
}
}
void ShowHelp(void) {
cout << "=======================================================" << endl;
cout << PROGRAM_NAME << " v" << VERSION << endl ;
cout << "Aaron Quinlan, Ph.D." << endl;
cout << "aaronquinlan@gmail.com" << endl ;
cout << "Hall Laboratory" << endl;
cout << "Biochemistry and Molecular Genetics" << endl;
cout << "University of Virginia" << endl;
cout << "=======================================================" << endl << endl;
//cout << "Description: Reportoverlaps between a.bed and b.bed." << endl << endl;
cout << "***NOTE: Only BED3 - BED6 formats allowed.***"<< endl << endl;
cout << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl;
//cout << "OPTIONS: " << endl;
//cout << "\t" << "-u\t\t\t" << "Write ORIGINAL a.bed entry ONCE if ANY overlap bed." << endl << "\t\t\t\tIn other words, ignore multiple overlaps." << endl << endl;
//cout << "\t" << "-v \t\t\t" << "Only report those entries in A that have NO OVERLAP in B." << endl << "\t\t\t\tSimilar to grep -v." << endl << endl;
//cout << "\t" << "-f (e.g. 0.05)\t\t" << "Minimum overlap req'd as fraction of a.bed." << endl << "\t\t\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl;
//cout << "\t" << "-c \t\t\t" << "For each entry in A, report the number of hits in B while restricting to -f." << endl << "\t\t\t\tReports 0 for A entries that have no overlap with B." << endl << endl;
//cout << "\t" << "-wb \t\t\t" << "Write the entry in B for each overlap." << endl << "\t\t\t\tUseful for knowing _what_ A overlaps. Restricted by -f." << endl << endl;
// end the program here
exit(1);
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment