Skip to content
Snippets Groups Projects
Commit 425bdeaa authored by Aaron's avatar Aaron
Browse files

added new jaccard tool. see PMID: 22693437

parent 1549933f
No related branches found
No related tags found
No related merge requests found
......@@ -43,6 +43,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/getOverlap \
$(SRC_DIR)/groupBy \
$(SRC_DIR)/intersectBed \
$(SRC_DIR)/jaccard \
$(SRC_DIR)/linksBed \
$(SRC_DIR)/maskFastaFromBed \
$(SRC_DIR)/mapBed \
......
......@@ -52,6 +52,7 @@ int genomecoverage_main(int argc, char* argv[]);//
int getoverlap_main(int argc, char* argv[]);//
int groupby_main(int argc, char* argv[]);//
int intersect_main(int argc, char* argv[]); //
int jaccard_main(int argc, char* argv[]); //
int links_main(int argc, char* argv[]);//
int maskfastafrombed_main(int argc, char* argv[]);//
int map_main(int argc, char* argv[]); //
......@@ -123,6 +124,9 @@ int main(int argc, char *argv[])
else if (sub_cmd == "maskfasta") return maskfastafrombed_main(argc-1, argv+1);
else if (sub_cmd == "nuc") return nuc_main(argc-1, argv+1);
// statistics tools
else if (sub_cmd == "jaccard") return jaccard_main(argc-1, argv+1);
// misc. tools
else if (sub_cmd == "overlap") return getoverlap_main(argc-1, argv+1);
else if (sub_cmd == "igv") return bedtoigv_main(argc-1, argv+1);
......@@ -131,6 +135,8 @@ int main(int argc, char *argv[])
else if (sub_cmd == "groupby") return groupby_main(argc-1, argv+1);
else if (sub_cmd == "expand") return expand_main(argc-1, argv+1);
// help
else if (sub_cmd == "-h" || sub_cmd == "--help" ||
sub_cmd == "-help")
......@@ -222,6 +228,10 @@ int bedtools_help(void)
cout << "[ BAM focused tools ]" << endl;
cout << " multicov " << "Counts coverage from multiple BAMs at specific intervals.\n";
cout << " tag " << "Tag BAM alignments based on overlaps with interval files.\n";
cout << endl;
cout << "[ Statistics tools ]" << endl;
cout << " jaccard " << "Calculates the Jaccard statistic b/w two sets of intervals.\n";
cout << endl;
cout << "[ Miscellaneous tools ]" << endl;
......
......@@ -288,7 +288,7 @@ void BedIntersect::IntersectBed() {
ChromSweep sweep = ChromSweep(_bedA, _bedB,
_sameStrand, _diffStrand,
_overlapFraction, _reciprocal,
_printHeader);
false, _printHeader);
pair<BED, vector<BED> > hit_set;
hit_set.second.reserve(10000);
......
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/genomeFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/chromsweep \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= jaccardMain.cpp jaccard.cpp jaccard.h
OBJECTS= jaccardMain.o jaccard.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= jaccard
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/jaccardMain.o $(OBJ_DIR)/jaccard.o
.PHONY: clean
/*****************************************************************************
jaccard.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "jaccard.h"
/************************************
Helper functions
************************************/
size_t Jaccard::GetTotalIntersection(const BED &a, const vector<BED> &hits)
{
size_t intersection = 0;
vector<BED>::const_iterator h = hits.begin();
vector<BED>::const_iterator hitsEnd = hits.end();
for (; h != hitsEnd; ++h) {
CHRPOS s = max(a.start, h->start);
CHRPOS e = min(a.end, h->end);
intersection += (e - s);
}
return intersection;
}
/*
Constructor
*/
Jaccard::Jaccard(string bedAFile, string bedBFile,
float overlapFraction, bool reciprocal)
{
_bedAFile = bedAFile;
_bedBFile = bedBFile;
_overlapFraction = overlapFraction;
_reciprocal = reciprocal;
CalculateJaccard();
}
/*
Destructor
*/
Jaccard::~Jaccard(void) {
}
unsigned long Jaccard::GetUnion() {
// create new BED file objects for A and B
_bedA = new BedFile(_bedAFile);
_bedB = new BedFile(_bedBFile);
unsigned long U = 0;
BED bed;
_bedA->Open();
while (_bedA->GetNextMergedBed(bed)) {
U += bed.end - bed.start;
}
_bedB->Open();
while (_bedB->GetNextMergedBed(bed)) {
U += bed.end - bed.start;
}
return U;
}
unsigned long Jaccard::GetIntersection() {
_bedA = new BedFile(_bedAFile);
_bedB = new BedFile(_bedBFile);
unsigned long I = 0;
ChromSweep sweep = ChromSweep(_bedA, _bedB,
false, false,
_overlapFraction, _reciprocal,
true, false);
pair<BED, vector<BED> > hit_set;
hit_set.second.reserve(10000);
while (sweep.Next(hit_set)) {
I += GetTotalIntersection(hit_set.first, hit_set.second);
}
return I;
}
void Jaccard::CalculateJaccard() {
unsigned long U = GetUnion();
delete _bedA;
delete _bedB;
unsigned long I = GetIntersection();
// header
cout << "intersection\t"
<< "union\t"
<< "jaccard"
<< endl;
// result
cout << I << "\t"
<< U - I << "\t"
<< (float) I / ((float) U - (float) I)
<< endl;
}
/*****************************************************************************
jaccard.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef JACCARD_H
#define JACCARD_H
#include "bedFile.h"
#include "chromsweep.h"
#include "api/BamReader.h"
#include "api/BamAux.h"
#include "BlockedIntervals.h"
#include "BamAncillary.h"
using namespace BamTools;
#include <vector>
#include <iostream>
#include <fstream>
#include <stdlib.h>
using namespace std;
class Jaccard {
public:
// constructor
Jaccard(string bedAFile, string bedBFile,
float overlapFraction, bool reciprocal);
// destructor
~Jaccard(void);
private:
//------------------------------------------------
// private attributes
//------------------------------------------------
string _bedAFile;
string _bedBFile;
bool _reciprocal;
float _overlapFraction;
// instance of a bed file class.
BedFile *_bedA, *_bedB;
//------------------------------------------------
// private methods
//------------------------------------------------
unsigned long GetIntersection();
unsigned long GetUnion();
void CalculateJaccard();
size_t GetTotalIntersection(const BED &a, const vector<BED> &hits);
};
#endif /* JACCARD_H */
/*****************************************************************************
jaccardMain.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "jaccard.h"
#include "version.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "bedtools jaccard"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void jaccard_help(void);
int jaccard_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedAFile;
string bedBFile;
// input arguments
float overlapFraction = 1E-9;
bool haveBedA = false;
bool haveBedB = false;
bool haveFraction = false;
bool reciprocalFraction = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) jaccard_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-a", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedA = true;
bedAFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedB = true;
bedBFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-f", 2, parameterLength)) {
if ((i+1) < argc) {
haveFraction = true;
overlapFraction = atof(argv[i + 1]);
i++;
}
}
else if(PARAMETER_CHECK("-r", 2, parameterLength)) {
reciprocalFraction = true;
}
else {
cerr << endl
<< "*****ERROR: Unrecognized parameter: "
<< argv[i]
<< " *****"
<< endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBedA || !haveBedB) {
cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
showHelp = true;
}
if (reciprocalFraction && !haveFraction) {
cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
Jaccard *j = new Jaccard(bedAFile, bedBFile,
overlapFraction, reciprocalFraction);
delete j;
return 0;
}
else {
jaccard_help();
return 0;
}
}
void jaccard_help(void) {
cerr << "\nTool: bedtools jaccard (aka jaccard)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Calculate Jaccard statistic b/w two feature files."
<< endl
<< "\t Jaccard is the length of the intersection over the union."
<< endl
<< "\t Values range from 0 (no intersection) to 1 (self intersection)."
<< endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl;
cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl;
cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl;
cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl;
cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl;
cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) Input files must be sorted by chrom, then start position."
<< endl << endl;
// end the program here
exit(1);
}
......@@ -53,7 +53,7 @@ void BedMap::Map() {
ChromSweep sweep = ChromSweep(_bedA, _bedB,
_sameStrand, _diffStrand,
_overlapFraction, _reciprocal,
_printHeader);
false, _printHeader);
pair<BED, vector<BED> > hit_set;
hit_set.second.reserve(10000);
......
......@@ -115,6 +115,9 @@ void BedFile::Open(void) {
// Rewind the pointer back to the beginning of the file
void BedFile::Rewind(void) {
_bedStream->seekg(0, ios::beg);
_prev_start = -1;
_prev_chrom = "";
}
// Jump to a specific byte in the file
......@@ -129,7 +132,8 @@ bool BedFile::Empty(void) {
// Close the BED file
void BedFile::Close(void) {
if (bedFile != "stdin" && bedFile != "-") delete _bedStream;
if (bedFile != "stdin" && bedFile != "-")
delete _bedStream;
}
void BedFile::GetLine(void) {
......@@ -269,15 +273,18 @@ bool BedFile::GetNextMergedBed(BED &merged_bed) {
}
}
}
// handle the last merged block in the file.
if (_status == BED_INVALID)
{
_status = BED_VALID;
merged_bed.chrom = _merged_chrom;
merged_bed.start = _merged_start;
merged_bed.end = _merged_end;
return true;
}
}
_status = BED_INVALID;
return false;
}
......
......@@ -21,7 +21,7 @@ bool after(const BED &a, const BED &b);
ChromSweep::ChromSweep(BedFile *query, BedFile *db,
bool sameStrand, bool diffStrand,
float overlapFraction, bool reciprocal,
bool printHeader)
bool useMergedIntervals, bool printHeader)
: _query(query)
......@@ -30,6 +30,7 @@ ChromSweep::ChromSweep(BedFile *query, BedFile *db,
, _sameStrand(sameStrand)
, _diffStrand(diffStrand)
, _reciprocal(reciprocal)
, _useMergedIntervals(useMergedIntervals)
{
_hits.reserve(100000);
......@@ -37,8 +38,8 @@ ChromSweep::ChromSweep(BedFile *query, BedFile *db,
if (printHeader) _query->PrintHeader();
_db->Open();
_query->GetNextBed(_curr_qy);
_db->GetNextBed(_curr_db);
NextQuery();
NextDatabase();
}
/*
......@@ -54,11 +55,25 @@ ChromSweep::ChromSweep(string &queryFile, string &dbFile)
_query->Open();
_db->Open();
_query->GetNextBed(_curr_qy);
_db->GetNextBed(_curr_db);
NextQuery();
NextDatabase();
}
bool ChromSweep::NextQuery() {
if (!_useMergedIntervals)
return _query->GetNextBed(_curr_qy, true);
else
return _query->GetNextMergedBed(_curr_qy);
}
bool ChromSweep::NextDatabase() {
if (!_useMergedIntervals)
return _db->GetNextBed(_curr_db, true);
else
return _db->GetNextMergedBed(_curr_db);
}
/*
Destructor
*/
......@@ -92,7 +107,8 @@ bool ChromSweep::ChromChange()
// the query is ahead of the database.
// fast-forward the database to catch-up.
else if ((_curr_qy.chrom > _curr_db.chrom) && (!_db->Empty())) {
while (_db->GetNextBed(_curr_db, true) &&
while (NextDatabase() &&
_curr_db.chrom < _curr_qy.chrom)
{
}
......@@ -114,7 +130,7 @@ bool ChromSweep::ChromChange()
_results.push(make_pair(_curr_qy, _no_hits));
_cache.clear();
}
_query->GetNextBed(_curr_qy, true);
NextQuery();
_curr_chrom = _curr_qy.chrom;
return true;
}
......@@ -168,13 +184,13 @@ bool ChromSweep::Next(pair<BED, vector<BED> > &next) {
_hits.push_back(_curr_db);
}
_cache.push_back(_curr_db);
_db->GetNextBed(_curr_db, true);
NextDatabase();
}
// add the hits for this query to the pump
_results.push(make_pair(_curr_qy, _hits));
// reset for the next query
_hits.clear();
_query->GetNextBed(_curr_qy, true);
NextQuery();
_curr_chrom = _curr_qy.chrom;
}
}
......
......@@ -34,7 +34,7 @@ public:
ChromSweep(BedFile *query, BedFile *db,
bool sameStrand = false, bool diffStrand = false,
float overlapFraction = 0.0, bool reciprocal = false,
bool printHeader = false);
bool useMergedIntervals = false, bool printHeader = false);
// constructor using filenames
ChromSweep(string &queryFile, string &dbFile);
......@@ -64,7 +64,12 @@ private:
BedFile *_query, *_db;
float _overlapFraction;
// do we care about strandedness.
bool _sameStrand, _diffStrand, _reciprocal;
bool _sameStrand;
bool _diffStrand;
// do we care about reciprocal overlap?
bool _reciprocal;
// should we merge overlapping intervals before computing overlaps?
bool _useMergedIntervals;
/*
a cache of still active features from the database file\
......@@ -89,8 +94,10 @@ private:
// private methods.
private:
void ScanCache();
void ScanCache();
bool ChromChange();
bool NextQuery();
bool NextDatabase();
bool IsValidHit(const BED &query, const BED &db);
};
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment