Skip to content
Snippets Groups Projects
Commit 6869dc73 authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Added additional sorting options to sortBed

parent b5265448
No related branches found
No related tags found
No related merge requests found
......@@ -48,7 +48,7 @@ int main(int argc, char* argv[]) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
haveBed = true;
bedFile = argv[i + 1];
i++;
......@@ -57,25 +57,25 @@ int main(int argc, char* argv[]) {
numEntries = true;
i++;
}
else if(PARAMETER_CHECK("-d", 2, parameterLength)) {
haveMaxDistance = true;
maxDistance = atoi(argv[i + 1]);
i++;
}
else if(PARAMETER_CHECK("-d", 2, parameterLength)) {
haveMaxDistance = true;
maxDistance = atoi(argv[i + 1]);
i++;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBed) {
cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl;
showHelp = true;
cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance);
BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance);
bm->MergeBed();
return 0;
}
......@@ -85,7 +85,7 @@ int main(int argc, char* argv[]) {
}
void ShowHelp(void) {
cerr << "===============================================" << endl;
cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ;
cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ;
......@@ -94,7 +94,7 @@ void ShowHelp(void) {
cerr << "Description: Merges overlapping bed entries into a sinle interval." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input.bed>" << endl << endl;
cerr << "OPTIONS: " << endl;
cerr << "\t" << "-n\t\t\t" << "Report the number of BED entries that were merged. (=1 if no merging occured)" << endl << endl;
cerr << "\t" << "-d\t\t\t" << "Maximum distance between features allowed for features to be merged. (Default=0)" << endl << endl;
......@@ -105,5 +105,5 @@ void ShowHelp(void) {
// end the program here
exit(1);
}
......@@ -24,9 +24,32 @@ BedSort::BedSort(string &bedFile) {
// Destructor. The body is empty: no cleanup is performed here.
BedSort::~BedSort() {
}
//
// Merge overlapping BED entries into a single entry
//
/*
    reportBed

    Writes BED entry `a` to stdout as tab-separated columns, without a
    trailing newline (callers append their own line terminator).

    The number of columns follows bed->bedType:
        3 -> chrom, start, end
        4 -> ... plus name
        5 -> ... plus score
        6 -> ... plus strand
    Any other bedType writes nothing at all.
*/
void BedSort::reportBed(const BED &a) {

    // Only BED3 - BED6 are reported; everything else is silently skipped.
    if (bed->bedType < 3 || bed->bedType > 6) {
        return;
    }

    // The three mandatory columns are always present.
    cout << a.chrom << "\t" << a.start << "\t" << a.end;

    // Each wider format appends exactly one more column.
    if (bed->bedType >= 4) {
        cout << "\t" << a.name;
    }
    if (bed->bedType >= 5) {
        cout << "\t" << a.score;
    }
    if (bed->bedType >= 6) {
        cout << "\t" << a.strand;
    }
}
void BedSort::SortBed() {
// load the "B" bed file into a map so
......@@ -40,8 +63,169 @@ void BedSort::SortBed() {
vector<BED> bedList = m->second;
for (unsigned int i = 0; i < bedList.size(); ++i) {
cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl;
reportBed(bedList[i]); cout << "\n";
///cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl;
}
}
}
void BedSort::SortBedBySizeAsc() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bed->loadBedFileIntoMapNoBin();
vector<BED> masterList;
masterList.reserve(1000000);
// loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
// add the entries from this chromosome to the current list
for (unsigned int i = 0; i < m->second.size(); ++i) {
masterList.push_back(m->second[i]);
}
}
// sort the master list by size (asc.)
sort(masterList.begin(), masterList.end(), sortBySizeAsc);
// report the entries in ascending order
for (unsigned int i = 0; i < masterList.size(); ++i) {
reportBed(masterList[i]); cout << "\n";
//cout << masterList[i].chrom << "\t" << masterList[i].start << "\t" << masterList[i].end << endl;
}
}
void BedSort::SortBedBySizeDesc() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bed->loadBedFileIntoMapNoBin();
vector<BED> masterList;
masterList.reserve(1000000);
// loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
// add the entries from this chromosome to the current list
for (unsigned int i = 0; i < m->second.size(); ++i) {
masterList.push_back(m->second[i]);
}
}
// sort the master list by size (asc.)
sort(masterList.begin(), masterList.end(), sortBySizeDesc);
// report the entries in ascending order
for (unsigned int i = 0; i < masterList.size(); ++i) {
reportBed(masterList[i]); cout << "\n";
//cout << masterList[i].chrom << "\t" << masterList[i].start << "\t" << masterList[i].end << endl;
}
}
void BedSort::SortBedByChromThenSizeAsc() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
sort(bedList.begin(), bedList.end(), sortBySizeAsc);
for (unsigned int i = 0; i < bedList.size(); ++i) {
reportBed(bedList[i]); cout << "\n";
//cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl;
}
}
}
void BedSort::SortBedByChromThenSizeDesc() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
sort(bedList.begin(), bedList.end(), sortBySizeDesc);
for (unsigned int i = 0; i < bedList.size(); ++i) {
reportBed(bedList[i]); cout << "\n";
//cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl;
}
}
}
void BedSort::SortBedByChromThenScoreAsc() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bed->loadBedFileIntoMapNoBin();
if (bed->bedType >= 5) {
// loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
sort(bedList.begin(), bedList.end(), sortByScoreAsc);
for (unsigned int i = 0; i < bedList.size(); ++i) {
reportBed(bedList[i]); cout << "\n";
//cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl;
}
}
}
else {
cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl;
exit(1);
}
}
void BedSort::SortBedByChromThenScoreDesc() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bed->loadBedFileIntoMapNoBin();
if (bed->bedType >= 5) {
// loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
sort(bedList.begin(), bedList.end(), sortByScoreDesc);
for (unsigned int i = 0; i < bedList.size(); ++i) {
reportBed(bedList[i]); cout << "\n";
//cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl;
}
}
}
else {
cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl;
exit(1);
}
}
......@@ -22,16 +22,24 @@ class BedSort {
public:
// constructor
BedSort(string &);
// destructor
~BedSort(void);
void SortBed();
private:
// constructor
BedSort(string &);
// destructor
~BedSort(void);
// write BED to stdout
void reportBed(const BED &);
void SortBed(); // the default. sorts by chrom (asc.) then by start (asc.)
void SortBedBySizeAsc();
void SortBedBySizeDesc();
void SortBedByChromThenSizeAsc();
void SortBedByChromThenSizeDesc();
void SortBedByChromThenScoreAsc();
void SortBedByChromThenScoreDesc();
private:
string bedFile;
// instance of a bed file class.
......
......@@ -23,7 +23,15 @@ int main(int argc, char* argv[]) {
// input files
string bedFile;
bool haveBed = false;
int sortChoices = 0;
bool sortBySizeAsc = false;
bool sortBySizeDesc = false;
bool sortByChromThenSizeAsc = false;
bool sortByChromThenSizeDesc = false;
bool sortByChromThenScoreAsc = false;
bool sortByChromThenScoreDesc = false;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
......@@ -45,26 +53,82 @@ int main(int argc, char* argv[]) {
int parameterLength = (int)strlen(argv[i]);
if(argv[i]) {
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
haveBed = true;
bedFile = argv[i];
bedFile = argv[i + 1];
i++;
}
else if(PARAMETER_CHECK("-sizeA", 6, parameterLength)) {
sortBySizeAsc = true;
sortChoices++;
i++;
}
else if(PARAMETER_CHECK("-sizeD", 6, parameterLength)) {
sortBySizeDesc = true;
sortChoices++;
i++;
}
else if(PARAMETER_CHECK("-chrThenSizeA", 13, parameterLength)) {
sortByChromThenSizeAsc = true;
sortChoices++;
i++;
}
else if(PARAMETER_CHECK("-chrThenSizeD", 13, parameterLength)) {
sortByChromThenSizeDesc = true;
sortChoices++;
i++;
}
else if(PARAMETER_CHECK("-chrThenScoreA", 14, parameterLength)) {
sortByChromThenScoreAsc = true;
sortChoices++;
i++;
}
else if(PARAMETER_CHECK("-chrThenScoreD", 14, parameterLength)) {
sortByChromThenScoreDesc = true;
sortChoices++;
i++;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBed) {
cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl;
showHelp = true;
cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl;
showHelp = true;
}
if (sortChoices > 1) {
cerr << endl << "*****" << endl << "*****ERROR: Sorting options are mutually exclusive. Please choose just one. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedSort *bm = new BedSort(bedFile);
bm->SortBed();
if (sortBySizeAsc) {
bm->SortBedBySizeAsc();
}
else if (sortBySizeDesc) {
bm->SortBedBySizeDesc();
}
else if (sortByChromThenSizeAsc) {
bm->SortBedByChromThenSizeAsc();
}
else if (sortByChromThenSizeDesc) {
bm->SortBedByChromThenSizeDesc();
}
else if (sortByChromThenScoreAsc) {
bm->SortBedByChromThenScoreAsc();
}
else if (sortByChromThenScoreDesc) {
bm->SortBedByChromThenScoreDesc();
}
else {
bm->SortBed();
}
return 0;
}
else {
......@@ -73,17 +137,27 @@ int main(int argc, char* argv[]) {
}
/*
    ShowHelp

    Writes the program banner, description, usage line, the list of sorting
    options and the usage notes to stderr, then terminates the program with
    exit status 1. This function never returns.
*/
void ShowHelp(void) {

    // banner
    cerr << "===============================================" << endl;
    cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ;
    cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ;
    cerr << " Hall Laboratory, University of Virginia" << endl;
    cerr << "===============================================" << endl << endl;

    // a single description and usage line; the usage reflects that the
    // input file must be supplied through -i
    cerr << "Description: Sorts a BED file in various and useful ways." << endl << endl;
    cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input.bed>" << endl << endl;

    // the sorting options
    cerr << "OPTIONS: " << endl;
    cerr << "\t" << "-sizeA\t\t" << "Sort the BED file by feature size in ascending order. Sorts across all chromosomes." << endl << endl;
    cerr << "\t" << "-sizeD\t\t" << "Sort the BED file by feature size in descending order. Sorts across all chromosomes." << endl << endl;
    cerr << "\t" << "-chrThenSizeA\t" << "Sort the BED file by chrom (ascending), then feature size in ascending order." << endl << endl;
    cerr << "\t" << "-chrThenSizeD\t" << "Sort the BED file by chrom (ascending), then feature size in descending order." << endl << endl;
    cerr << "\t" << "-chrThenScoreA\t" << "Sort the BED file by chrom (ascending), then score in ascending order." << endl << endl;
    cerr << "\t" << "-chrThenScoreD\t" << "Sort the BED file by chrom (ascending), then score in descending order." << endl << endl;

    cerr << "NOTES: " << endl;
    cerr << "\t" << "-i stdin\t\t" << "Allows the BED file to be read from stdin. E.g.: cat a.bed | sortBed -i stdin" << endl << endl;
    cerr << "\t***Only BED3 - BED6 formats allowed.***"<< endl << endl;

    // end the program here
    exit(1);
}
......@@ -68,29 +68,6 @@ int max(const int a, int b) {
}
}
//*********************************************
// Sorting functions
//*********************************************
// Comparator: true when a's chrom orders strictly before b's chrom.
bool sortByChrom(BED const & a, BED const & b) {
    return a.chrom < b.chrom;
}
// Comparator: true when a starts strictly before b.
bool sortByStart(const BED &a, const BED &b) {
    return a.start < b.start;
}
// Comparator: orders by chrom first and, for entries on the same chrom,
// by start position. Returns true when a must come before b.
// Every path now ends in an explicit return, so control can no longer fall
// off the end of this non-void function.
bool byChromThenStart(BED const & a, BED const & b) {

    if (a.chrom < b.chrom) return true;
    if (a.chrom > b.chrom) return false;

    // same chrom: the start position decides; equal starts are not "less"
    return a.start < b.start;
}
//************************************************
// Exception checking
//************************************************
......@@ -120,7 +97,58 @@ static int getBin(int start, int end)
return 0;
}
//*********************************************
// Sorting functions
//*********************************************
// Comparator: true when a's chrom orders strictly before b's chrom.
bool sortByChrom(BED const & a, BED const & b) {
    return a.chrom < b.chrom;
}
// Comparator: true when a starts strictly before b.
bool sortByStart(const BED &a, const BED &b) {
    return a.start < b.start;
}
// Comparator for ascending feature size, where size is (end - start)
// held in an unsigned int. True when a is strictly shorter than b.
bool sortBySizeAsc(const BED &a, const BED &b) {
    const unsigned int sizeOfA = a.end - a.start;
    const unsigned int sizeOfB = b.end - b.start;
    return sizeOfA < sizeOfB;
}
// Comparator for descending feature size, where size is (end - start)
// held in an unsigned int. True when a is strictly longer than b.
bool sortBySizeDesc(const BED &a, const BED &b) {
    const unsigned int sizeOfA = a.end - a.start;
    const unsigned int sizeOfB = b.end - b.start;
    return sizeOfA > sizeOfB;
}
// Comparator for ascending score. True when a's score is strictly lower.
bool sortByScoreAsc(const BED &a, const BED &b) {
    return a.score < b.score;
}
// Comparator for descending score. True when a's score is strictly higher.
bool sortByScoreDesc(const BED &a, const BED &b) {
    return a.score > b.score;
}
// Comparator: orders by chrom first and, for entries on the same chrom,
// by start position. Returns true when a must come before b.
bool byChromThenStart(BED const & a, BED const & b) {

    if (a.chrom < b.chrom) return true;
    if (a.chrom > b.chrom) return false;

    // same chrom: the start position decides; equal starts are not "less"
    return a.start < b.start;
}
void BedFile::binKeeperFind(map<int, vector<BED>, std::less<int> > &bk, const int start, const int end, vector<BED> &hits)
/*
......
......@@ -59,6 +59,15 @@ std::string ToString(const T & value)
void Tokenize(const string& str, vector<string>& tokens);
// BED Sorting Methods
// Binary predicates over two BED entries (a, b). Each returns true when a
// should be placed before b.
bool sortByChrom(BED const &, BED const &);        // a.chrom < b.chrom
bool sortByStart(const BED &, const BED &);        // a.start < b.start
bool sortBySizeAsc(const BED &, const BED &);      // (end - start) of a is smaller
bool sortBySizeDesc(const BED &, const BED &);     // (end - start) of a is larger
bool sortByScoreAsc(const BED &, const BED &);     // a.score < b.score
bool sortByScoreDesc(const BED &, const BED &);    // a.score > b.score
bool byChromThenStart(BED const &, BED const &);   // chrom first, then start
//*************************************************
// Common typedefs
//*************************************************
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment