diff --git a/src/fastaFromBed/fastaFromBed.cpp b/src/fastaFromBed/fastaFromBed.cpp index abb7105ce55533ae71c9e81c3186fbb9debae599..931ebb86bce394ccc260a94dec5d9d4dcc0fd917 100755 --- a/src/fastaFromBed/fastaFromBed.cpp +++ b/src/fastaFromBed/fastaFromBed.cpp @@ -12,7 +12,8 @@ #include "lineFileUtilities.h" #include "fastaFromBed.h" -Bed2Fa::Bed2Fa(bool &useName, string &dbFile, string &bedFile, string &fastaOutFile, bool &useFasta) { +Bed2Fa::Bed2Fa(bool &useName, string &dbFile, string &bedFile, string &fastaOutFile, + bool &useFasta, bool &useStrand) { if (useName) { this->useName = true; @@ -22,7 +23,8 @@ Bed2Fa::Bed2Fa(bool &useName, string &dbFile, string &bedFile, string &fastaOutF this->bedFile = bedFile; this->fastaOutFile = fastaOutFile; this->useFasta = useFasta; - + this->useStrand = useStrand; + this->bed = new BedFile(this->bedFile); } @@ -78,17 +80,35 @@ void Bed2Fa::ExtractDNA() { unsigned int start = bedList[i].start; unsigned int end = bedList[i].end; - if ( (start <= currDNA.size()) && (end <= currDNA.size()) ) { + if ( (start <= currDNA.size()) && (end <= currDNA.size()) ) { + string dna = currDNA.substr(bedList[i].start, ((bedList[i].end - bedList[i].start))); - + // revcomp if necessary. Thanks to Thomas Doktor. + if ((this->useStrand == true) && (bedList[i].strand == "-")) { + reverseComplement(dna); + } + if (!(this->useName)) { if (this->useFasta == true) { - faOut << ">" << currChrom << ":" - << bedList[i].start << "-" << bedList[i].end << endl << dna << endl; + if (this->useStrand == true) { + faOut << ">" << currChrom << ":" << bedList[i].start << "-" + << bedList[i].end << "(" << bedList[i].strand << ")" << endl << dna << endl; + } + else { + faOut << ">" << currChrom << ":" << bedList[i].start << "-" + << bedList[i].end << endl << dna << endl; + } } else { - faOut << currChrom << ":" << bedList[i].start << "-" << bedList[i].end - << "\t" << dna << endl; + if (this->useStrand == true) { + faOut << currChrom << ":" << bedList[i].start << "-" + << bedList[i].end << "(" << bedList[i].strand << ")" + << "\t" << dna << endl; + } + else { + faOut << currChrom << ":" << bedList[i].start << "-" << bedList[i].end + << "\t" << dna << endl; + } } } else { @@ -120,18 +140,36 @@ void Bed2Fa::ExtractDNA() { unsigned int start = bedList[i].start; unsigned int end = bedList[i].end; - - if ( (start <= currDNA.size()) && (end <= currDNA.size()) ) { + + if ( (start <= currDNA.size()) && (end <= currDNA.size()) ) { + string dna = currDNA.substr(bedList[i].start, ((bedList[i].end - bedList[i].start))); - + // revcomp if necessary. Thanks to Thomas Doktor. + if ((this->useStrand == true) && (bedList[i].strand == "-")) { + reverseComplement(dna); + } + if (!(this->useName)) { if (this->useFasta == true) { - faOut << ">" << currChrom << ":" - << bedList[i].start << "-" << bedList[i].end << endl << dna << endl; + if (this->useStrand == true) { + faOut << ">" << currChrom << ":" << bedList[i].start << "-" + << bedList[i].end << "(" << bedList[i].strand << ")" << endl << dna << endl; + } + else { + faOut << ">" << currChrom << ":" << bedList[i].start << "-" + << bedList[i].end << endl << dna << endl; + } } else { - faOut << currChrom << ":" << bedList[i].start << "-" << bedList[i].end - << "\t" << dna << endl; + if (this->useStrand == true) { + faOut << currChrom << ":" << bedList[i].start << "-" + << bedList[i].end << "(" << bedList[i].strand << ")" + << "\t" << dna << endl; + } + else { + faOut << currChrom << ":" << bedList[i].start << "-" << bedList[i].end + << "\t" << dna << endl; + } } } else { @@ -142,10 +180,10 @@ void Bed2Fa::ExtractDNA() { faOut << bedList[i].name << "\t" << dna << endl; } } - } else cerr << "Feature (" << bedList[i].chrom << ":" << start << "-" << end << ") beyond " << currChrom << " size (" << currDNA.size() << " bp). Skipping." << endl; + } currDNA = ""; } diff --git a/src/fastaFromBed/fastaFromBed.h b/src/fastaFromBed/fastaFromBed.h index 4593d94d024160cce261320659ce652e4bf63d96..1b62f3675dd599d2c13a09f70674582f13626f30 100755 --- a/src/fastaFromBed/fastaFromBed.h +++ b/src/fastaFromBed/fastaFromBed.h @@ -28,7 +28,8 @@ class Bed2Fa { public: // constructor - Bed2Fa(bool &, string &, string &, string &, bool &); + Bed2Fa(bool &useName, string &dbFile, string &bedFile, string &fastaOutFile, + bool &useFasta, bool &useStrand); // destructor ~Bed2Fa(void); @@ -42,6 +43,7 @@ private: string bedFile; string fastaOutFile; bool useFasta; + bool useStrand; // instance of a bed file class. BedFile *bed; diff --git a/src/fastaFromBed/fastaFromBedMain.cpp b/src/fastaFromBed/fastaFromBedMain.cpp index 5d48d80c1e9c17000e13b3590faaf30d2ff096a1..1a3e0eb31c7619c243b92e8aaa40cdcec854665e 100755 --- a/src/fastaFromBed/fastaFromBedMain.cpp +++ b/src/fastaFromBed/fastaFromBedMain.cpp @@ -42,6 +42,7 @@ int main(int argc, char* argv[]) { bool haveFastaOut = false; bool useNameOnly = false; bool useFasta = true; + bool useStrand = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -89,6 +90,9 @@ int main(int argc, char* argv[]) { else if(PARAMETER_CHECK("-tab", 4, parameterLength)) { useFasta = false; } + else if(PARAMETER_CHECK("-s", 2, parameterLength)) { + useStrand = true; + } else { cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -101,7 +105,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { - Bed2Fa *b2f = new Bed2Fa(useNameOnly, fastaDbFile, bedFile, fastaOutFile, useFasta); + Bed2Fa *b2f = new Bed2Fa(useNameOnly, fastaDbFile, bedFile, fastaOutFile, useFasta, useStrand); b2f->ExtractDNA(); return 0; } @@ -125,8 +129,13 @@ void ShowHelp(void) { cerr << "\t-bed\tBED file of ranges to extract from -fi" << endl; cerr << "\t-fo\tOutput file (can be FASTA or TAB-delimited)" << endl; cerr << "\t-name\tUse the BED name field (#4) for the FASTA header" << endl; + cerr << "\t-tab\tWrite output in TAB delimited format." << endl; - cerr << "\t\tDefault is FASTA format." << endl; + cerr << "\t\t- Default is FASTA format." << endl << endl; + + cerr << "\t-s\tForce strandedness. If the feature occupies the antisense strand," << endl; + cerr << "\t\tthe sequence will be reverse complemented." << endl; + cerr << "\t\t- By default, strand information is ignored." << endl << endl; diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index afee563c5e96955d23062effc0cd86d93a9fa822..641ae999553c83e2179f96eb1c270ddc17a721f0 100755 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -17,23 +17,25 @@ Constructor */ BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, - bool writeA, bool writeB, float overlapFraction, + bool writeA, bool writeB, bool writeOverlap, float overlapFraction, bool noHit, bool writeCount, bool forceStrand, bool reciprocal, bool bamInput, bool bamOutput) { - this->bedAFile = bedAFile; - this->bedBFile = bedBFile; - this->anyHit = anyHit; - this->noHit = noHit; - this->writeA = writeA; - this->writeB = writeB; - this->writeCount = writeCount; + this->bedAFile = bedAFile; + this->bedBFile = bedBFile; + this->anyHit = anyHit; + this->noHit = noHit; + this->writeA = writeA; + this->writeB = writeB; + this->writeOverlap = writeOverlap; + this->writeCount = writeCount; this->overlapFraction = overlapFraction; - this->forceStrand = forceStrand; - this->reciprocal = reciprocal; - this->bamInput = bamInput; - this->bamOutput = bamOutput; + this->forceStrand = forceStrand; + this->reciprocal = reciprocal; + this->bamInput = bamInput; + this->bamOutput = bamOutput; + // create new BED file objects for A and B this->bedA = new BedFile(bedAFile); this->bedB = new BedFile(bedBFile); } @@ -80,7 +82,7 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { hitsFound = true; numOverlaps++; // we have another hit for A - +/* if (!writeB && printable) { if (writeA) bedA->reportBedNewLine(a); else bedA->reportBedRangeNewLine(a,s,e); @@ -95,6 +97,30 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { bedB->reportBedNewLine(*h); } } +*/ + // new + if (printable == true) { + if (writeA == false && writeB == false && writeOverlap == false) { + bedA->reportBedRangeNewLine(a,s,e); + } + else if (writeA == true && writeB == true) { + bedA->reportBedTab(a); + bedB->reportBedNewLine(*h); + } + else if (writeA == true) { + bedA->reportBedNewLine(a); + } + else if (writeB == true) { + bedA->reportBedRangeTab(a,s,e); + bedB->reportBedNewLine(*h); + } + else if (writeOverlap == true) { + bedA->reportBedTab(a); + bedB->reportBedTab(*h); + printf("%d\n", overlapBases); + } + } + } else { // the user wants there to be sufficient reciprocal overlap int bLength = (h->end - h->start); diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index 656e45f9d0a4946d17e069cd7ce698bcbd6ba5e1..29d6a9ddaf0e9306d99e4af62ea8cae44f627027 100755 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -34,7 +34,7 @@ public: // constructor BedIntersect(string bedAFile, string bedBFile, bool anyHit, - bool writeA, bool writeB, float overlapFraction, + bool writeA, bool writeB, bool writeOverlap, float overlapFraction, bool noHit, bool writeCount, bool forceStrand, bool reciprocal, bool bamInput, bool bamOutput); @@ -63,6 +63,7 @@ private: bool writeA; bool writeB; bool writeCount; + bool writeOverlap; bool forceStrand; bool reciprocal; float overlapFraction; diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp index 270986945e2570e1c87543da54cef4c60350ec84..fce7714715fa5af7a2e93242a6e3e5acd56e977d 100755 --- a/src/intersectBed/intersectMain.cpp +++ b/src/intersectBed/intersectMain.cpp @@ -36,18 +36,19 @@ int main(int argc, char* argv[]) { // input arguments float overlapFraction = 1E-9; - bool haveBedA = false; - bool haveBedB = false; - bool noHit = false; - bool anyHit = false; - bool writeA = false; - bool writeB = false; - bool writeCount = false; - bool haveFraction = false; + bool haveBedA = false; + bool haveBedB = false; + bool noHit = false; + bool anyHit = false; + bool writeA = false; + bool writeB = false; + bool writeCount = false; + bool writeOverlap = false; + bool haveFraction = false; bool reciprocalFraction = false; - bool forceStrand = false; - bool inputIsBam = false; - bool outputIsBam = true; + bool forceStrand = false; + bool inputIsBam = false; + bool outputIsBam = true; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -110,6 +111,9 @@ int main(int argc, char* argv[]) { else if(PARAMETER_CHECK("-wb", 3, parameterLength)) { writeB = true; } + else if(PARAMETER_CHECK("-wo", 3, parameterLength)) { + writeOverlap = true; + } else if(PARAMETER_CHECK("-c", 2, parameterLength)) { writeCount = true; } @@ -143,6 +147,21 @@ int main(int argc, char* argv[]) { cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -c, not both." << endl << "*****" << endl; showHelp = true; } + + if (writeCount && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeA && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wa OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeB && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } if (reciprocalFraction && !haveFraction) { cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl; @@ -153,11 +172,21 @@ int main(int argc, char* argv[]) { cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; showHelp = true; } + + if (anyHit && writeB) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wb, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } if (!showHelp) { - BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, + BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, overlapFraction, noHit, writeCount, forceStrand, reciprocalFraction, inputIsBam, outputIsBam); bi->DetermineBedInput(); @@ -186,18 +215,24 @@ void ShowHelp(void) { cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; cerr << "\t-wa\t" << "Write the original entry in A for each overlap." << endl << endl; + cerr << "\t-wb\t" << "Write the original entry in B for each overlap." << endl; - cerr << "\t\t- Useful for knowing _what_ A overlaps. Restricted by -f." << endl << endl; + cerr << "\t\t- Useful for knowing _what_ A overlaps. Restricted by -f and -r." << endl << endl; + cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; - cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl << endl; - + cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; - cerr << "\t\t- Overlaps restricted by -f." << endl << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; - cerr << "\t\t- Similar to \"grep -v.\"" << endl << endl; + cerr << "\t\t- Similar to \"grep -v\" (an homage)." << endl << endl; cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; @@ -206,7 +241,6 @@ void ShowHelp(void) { cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; - cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; cerr << "\t\toverlap A on the same strand." << endl;