From 574b93490d84db51c2f9d786b1c3d5fb9e092386 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Fri, 9 Mar 2012 10:14:50 -0500 Subject: [PATCH] Added -C (case-insensitive) matching to nucBed. --- src/nucBed/nucBed.cpp | 8 +++++--- src/nucBed/nucBed.h | 3 ++- src/nucBed/nucBedMain.cpp | 17 ++++++++++++++--- src/utils/sequenceUtilities/sequenceUtils.cpp | 19 +++++++++++++------ src/utils/sequenceUtilities/sequenceUtils.h | 2 +- 5 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/nucBed/nucBed.cpp b/src/nucBed/nucBed.cpp index acc97c65..446f6a6b 100644 --- a/src/nucBed/nucBed.cpp +++ b/src/nucBed/nucBed.cpp @@ -14,7 +14,9 @@ NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, - bool hasPattern, const string &pattern, bool forceStrand) { + bool hasPattern, const string &pattern, + bool forceStrand, bool ignoreCase) +{ _dbFile = dbFile; _bedFile = bedFile; @@ -22,7 +24,7 @@ NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, _hasPattern = hasPattern; _pattern = pattern; _forceStrand = forceStrand; - + _ignoreCase = ignoreCase; _bed = new BedFile(_bedFile); // Compute the DNA content in each BED/GFF/VCF interval @@ -42,7 +44,7 @@ void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLen getDnaContent(sequence,a,c,g,t,n,other); if (_hasPattern) - userPatternCount = countPattern(sequence, _pattern); + userPatternCount = countPattern(sequence, _pattern, _ignoreCase); // report the original interval diff --git a/src/nucBed/nucBed.h b/src/nucBed/nucBed.h index 2fe40ddf..ba45af4f 100644 --- a/src/nucBed/nucBed.h +++ b/src/nucBed/nucBed.h @@ -31,7 +31,7 @@ public: // constructor NucBed(string &dbFile, string &bedFile, bool printSeq, bool hasPattern, const string &pattern, - bool forceStrand); + bool forceStrand, bool ignoreCase); // destructor ~NucBed(void); @@ -45,6 +45,7 @@ private: bool _hasPattern; string _pattern; bool _forceStrand; + bool _ignoreCase; // instance of a bed file class. BedFile *_bed; diff --git a/src/nucBed/nucBedMain.cpp b/src/nucBed/nucBedMain.cpp index c2e4a523..eff8bec3 100644 --- a/src/nucBed/nucBedMain.cpp +++ b/src/nucBed/nucBedMain.cpp @@ -40,6 +40,7 @@ int nuc_main(int argc, char* argv[]) { bool printSeq = false; bool hasPattern = false; bool forceStrand = false; + bool ignoreCase = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -80,6 +81,9 @@ int nuc_main(int argc, char* argv[]) { else if(PARAMETER_CHECK("-s", 2, parameterLength)) { forceStrand = true; } + else if(PARAMETER_CHECK("-C", 2, parameterLength)) { + ignoreCase = true; + } else if(PARAMETER_CHECK("-pattern", 8, parameterLength)) { if ((i+1) < argc) { hasPattern = true; @@ -99,7 +103,8 @@ int nuc_main(int argc, char* argv[]) { if (!showHelp) { - NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq, hasPattern, pattern, forceStrand); + NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq, + hasPattern, pattern, forceStrand, ignoreCase); delete nuc; } else { @@ -117,13 +122,19 @@ void nuc_help(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -fi <fasta> -bed <bed/gff/vcf>" << endl << endl; cerr << "Options: " << endl; + cerr << "\t-fi\tInput FASTA file" << endl << endl; + cerr << "\t-bed\tBED/GFF/VCF file of ranges to extract from -fi" << endl << endl; + cerr << "\t-s\tProfile the sequence according to strand." << endl << endl; + cerr << "\t-seq\tPrint the extracted sequence" << endl << endl; + cerr << "\t-pattern\tReport the number of times a user-defined sequence" << endl; - cerr << "\t\t\tis observed (case-insensitive)." << endl << endl; - + cerr << "\t\t\tis observed (case-sensitive)." << endl << endl; + + cerr << "\t-C\tIgore case when matching -pattern. By defaulty, case matters." << endl << endl; cerr << "Output format: " << endl; cerr << "\tThe following information will be reported after each BED entry:" << endl; diff --git a/src/utils/sequenceUtilities/sequenceUtils.cpp b/src/utils/sequenceUtilities/sequenceUtils.cpp index 17af1828..1ab31929 100644 --- a/src/utils/sequenceUtilities/sequenceUtils.cpp +++ b/src/utils/sequenceUtilities/sequenceUtils.cpp @@ -110,17 +110,24 @@ void getDnaContent(const string &seq, int &a, int &c, int &g, int &t, int &n, in } -int countPattern(const string &seq, const string &pattern) +int countPattern(const string &seq, const string &pattern, bool ignoreCase) { - // swap the bases - int patternLength = pattern.size(); + string s = seq; + string p = pattern; + // standardize the seq and the pattern + // if case should be ignored + if (ignoreCase) { + toUpperCase(s); + toUpperCase(p); + } + int patternLength = p.size(); int patternCount = 0; - for(unsigned int i = 0; i < seq.length(); i++) { - if (seq.substr(i,patternLength) == pattern) { + for(unsigned int i = 0; i < s.length(); i++) { + if (s.substr(i,patternLength) == p) { patternCount++; } } return patternCount; } - + diff --git a/src/utils/sequenceUtilities/sequenceUtils.h b/src/utils/sequenceUtilities/sequenceUtils.h index 5090ef70..2296d66c 100644 --- a/src/utils/sequenceUtilities/sequenceUtils.h +++ b/src/utils/sequenceUtilities/sequenceUtils.h @@ -22,6 +22,6 @@ void toUpperCase(string &seq); // Calculates the number of a, c, g, t, n, and other bases found in a sequence void getDnaContent(const string &seq, int &a, int &c, int &g, int &t, int &n, int &other); -int countPattern(const string &seq, const string &pattern); +int countPattern(const string &seq, const string &pattern, bool ignoreCase); #endif -- GitLab