From 574b93490d84db51c2f9d786b1c3d5fb9e092386 Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Fri, 9 Mar 2012 10:14:50 -0500
Subject: [PATCH] Added -C (case-insensitive) matching to nucBed.

---
 src/nucBed/nucBed.cpp                         |  8 +++++---
 src/nucBed/nucBed.h                           |  3 ++-
 src/nucBed/nucBedMain.cpp                     | 17 ++++++++++++++---
 src/utils/sequenceUtilities/sequenceUtils.cpp | 19 +++++++++++++------
 src/utils/sequenceUtilities/sequenceUtils.h   |  2 +-
 5 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/src/nucBed/nucBed.cpp b/src/nucBed/nucBed.cpp
index acc97c65..446f6a6b 100644
--- a/src/nucBed/nucBed.cpp
+++ b/src/nucBed/nucBed.cpp
@@ -14,7 +14,9 @@
 
 
 NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, 
-               bool hasPattern, const string &pattern, bool forceStrand) {
+               bool hasPattern, const string &pattern, 
+               bool forceStrand, bool ignoreCase) 
+{
 
     _dbFile       = dbFile;
     _bedFile      = bedFile;
@@ -22,7 +24,7 @@ NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq,
     _hasPattern   = hasPattern;
     _pattern      = pattern;
     _forceStrand  = forceStrand;
-    
+    _ignoreCase   = ignoreCase;
     _bed = new BedFile(_bedFile);
 
     // Compute the DNA content in each BED/GFF/VCF interval
@@ -42,7 +44,7 @@ void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLen
     getDnaContent(sequence,a,c,g,t,n,other);
     
     if (_hasPattern)
-        userPatternCount = countPattern(sequence, _pattern);
+        userPatternCount = countPattern(sequence, _pattern, _ignoreCase);
     
     
     // report the original interval
diff --git a/src/nucBed/nucBed.h b/src/nucBed/nucBed.h
index 2fe40ddf..ba45af4f 100644
--- a/src/nucBed/nucBed.h
+++ b/src/nucBed/nucBed.h
@@ -31,7 +31,7 @@ public:
     // constructor
     NucBed(string &dbFile, string &bedFile, bool printSeq, 
            bool hasPattern, const string &pattern,
-           bool forceStrand);
+           bool forceStrand, bool ignoreCase);
     // destructor
     ~NucBed(void);
 
@@ -45,6 +45,7 @@ private:
     bool   _hasPattern;
     string _pattern;
     bool _forceStrand;
+    bool _ignoreCase;
 
     // instance of a bed file class.
     BedFile  *_bed;
diff --git a/src/nucBed/nucBedMain.cpp b/src/nucBed/nucBedMain.cpp
index c2e4a523..eff8bec3 100644
--- a/src/nucBed/nucBedMain.cpp
+++ b/src/nucBed/nucBedMain.cpp
@@ -40,6 +40,7 @@ int nuc_main(int argc, char* argv[]) {
     bool printSeq    = false;
     bool hasPattern  = false;
     bool forceStrand = false;
+    bool ignoreCase  = false;
 
     // check to see if we should print out some help
     if(argc <= 1) showHelp = true;
@@ -80,6 +81,9 @@ int nuc_main(int argc, char* argv[]) {
         else if(PARAMETER_CHECK("-s", 2, parameterLength)) {
             forceStrand = true;
         }
+        else if(PARAMETER_CHECK("-C", 2, parameterLength)) {
+            ignoreCase = true;
+        }
         else if(PARAMETER_CHECK("-pattern", 8, parameterLength)) {
             if ((i+1) < argc) {
                 hasPattern = true;
@@ -99,7 +103,8 @@ int nuc_main(int argc, char* argv[]) {
 
     if (!showHelp) {
 
-        NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq, hasPattern, pattern, forceStrand);
+        NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq, 
+                                 hasPattern, pattern, forceStrand, ignoreCase);
         delete nuc;
     }
     else {
@@ -117,13 +122,19 @@ void nuc_help(void) {
     cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -fi <fasta> -bed <bed/gff/vcf>" << endl << endl;
 
     cerr << "Options: " << endl;
+
     cerr << "\t-fi\tInput FASTA file" << endl << endl;
+
     cerr << "\t-bed\tBED/GFF/VCF file of ranges to extract from -fi" << endl << endl;
+
     cerr << "\t-s\tProfile the sequence according to strand." << endl << endl;
+
     cerr << "\t-seq\tPrint the extracted sequence" << endl << endl;
+
     cerr << "\t-pattern\tReport the number of times a user-defined sequence" << endl;
-    cerr << "\t\t\tis observed (case-insensitive)." << endl << endl;    
-    
+    cerr << "\t\t\tis observed (case-sensitive)." << endl << endl;    
+
+    cerr << "\t-C\tIgore case when matching -pattern. By defaulty, case matters." << endl << endl;
     
     cerr << "Output format: " << endl;
     cerr << "\tThe following information will be reported after each BED entry:" << endl;
diff --git a/src/utils/sequenceUtilities/sequenceUtils.cpp b/src/utils/sequenceUtilities/sequenceUtils.cpp
index 17af1828..1ab31929 100644
--- a/src/utils/sequenceUtilities/sequenceUtils.cpp
+++ b/src/utils/sequenceUtilities/sequenceUtils.cpp
@@ -110,17 +110,24 @@ void getDnaContent(const string &seq, int &a, int &c, int &g, int &t, int &n, in
 }
 
 
-int countPattern(const string &seq, const string &pattern)
+int countPattern(const string &seq, const string &pattern, bool ignoreCase)
 {
-    // swap the bases
-    int patternLength = pattern.size();
+    string s = seq;
+    string p = pattern;
+    // standardize the seq and the pattern 
+    // if case should be ignored
+    if (ignoreCase) {
+        toUpperCase(s);
+        toUpperCase(p);
+    }
+    int patternLength = p.size();
     int patternCount = 0;
-    for(unsigned int i = 0; i < seq.length(); i++) {
-        if (seq.substr(i,patternLength) == pattern) {
+    for(unsigned int i = 0; i < s.length(); i++) {
+        if (s.substr(i,patternLength) == p) {
             patternCount++;
         }
     }
     return patternCount;
 }
-        
+
 
diff --git a/src/utils/sequenceUtilities/sequenceUtils.h b/src/utils/sequenceUtilities/sequenceUtils.h
index 5090ef70..2296d66c 100644
--- a/src/utils/sequenceUtilities/sequenceUtils.h
+++ b/src/utils/sequenceUtilities/sequenceUtils.h
@@ -22,6 +22,6 @@ void toUpperCase(string &seq);
 // Calculates the number of a, c, g, t, n, and other bases found in a sequence
 void getDnaContent(const string &seq, int &a, int &c, int &g, int &t, int &n, int &other);
 
-int countPattern(const string &seq, const string &pattern);
+int countPattern(const string &seq, const string &pattern, bool ignoreCase);
 
 #endif
-- 
GitLab