Skip to content
Snippets Groups Projects
Commit 9eb4337c authored by nkindlon's avatar nkindlon
Browse files

Merge branch 'merge'

parents 8284c923 5eb3b68c
No related branches found
No related tags found
No related merge requests found
Showing
with 275 additions and 171 deletions
......@@ -48,29 +48,8 @@ void map_help(void) {
cerr << "Options: " << endl;
cerr << "\t-c\t" << "Specify columns from the B file to map onto intervals in A." << endl;
cerr << "\t\tDefault: 5." << endl;
cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl;
cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl;
cerr << "\t\tValid operations:" << endl;
cerr << "\t\t sum, min, max, absmin, absmax," << endl;
cerr << "\t\t mean, median," << endl;
cerr << "\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl;
cerr << "\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl;
cerr << "\t\t count" << endl;
cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl;
cerr << "\t\tDefault: sum" << endl;
cerr << "\t\tMultiple operations can be specified in a comma-delimited list." << endl << endl;
cerr << "\t\tIf there is only column, but multiple operations, all operations will be" << endl;
cerr << "\t\tapplied on that column. Likewise, if there is only one operation, but" << endl;
cerr << "multiple columns, that operation will be applied to all columns." << endl;
cerr << "\t\tOtherwise, the number of columns must match the the number of operations," << endl;
cerr << "and will be applied in respective order." << endl;
cerr << "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5," << endl;
cerr << "the mean of column 4, and the count of column 6." << endl;
cerr << "\t\tThe order of output columns will match the ordering given in the command." << endl << endl<<endl;
KeyListOpsHelp();
cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl;
cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl;
......
......@@ -56,29 +56,13 @@ void merge_help(void) {
cerr << "\t\tthat are the same strand." << endl;
cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl;
cerr << "\t-n\t" << "Report the number of BED entries that were merged." << endl;
cerr << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl;
cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl;
cerr << "\t\tto be merged." << endl;
cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-nms\t" << "Report the names of the merged features separated by commas." << endl;
cerr << "\t\tChange delim. with -delim." << endl << endl;
cerr << "\t-scores\t" << "Report the scores of the merged features. Specify one of " << endl;
cerr << "\t\tthe following options for reporting scores:" << endl;
cerr << "\t\t sum, min, max," << endl;
cerr << "\t\t mean, median, mode, antimode," << endl;
cerr << "\t\t collapse (i.e., print a semicolon-separated list)," << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-delim\t" << "Specify a custom delimiter for the -nms and -scores concat options" << endl;
cerr << "\t\t- Example: -delim \"|\"" << endl;
cerr << "\t\t- Default: \",\"." << endl << endl;
KeyListOpsHelp();
cerr << "Notes: " << endl;
cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl;
......
......@@ -52,6 +52,7 @@ ContextBase::ContextBase()
_forwardOnly(false),
_reverseOnly(false),
_hasColumnOpsMethods(false),
_keyListOps(NULL),
_desiredStrand(FileRecordMergeMgr::ANY_STRAND),
_maxDistance(0),
_useMergedIntervals(false)
......@@ -459,10 +460,11 @@ bool ContextBase::handle_delim()
void ContextBase::setColumnOpsMethods(bool val)
{
_hasColumnOpsMethods = val;
if (val) {
if (val && !_hasColumnOpsMethods) {
//was off, but we're turning it on.
_keyListOps = new KeyListOps();
}
_hasColumnOpsMethods = val;
}
const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const {
......
......@@ -103,7 +103,7 @@ bool ContextMerge::isValidState()
}
//column operations not allowed with BAM input
if ((!_keyListOps->getColumns().empty() || !_keyListOps->getOperations().empty()) &&
if (hasColumnOpsMethods() &&
getFile(0)->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
_errorMsg = "\n***** ERROR: column operations not supported for BAM input. *****";
return false;
......
......@@ -104,7 +104,11 @@ void SingleLineDelimTextFileReader::appendField(int fieldNum, QuickString &str)
bool SingleLineDelimTextFileReader::detectAndHandleHeader()
{
if (!isHeaderLine(_sLine)) {
//not sure why the linker is giving me a hard time about
//passing a non-const QuickString to isHeaderLine, but
//this const ref is a workaround.
const QuickString &sLine2 = _sLine;
if (!isHeaderLine(sLine2)) {
return false;
}
if (!_fullHeaderFound) {
......
......@@ -60,11 +60,8 @@ void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete)
return;
}
vector<QuickString> sizes;
vector<QuickString> starts;
int sizeCount = Tokenize(keyRecord->getBlockSizes(), sizes, ',', blockCount);
int startCount = Tokenize(keyRecord->getBlockStarts(), starts, ',', blockCount);
int sizeCount = _blockSizeTokens.tokenize(keyRecord->getBlockSizes(), ',');
int startCount = _blockStartTokens.tokenize(keyRecord->getBlockStarts(), ',');
if (blockCount != sizeCount || sizeCount != startCount) {
fprintf(stderr, "Error: found wrong block counts while splitting entry.\n");
......@@ -72,8 +69,8 @@ void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete)
}
for (int i=0; i < blockCount; i++) {
int startPos = keyRecord->getStartPos() + str2chrPos(starts[i].c_str());
int endPos = startPos + str2chrPos(sizes[i].c_str());
int startPos = keyRecord->getStartPos() + str2chrPos(_blockStartTokens.getElem(i).c_str());
int endPos = startPos + str2chrPos(_blockSizeTokens.getElem(i).c_str());
const Record *record = allocateAndAssignRecord(keyRecord, startPos, endPos);
keyList.push_back(record);
......
......@@ -16,6 +16,7 @@ using namespace std;
#include "FileRecordTypeChecker.h"
#include "RecordKeyList.h"
class RecordMgr;
class BlockMgr {
......@@ -50,6 +51,8 @@ private:
float _overlapFraction;
bool _hasReciprocal;
Tokenizer _blockSizeTokens;
Tokenizer _blockStartTokens;
// For now, all records will be split into Bed6 records.
const static FileRecordTypeChecker::RECORD_TYPE _blockRecordsType = FileRecordTypeChecker::BED6_RECORD_TYPE;
......
......@@ -11,6 +11,7 @@
******************************************************************************/
#include "NewGenomeFile.h"
#include "ParseTools.h"
#include "Tokenizer.h"
NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename)
: _maxId(-1)
......@@ -44,21 +45,20 @@ void NewGenomeFile::loadGenomeFileIntoMap() {
exit(1);
}
string sLine;
vector<QuickString> fields;
Tokenizer fieldTokens;
CHRPOS chrSize = 0;
QuickString chrName;
while (!genFile.eof()) {
sLine.clear();
fields.clear();
chrSize = 0;
chrName.clear();
getline(genFile, sLine);
Tokenize(sLine.c_str(), fields);
if (fields.size() != 2) {
int numFields = fieldTokens.tokenize(sLine.c_str());
if (numFields != 2) {
continue;
}
chrName = fields[0];
chrSize = str2chrPos(fields[1]);
chrName = fieldTokens.getElem(0);
chrSize = str2chrPos(fieldTokens.getElem(1));
_maxId++;
_chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId);
_startOffsets.push_back(_genomeLength);
......
......@@ -98,25 +98,28 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {
//member of each pair is a column number, and the second member is the code for the
//operation to perform on that column.
vector<QuickString> columnsVec;
vector<QuickString> opsVec;
int numCols = Tokenize(_columns, columnsVec, ',');
int numOps = Tokenize(_operations, opsVec, ',');
Tokenizer colTokens;
Tokenizer opsTokens;
int numCols = colTokens.tokenize(_columns, ',');
int numOps = opsTokens.tokenize(_operations, ',');
if (numOps < 1 || numCols < 1) {
cerr << endl << "*****" << endl
<< "***** ERROR: There must be at least one column and at least one operation named." << endl;
return false;
}
if (numOps > 1 && numCols != numOps) {
if (numOps > 1 && numCols > 1 && numCols != numOps) {
cerr << endl << "*****" << endl
<< "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl;
cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl;
cerr << "\ta single column to which all operations will be applied," << endl;
cerr << "\tor an operation for each column." << endl;
return false;
}
for (int i=0; i < (int)columnsVec.size(); i++) {
int col = str2chrPos(columnsVec[i]);
int loop = max(numCols, numOps);
for (int i=0; i < loop; i++) {
int col = str2chrPos(colTokens.getElem(numCols > 1 ? i : 0));
//check that the column number is valid
if (col < 1 || col > dbFile->getNumFields()) {
......@@ -124,7 +127,7 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {
<< dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl;
return false;
}
const QuickString &operation = opsVec.size() > 1 ? opsVec[i] : opsVec[0];
const QuickString &operation = opsTokens.getElem(numOps > 1 ? i : 0);
OP_TYPES opCode = getOpCode(operation);
if (opCode == INVALID) {
cerr << endl << "*****" << endl
......@@ -361,4 +364,33 @@ const QuickString & KeyListOps::getOpVals(RecordKeyList &hits)
return _outVals;
}
// Print the shared usage text for the -c / -o / -delim column-operation
// options. Shared by every tool (map, merge, ...) that supports column
// ops so the help stays consistent in one place.
void KeyListOpsHelp() {
	cerr << "\t-c\t" << "Specify columns from the B file to map onto intervals in A." << endl;
	cerr << "\t\tDefault: 5." << endl;
	cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl;
	cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl;
	cerr << "\t\tValid operations:" << endl;
	cerr << "\t\t sum, min, max, absmin, absmax," << endl;
	cerr << "\t\t mean, median," << endl;
	cerr << "\t\t collapse (i.e., print a delimited list (duplicates allowed)), " << endl;
	cerr << "\t\t distinct (i.e., print a delimited list (NO duplicates allowed)), " << endl;
	cerr << "\t\t count" << endl;
	cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl;
	cerr << "\t\tDefault: sum" << endl;
	cerr << "\t\tMultiple operations can be specified in a comma-delimited list." << endl << endl;
	// "only one column" and "the number" fix typos in the previous wording;
	// the "\t\t" prefixes keep continuation lines aligned with the rest of
	// the indented help output.
	cerr << "\t\tIf there is only one column, but multiple operations, all operations will be" << endl;
	cerr << "\t\tapplied on that column. Likewise, if there is only one operation, but" << endl;
	cerr << "\t\tmultiple columns, that operation will be applied to all columns." << endl;
	cerr << "\t\tOtherwise, the number of columns must match the number of operations," << endl;
	cerr << "\t\tand will be applied in respective order." << endl;
	cerr << "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5," << endl;
	cerr << "\t\tthe mean of column 4, and the count of column 6." << endl;
	cerr << "\t\tThe order of output columns will match the ordering given in the command." << endl << endl;
	cerr << "\t-delim\t" << "Specify a custom delimiter for the collapse operations." << endl;
	cerr << "\t\t- Example: -delim \"|\"" << endl;
	cerr << "\t\t- Default: \",\"." << endl << endl;
}
......@@ -12,6 +12,9 @@
class FileRecordMgr;
//print help message
void KeyListOpsHelp();
class KeyListOps {
public:
......
......@@ -13,7 +13,6 @@ FileRecordTypeChecker::FileRecordTypeChecker()
_isBed = false;
_isDelimited = false;
_delimChar = '\t'; //tab by default
_lines.clear();
_firstValidDataLineIdx = -1;
_isVCF = false;
_isBAM = false;
......@@ -161,9 +160,14 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len)
_fileType = SINGLE_LINE_DELIM_TEXT_FILE_TYPE;
//Tokenize the first line of valid data into fields.
const QuickString &line = _lines[_firstValidDataLineIdx];
_currLineElems.clear();
if (Tokenize(line, _currLineElems, _delimChar, _numFields) != _numFields) {
//Need to make a copy so next call to tokenizer doesn't overwrite the line.
QuickString line(_tokenizer.getElem(_firstValidDataLineIdx));
_tokenizer.setKeepFinalIncompleteElem(Tokenizer::USE_NOW);
_tokenizer.setNumExpectedItems(_numFields);
if (_tokenizer.tokenize(line, _delimChar) != _numFields) {
cerr << "Error: Type checker found wrong number of fields while tokenizing data line." << endl;
exit(1);
}
......@@ -173,7 +177,7 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len)
if (_numFields == 3) {
_recordType = BED3_RECORD_TYPE;
} else if (_numFields == 4) {
if (isNumeric(_currLineElems[3])) {
if (isNumeric(_tokenizer.getElem(3))) {
_recordType = BEDGRAPH_RECORD_TYPE;
_fourthFieldNumeric = true;
} else {
......@@ -223,12 +227,12 @@ bool FileRecordTypeChecker::isBedFormat() {
return false;
}
//the 2nd and 3rd fields must be numeric.
if (!isNumeric(_currLineElems[1]) || !isNumeric(_currLineElems[2])) {
if (!isNumeric(_tokenizer.getElem(1)) || !isNumeric(_tokenizer.getElem(2))) {
return false;
}
int start = str2chrPos(_currLineElems[1]);
int end = str2chrPos(_currLineElems[2]);
int start = str2chrPos(_tokenizer.getElem(1));
int end = str2chrPos(_tokenizer.getElem(2));
if (end < start) {
return false;
}
......@@ -242,11 +246,11 @@ bool FileRecordTypeChecker::isGFFformat()
return false;
}
//the 4th and 5th fields must be numeric.
if (!isNumeric(_currLineElems[3]) || !isNumeric(_currLineElems[4])) {
if (!isNumeric(_tokenizer.getElem(3)) || !isNumeric(_tokenizer.getElem(4))) {
return false;
}
int start = str2chrPos(_currLineElems[3]);
int end = str2chrPos(_currLineElems[4]);
int start = str2chrPos(_tokenizer.getElem(3));
int end = str2chrPos(_tokenizer.getElem(4));
if (end < start) {
return false;
}
......@@ -256,8 +260,8 @@ bool FileRecordTypeChecker::isGFFformat()
bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
{
//Break single string buffer into vector of QuickStrings. Delimiter is newline.
_lines.clear();
int numLines = Tokenize(buffer, _lines, '\n', len);
_tokenizer.setKeepFinalIncompleteElem(Tokenizer::IGNORE);
int numLines = _tokenizer.tokenize(buffer, '\n');
//anticipated delimiter characters are tab, comma, and semi-colon.
//If we need new ones, they must be added in this method.
......@@ -283,7 +287,7 @@ bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
if (validLinesFound >=4) {
break; //really only need to look at like 4 lines of data, max.
}
QuickString &line = _lines[i];
const QuickString &line = _tokenizer.getElem(i);
int len =line.size();
//skip over any empty line
if (len == 0) {
......
......@@ -18,6 +18,7 @@ using namespace std;
#include <vector>
#include <map>
#include "PushBackStreamBuf.h"
#include "Tokenizer.h"
class FileRecordTypeChecker {
public:
......@@ -87,8 +88,8 @@ private:
RECORD_TYPE _recordType;
QuickString _filename; //useful for reporting errors with file.
vector<QuickString> _lines;
vector<QuickString> _currLineElems;
Tokenizer _tokenizer;
int _firstValidDataLineIdx;
int _numBytesInBuffer; //this will hold the length of the buffer after the scan.
......
......@@ -9,8 +9,8 @@ INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp
OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o
SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp Tokenizer.h Tokenizer.cpp
OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o Tokenizer.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
all: $(BUILT_OBJECTS)
......@@ -23,6 +23,6 @@ $(BUILT_OBJECTS): $(SOURCES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o
@rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o $(OBJ_DIR)/Tokenizer.o
.PHONY: clean
......@@ -93,35 +93,7 @@ string vectorIntToStr(const vector<int> &vec) {
return str;
}
// TBD: Could be better optimized. I'm allocating 8KB for every call, then destroying it.
// That memory needs to stay in scope.
// Also, is this handling subsequent delimiters?
int Tokenize(const QuickString &str, vector<QuickString> &elems, char delimiter, int numExpectedItems) {
elems.reserve(numExpectedItems);
int strLen = (int)str.size();
int startPos = 0;
int currPos = 0;
char elemBuf[8192];
while (startPos < strLen) {
memset(elemBuf, 0, 8192);
while (str[currPos] != delimiter && currPos < strLen) {
currPos++;
}
if (currPos > startPos) {
memcpy(elemBuf, str.c_str() + startPos, min(currPos, strLen) - startPos);
elems.push_back(elemBuf);
}
startPos = currPos +1;
currPos = startPos;
}
return (int)elems.size();
}
bool isHeaderLine(QuickString &line) {
bool isHeaderLine(const QuickString &line) {
if (line[0] == '>') {
return true;
}
......@@ -143,4 +115,3 @@ bool isHeaderLine(QuickString &line) {
}
return false;
}
......@@ -86,7 +86,7 @@ void int2str(int number, T& buffer, bool appendToBuf = false)
}
bool isHeaderLine(QuickString &line);
bool isHeaderLine(const QuickString &line);
string vectorIntToStr(const vector<int> &vec);
......
/*
* Tokenizer.cpp
*
* Created on: Apr 15, 2014
* Author: nek3d
*/
#include "Tokenizer.h"
#include <cstring>
// Construct a Tokenizer with a small pre-allocated pool of element
// buffers. The pool grows on demand inside tokenize()/fetchElem(),
// so the initial size is only a starting point.
Tokenizer::Tokenizer()
: _numExpectedElems(0),
  _keepFinalIncElem(USE_NOW),
  _numValidElems(0)
{
	_elems.reserve(INITIAL_NUM_ELEMS);
	for (int slot = 0; slot < INITIAL_NUM_ELEMS; slot++) {
		_elems.push_back(new QuickString());
	}
}
// Free every pooled QuickString. resize(0) already performs the
// per-element deletes, so delegate to it instead of duplicating
// that loop here.
Tokenizer::~Tokenizer() {
	resize(0); //easy way to delete elems without repeating code.
}
// Pre-size the element pool when the caller knows how many fields to
// expect, avoiding incremental growth during tokenize().
// NOTE(review): resize() also shrinks — passing a value smaller than
// the current pool deletes pooled elements; confirm callers intend that.
void Tokenizer::setNumExpectedItems(int newSize) {
	_numExpectedElems = newSize;
	resize(newSize);
}
// Split str on delimiter into the reusable pool of QuickStrings.
// Runs of consecutive delimiters yield no elements (empties are
// skipped and not counted), matching the free Tokenize() helper this
// class replaces. If the input does not end with a delimiter, the
// trailing (possibly incomplete) element is kept or dropped per
// setKeepFinalIncompleteElem(). Returns the number of valid elements.
int Tokenizer::tokenize(const QuickString &str, char delimiter) {
	int strLen = (int)str.size();
	int startPos = 0;
	int currPos = 0;
	int currIdx = 0;
	while (startPos < strLen) {
		//check the bound BEFORE dereferencing so we never read one
		//past the end of the string's buffer.
		while (currPos < strLen && str[currPos] != delimiter) {
			currPos++;
		}
		if (currPos > startPos) {
			if (currPos == strLen && _keepFinalIncElem != USE_NOW) {
				//incomplete final element and we're ignoring
				//incomplete elems: do nothing with it.
			} else {
				QuickString *newStr = fetchElem(currIdx);
				newStr->assign(str.c_str() + startPos, min(currPos, strLen) - startPos);
				//only count slots that were actually assigned;
				//incrementing unconditionally (as before) counted
				//empty fields and left stale text in their slots.
				currIdx++;
			}
		}
		startPos = currPos + 1;
		currPos = startPos;
	}
	_numValidElems = currIdx;
	return currIdx;
}
// Choose whether tokenize() keeps (USE_NOW) or drops (IGNORE) a final
// element that is not terminated by the delimiter — i.e. when the
// buffer being parsed may end mid-element.
void Tokenizer::setKeepFinalIncompleteElem(lastElemCode code) {
	_keepFinalIncElem = code;
}
// Return the pooled QuickString for slot idx, growing the pool as
// needed. The returned object may still hold text from a previous
// tokenize() call; callers are expected to assign() over it.
QuickString *Tokenizer::fetchElem(int idx)
{
	if (idx >= (int)_elems.size()) {
		resize(idx +1);
	}
	return _elems[idx];
}
// Grow or shrink the pool of heap-allocated QuickStrings to exactly
// newSize entries: allocate fresh strings on growth, delete the
// excess on shrink, do nothing when the size is unchanged.
void Tokenizer::resize(int newSize) {
	const int oldSize = (int)_elems.size();
	if (newSize == oldSize) {
		return; //already the requested size.
	}
	if (newSize > oldSize) {
		//growing: make room, then fill the new slots.
		_elems.resize(newSize);
		for (int slot = oldSize; slot < newSize; slot++) {
			_elems[slot] = new QuickString();
		}
		return;
	}
	//shrinking: free the strings that fall off the end.
	for (int slot = newSize; slot < oldSize; slot++) {
		delete _elems[slot];
		_elems[slot] = NULL;
	}
	_elems.resize(newSize);
}
/*
* Tokenizer.h
*
* Created on: Apr 15, 2014
* Author: nek3d
*/
#ifndef TOKENIZER_H_
#define TOKENIZER_H_
using namespace std;
#include "QuickString.h"
#include <vector>
// Reusable string splitter. Owns a pool of heap-allocated QuickStrings
// that are recycled across calls to tokenize(), so repeated parsing
// does not reallocate per field (unlike the free Tokenize() helper it
// replaces).
class Tokenizer {
public:
	Tokenizer();
	~Tokenizer();

	// If you know the expected number of items, set this.
	// If not, don't worry about it.
	void setNumExpectedItems(int val);

	// Split str on delimiter; returns the number of valid elements.
	int tokenize(const QuickString &str, char delimiter = '\t');

	// If the final element ends before a delim char, that means
	// the buffer passed in ends mid-element. The last, incomplete
	// element found can either be:
	// 1) Used now. We want it whether it's complete or not.
	// 2) Ignored altogether.
	typedef enum { USE_NOW, IGNORE } lastElemCode;
	void setKeepFinalIncompleteElem(lastElemCode code);

	//final number of valid elems may be less than total number of elems,
	//because elems are not necessarily deleted between subsequent calls
	//to tokenizer.
	int getNumValidElems() const { return _numValidElems; }
	int getNumTotalElems() const { return (int)_elems.size(); }
	// NOTE(review): no bounds check — only indices below
	// getNumValidElems() are meaningful; higher slots may hold stale text.
	const QuickString &getElem(int i) const { return (*(_elems[i])); }

private:
	static const int DEFAULT_PARSE_BUFFER_SIZE = 4096; // 4KB
	static const int INITIAL_NUM_ELEMS = 10;
	vector<QuickString *> _elems; //owned; allocated/freed by resize().
	int _numExpectedElems;
	lastElemCode _keepFinalIncElem;
	int _numValidElems;
	QuickString *fetchElem(int idx);
	void resize(int newSize);
};
#endif /* TOKENIZER_H_ */
......@@ -691,10 +691,13 @@ rm obs exp
############################################################
echo " map.t46...\c"
echo \
"
*****
***** ERROR: There are 1 columns given, but there are 2 operations." > exp
$BT map -a ivls.bed -b values.bed -o count,sum 2>&1 > /dev/null | head -3 > obs
"chr1 0 100 3 30
chr1 100 200 1 1
chr2 0 100 0 .
chr2 100 200 0 .
chr3 0 100 3 6
chr3 100 200 1 4" > exp
$BT map -a ivls.bed -b values.bed -o count,sum > obs
check obs exp
rm obs exp
......
......@@ -18,7 +18,6 @@ check()
# chr1 45 100
###########################################################
# Test #1
# Test a basic merge; one interval should be un-merged,
# the other two should be merged.
###########################################################
......@@ -31,71 +30,49 @@ check obs exp
rm obs exp
###########################################################
#
# NOTE: Testing for sorted input is now deprecated, as the
# FileRecordMgr is already testing for that.
#
###########################################################
# Test #2
# Enforce coordinate sorted input.
###########################################################
#echo " merge.t2...\c"
#command -v tac 2>/dev/null || alias tac="sed '1!G;h;\$!d'"
#tac a.bed | $BT merge -i - 2> obs
#echo "ERROR: input file: (-) is not sorted by chrom then start.
# The start coordinate at line 3 is less than the start at line 2" > exp
#check obs exp
#rm obs exp
# Test that -n option is shown as deperecated
###########################################################
echo " merge.t2...\c"
echo "***** ERROR: -n option is deprecated. Please see the documentation for the -c and -o column operation options. *****" > exp
$BT merge -i a.bed -n 2>&1 > /dev/null | head -2 | tail -1 > obs
check obs exp
rm obs exp
###########################################################
# Test #3
# Test the counting of merged intervals. (-n)
###########################################################
echo " merge.t3...\c"
echo \
"chr1 10 20 1
chr1 30 100 3" > exp
$BT merge -i a.bed -n > obs
$BT merge -i a.bed -c 1 -o count > obs
check obs exp
rm obs exp
###########################################################
# Test #4
# Test the listing of names from merged intervals. (-nms)
# a.bed should fail, as there is no name field
# Test that -nms option is deprecated
###########################################################
echo " merge.t4...\c"
echo \
"*****
***** ERROR: Requested column 4, but database file a.bed only has fields 1 - 3." > exp
$BT merge -i a.bed -nms 2>&1 > /dev/null | head -3 | tail -2 > obs
echo "***** ERROR: -nms option is deprecated. Please see the documentation for the -c and -o column operation options. *****" > exp
$BT merge -i a.bed -nms 2>&1 > /dev/null | head -2 | tail -1 > obs
check obs exp
rm obs exp
###########################################################
# Test #5
# Test the listing of names from merged intervals. (-nms)
# a.named.bed should work, as there are name fields
#
# cat a.names.bed
# chr1 10 20 a1
# chr1 30 40 a2
# chr1 40 50 a3
# chr1 45 100 a4
# Test the listing of names from merged intervals.
###########################################################
echo " merge.t5...\c"
echo \
"chr1 10 20 a1
chr1 30 100 a2,a3,a4" > exp
$BT merge -i a.names.bed -nms > obs
$BT merge -i a.names.bed -c 4 -o collapse > obs
check obs exp
rm obs exp
###########################################################
# -nms and -scores sum
# collapsed list of the names, and sum of the scores
###########################################################
echo " merge.t6...\c"
echo \
......@@ -104,12 +81,12 @@ chr1 30 100 a2,a3,a4 9
chr2 10 20 a1 5
chr2 30 40 a2 6
chr2 42 100 a3,a4 15" > exp
$BT merge -i a.full.bed -nms -scores sum> obs
$BT merge -i a.full.bed -c 4,5 -o collapse,sum > obs
check obs exp
rm obs exp
###########################################################
# -n and -scores sum
# count intervals and sum of scores
###########################################################
echo " merge.t7...\c"
echo \
......@@ -118,12 +95,12 @@ chr1 30 100 3 9
chr2 10 20 1 5
chr2 30 40 1 6
chr2 42 100 2 15" > exp
$BT merge -i a.full.bed -n -scores sum> obs
$BT merge -i a.full.bed -c 5 -o count,sum> obs
check obs exp
rm obs exp
###########################################################
# -n, -nms, and -scores sum
# count, collapsed names, and sum of scores
###########################################################
echo " merge.t8...\c"
echo \
......@@ -132,12 +109,13 @@ chr1 30 100 a2,a3,a4 9 3
chr2 10 20 a1 5 1
chr2 30 40 a2 6 1
chr2 42 100 a3,a4 15 2" > exp
$BT merge -i a.full.bed -nms -scores sum -n> obs
$BT merge -i a.full.bed -c 4,5,4 -o collapse,sum,count > obs
check obs exp
rm obs exp
###########################################################
# -s, -n, -nms, and -scores sum
# stranded merge, show sign, collapsed names, sum of
# scores, and count
###########################################################
echo " merge.t9...\c"
echo \
......@@ -149,24 +127,17 @@ chr2 10 20 + a1 5 1
chr2 30 40 + a2 6 1
chr2 42 50 + a3 7 1
chr2 45 100 - a4 8 1" > exp
$BT merge -i a.full.bed -s -nms -scores sum -n> obs
$BT merge -i a.full.bed -s -c 6,4,5,6 -o distinct,collapse,sum,count > obs
check obs exp
rm obs exp
###########################################################
# Test #10
# Test the use of a custom delimiter for -nms
#
# cat a.names.bed
# chr1 10 20 a1
# chr1 30 40 a2
# chr1 40 50 a3
# chr1 45 100 a4
###########################################################
echo " merge.t10...\c"
echo \
"chr1 10 20 a1
chr1 30 100 a2|a3|a4" > exp
$BT merge -i a.names.bed -nms -delim "|" > obs
$BT merge -i a.names.bed -delim "|" -c 4 -o collapse > obs
check obs exp
rm obs exp
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment