Skip to content
Snippets Groups Projects
Commit 9eb4337c authored by nkindlon's avatar nkindlon
Browse files

Merge branch 'merge'

parents 8284c923 5eb3b68c
No related branches found
No related tags found
No related merge requests found
Showing
with 275 additions and 171 deletions
......@@ -48,29 +48,8 @@ void map_help(void) {
cerr << "Options: " << endl;
cerr << "\t-c\t" << "Specify columns from the B file to map onto intervals in A." << endl;
cerr << "\t\tDefault: 5." << endl;
cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl;
cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl;
cerr << "\t\tValid operations:" << endl;
cerr << "\t\t sum, min, max, absmin, absmax," << endl;
cerr << "\t\t mean, median," << endl;
cerr << "\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl;
cerr << "\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl;
cerr << "\t\t count" << endl;
cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl;
cerr << "\t\tDefault: sum" << endl;
cerr << "\t\tMultiple operations can be specified in a comma-delimited list." << endl << endl;
cerr << "\t\tIf there is only column, but multiple operations, all operations will be" << endl;
cerr << "\t\tapplied on that column. Likewise, if there is only one operation, but" << endl;
cerr << "multiple columns, that operation will be applied to all columns." << endl;
cerr << "\t\tOtherwise, the number of columns must match the the number of operations," << endl;
cerr << "and will be applied in respective order." << endl;
cerr << "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5," << endl;
cerr << "the mean of column 4, and the count of column 6." << endl;
cerr << "\t\tThe order of output columns will match the ordering given in the command." << endl << endl<<endl;
KeyListOpsHelp();
cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl;
cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl;
......
......@@ -56,29 +56,13 @@ void merge_help(void) {
cerr << "\t\tthat are the same strand." << endl;
cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl;
cerr << "\t-n\t" << "Report the number of BED entries that were merged." << endl;
cerr << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl;
cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl;
cerr << "\t\tto be merged." << endl;
cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-nms\t" << "Report the names of the merged features separated by commas." << endl;
cerr << "\t\tChange delim. with -delim." << endl << endl;
cerr << "\t-scores\t" << "Report the scores of the merged features. Specify one of " << endl;
cerr << "\t\tthe following options for reporting scores:" << endl;
cerr << "\t\t sum, min, max," << endl;
cerr << "\t\t mean, median, mode, antimode," << endl;
cerr << "\t\t collapse (i.e., print a semicolon-separated list)," << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-delim\t" << "Specify a custom delimiter for the -nms and -scores concat options" << endl;
cerr << "\t\t- Example: -delim \"|\"" << endl;
cerr << "\t\t- Default: \",\"." << endl << endl;
KeyListOpsHelp();
cerr << "Notes: " << endl;
cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl;
......
......@@ -52,6 +52,7 @@ ContextBase::ContextBase()
_forwardOnly(false),
_reverseOnly(false),
_hasColumnOpsMethods(false),
_keyListOps(NULL),
_desiredStrand(FileRecordMergeMgr::ANY_STRAND),
_maxDistance(0),
_useMergedIntervals(false)
......@@ -459,10 +460,11 @@ bool ContextBase::handle_delim()
void ContextBase::setColumnOpsMethods(bool val)
{
_hasColumnOpsMethods = val;
if (val) {
if (val && !_hasColumnOpsMethods) {
//was off, but we're turning it on.
_keyListOps = new KeyListOps();
}
_hasColumnOpsMethods = val;
}
const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const {
......
......@@ -103,7 +103,7 @@ bool ContextMerge::isValidState()
}
//column operations not allowed with BAM input
if ((!_keyListOps->getColumns().empty() || !_keyListOps->getOperations().empty()) &&
if (hasColumnOpsMethods() &&
getFile(0)->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
_errorMsg = "\n***** ERROR: column operations not supported for BAM input. *****";
return false;
......
......@@ -104,7 +104,11 @@ void SingleLineDelimTextFileReader::appendField(int fieldNum, QuickString &str)
bool SingleLineDelimTextFileReader::detectAndHandleHeader()
{
if (!isHeaderLine(_sLine)) {
//not sure why the linker is giving me a hard time about
//passing a non-const QuickString to isHeaderLine, but
//this const ref is a workaround.
const QuickString &sLine2 = _sLine;
if (!isHeaderLine(sLine2)) {
return false;
}
if (!_fullHeaderFound) {
......
......@@ -60,11 +60,8 @@ void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete)
return;
}
vector<QuickString> sizes;
vector<QuickString> starts;
int sizeCount = Tokenize(keyRecord->getBlockSizes(), sizes, ',', blockCount);
int startCount = Tokenize(keyRecord->getBlockStarts(), starts, ',', blockCount);
int sizeCount = _blockSizeTokens.tokenize(keyRecord->getBlockSizes(), ',');
int startCount = _blockStartTokens.tokenize(keyRecord->getBlockStarts(), ',');
if (blockCount != sizeCount || sizeCount != startCount) {
fprintf(stderr, "Error: found wrong block counts while splitting entry.\n");
......@@ -72,8 +69,8 @@ void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete)
}
for (int i=0; i < blockCount; i++) {
int startPos = keyRecord->getStartPos() + str2chrPos(starts[i].c_str());
int endPos = startPos + str2chrPos(sizes[i].c_str());
int startPos = keyRecord->getStartPos() + str2chrPos(_blockStartTokens.getElem(i).c_str());
int endPos = startPos + str2chrPos(_blockSizeTokens.getElem(i).c_str());
const Record *record = allocateAndAssignRecord(keyRecord, startPos, endPos);
keyList.push_back(record);
......
......@@ -16,6 +16,7 @@ using namespace std;
#include "FileRecordTypeChecker.h"
#include "RecordKeyList.h"
class RecordMgr;
class BlockMgr {
......@@ -50,6 +51,8 @@ private:
float _overlapFraction;
bool _hasReciprocal;
Tokenizer _blockSizeTokens;
Tokenizer _blockStartTokens;
// For now, all records will be split into Bed6 records.
const static FileRecordTypeChecker::RECORD_TYPE _blockRecordsType = FileRecordTypeChecker::BED6_RECORD_TYPE;
......
......@@ -11,6 +11,7 @@
******************************************************************************/
#include "NewGenomeFile.h"
#include "ParseTools.h"
#include "Tokenizer.h"
NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename)
: _maxId(-1)
......@@ -44,21 +45,20 @@ void NewGenomeFile::loadGenomeFileIntoMap() {
exit(1);
}
string sLine;
vector<QuickString> fields;
Tokenizer fieldTokens;
CHRPOS chrSize = 0;
QuickString chrName;
while (!genFile.eof()) {
sLine.clear();
fields.clear();
chrSize = 0;
chrName.clear();
getline(genFile, sLine);
Tokenize(sLine.c_str(), fields);
if (fields.size() != 2) {
int numFields = fieldTokens.tokenize(sLine.c_str());
if (numFields != 2) {
continue;
}
chrName = fields[0];
chrSize = str2chrPos(fields[1]);
chrName = fieldTokens.getElem(0);
chrSize = str2chrPos(fieldTokens.getElem(1));
_maxId++;
_chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId);
_startOffsets.push_back(_genomeLength);
......
......@@ -98,25 +98,28 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {
//member of each pair is a column number, and the second member is the code for the
//operation to perform on that column.
vector<QuickString> columnsVec;
vector<QuickString> opsVec;
int numCols = Tokenize(_columns, columnsVec, ',');
int numOps = Tokenize(_operations, opsVec, ',');
Tokenizer colTokens;
Tokenizer opsTokens;
int numCols = colTokens.tokenize(_columns, ',');
int numOps = opsTokens.tokenize(_operations, ',');
if (numOps < 1 || numCols < 1) {
cerr << endl << "*****" << endl
<< "***** ERROR: There must be at least one column and at least one operation named." << endl;
return false;
}
if (numOps > 1 && numCols != numOps) {
if (numOps > 1 && numCols > 1 && numCols != numOps) {
cerr << endl << "*****" << endl
<< "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl;
cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl;
cerr << "\ta single column to which all operations will be applied," << endl;
cerr << "\tor an operation for each column." << endl;
return false;
}
for (int i=0; i < (int)columnsVec.size(); i++) {
int col = str2chrPos(columnsVec[i]);
int loop = max(numCols, numOps);
for (int i=0; i < loop; i++) {
int col = str2chrPos(colTokens.getElem(numCols > 1 ? i : 0));
//check that the column number is valid
if (col < 1 || col > dbFile->getNumFields()) {
......@@ -124,7 +127,7 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {
<< dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl;
return false;
}
const QuickString &operation = opsVec.size() > 1 ? opsVec[i] : opsVec[0];
const QuickString &operation = opsTokens.getElem(numOps > 1 ? i : 0);
OP_TYPES opCode = getOpCode(operation);
if (opCode == INVALID) {
cerr << endl << "*****" << endl
......@@ -361,4 +364,33 @@ const QuickString & KeyListOps::getOpVals(RecordKeyList &hits)
return _outVals;
}
// Print the shared usage text for the -c / -o / -delim column-operation
// options. Shared by every tool (map, merge, ...) that supports column
// ops so the help stays consistent in one place.
void KeyListOpsHelp() {
	cerr << "\t-c\t" << "Specify columns from the B file to map onto intervals in A." << endl;
	cerr << "\t\tDefault: 5." << endl;
	cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl;
	cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl;
	cerr << "\t\tValid operations:" << endl;
	cerr << "\t\t sum, min, max, absmin, absmax," << endl;
	cerr << "\t\t mean, median," << endl;
	cerr << "\t\t collapse (i.e., print a delimited list (duplicates allowed)), " << endl;
	cerr << "\t\t distinct (i.e., print a delimited list (NO duplicates allowed)), " << endl;
	cerr << "\t\t count" << endl;
	cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl;
	cerr << "\t\tDefault: sum" << endl;
	cerr << "\t\tMultiple operations can be specified in a comma-delimited list." << endl << endl;
	// "only one column" and "the number" fix typos in the previous wording;
	// the "\t\t" prefixes keep continuation lines aligned with the rest of
	// the indented help output.
	cerr << "\t\tIf there is only one column, but multiple operations, all operations will be" << endl;
	cerr << "\t\tapplied on that column. Likewise, if there is only one operation, but" << endl;
	cerr << "\t\tmultiple columns, that operation will be applied to all columns." << endl;
	cerr << "\t\tOtherwise, the number of columns must match the number of operations," << endl;
	cerr << "\t\tand will be applied in respective order." << endl;
	cerr << "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5," << endl;
	cerr << "\t\tthe mean of column 4, and the count of column 6." << endl;
	cerr << "\t\tThe order of output columns will match the ordering given in the command." << endl << endl;
	cerr << "\t-delim\t" << "Specify a custom delimiter for the collapse operations." << endl;
	cerr << "\t\t- Example: -delim \"|\"" << endl;
	cerr << "\t\t- Default: \",\"." << endl << endl;
}
......@@ -12,6 +12,9 @@
class FileRecordMgr;
//print help message
void KeyListOpsHelp();
class KeyListOps {
public:
......
......@@ -13,7 +13,6 @@ FileRecordTypeChecker::FileRecordTypeChecker()
_isBed = false;
_isDelimited = false;
_delimChar = '\t'; //tab by default
_lines.clear();
_firstValidDataLineIdx = -1;
_isVCF = false;
_isBAM = false;
......@@ -161,9 +160,14 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len)
_fileType = SINGLE_LINE_DELIM_TEXT_FILE_TYPE;
//Tokenize the first line of valid data into fields.
const QuickString &line = _lines[_firstValidDataLineIdx];
_currLineElems.clear();
if (Tokenize(line, _currLineElems, _delimChar, _numFields) != _numFields) {
//Need to make a copy so next call to tokenizer doesn't overwrite the line.
QuickString line(_tokenizer.getElem(_firstValidDataLineIdx));
_tokenizer.setKeepFinalIncompleteElem(Tokenizer::USE_NOW);
_tokenizer.setNumExpectedItems(_numFields);
if (_tokenizer.tokenize(line, _delimChar) != _numFields) {
cerr << "Error: Type checker found wrong number of fields while tokenizing data line." << endl;
exit(1);
}
......@@ -173,7 +177,7 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len)
if (_numFields == 3) {
_recordType = BED3_RECORD_TYPE;
} else if (_numFields == 4) {
if (isNumeric(_currLineElems[3])) {
if (isNumeric(_tokenizer.getElem(3))) {
_recordType = BEDGRAPH_RECORD_TYPE;
_fourthFieldNumeric = true;
} else {
......@@ -223,12 +227,12 @@ bool FileRecordTypeChecker::isBedFormat() {
return false;
}
//the 2nd and 3rd fields must be numeric.
if (!isNumeric(_currLineElems[1]) || !isNumeric(_currLineElems[2])) {
if (!isNumeric(_tokenizer.getElem(1)) || !isNumeric(_tokenizer.getElem(2))) {
return false;
}
int start = str2chrPos(_currLineElems[1]);
int end = str2chrPos(_currLineElems[2]);
int start = str2chrPos(_tokenizer.getElem(1));
int end = str2chrPos(_tokenizer.getElem(2));
if (end < start) {
return false;
}
......@@ -242,11 +246,11 @@ bool FileRecordTypeChecker::isGFFformat()
return false;
}
//the 4th and 5th fields must be numeric.
if (!isNumeric(_currLineElems[3]) || !isNumeric(_currLineElems[4])) {
if (!isNumeric(_tokenizer.getElem(3)) || !isNumeric(_tokenizer.getElem(4))) {
return false;
}
int start = str2chrPos(_currLineElems[3]);
int end = str2chrPos(_currLineElems[4]);
int start = str2chrPos(_tokenizer.getElem(3));
int end = str2chrPos(_tokenizer.getElem(4));
if (end < start) {
return false;
}
......@@ -256,8 +260,8 @@ bool FileRecordTypeChecker::isGFFformat()
bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
{
//Break single string buffer into vector of QuickStrings. Delimiter is newline.
_lines.clear();
int numLines = Tokenize(buffer, _lines, '\n', len);
_tokenizer.setKeepFinalIncompleteElem(Tokenizer::IGNORE);
int numLines = _tokenizer.tokenize(buffer, '\n');
//anticipated delimiter characters are tab, comma, and semi-colon.
//If we need new ones, they must be added in this method.
......@@ -283,7 +287,7 @@ bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
if (validLinesFound >=4) {
break; //really only need to look at like 4 lines of data, max.
}
QuickString &line = _lines[i];
const QuickString &line = _tokenizer.getElem(i);
int len =line.size();
//skip over any empty line
if (len == 0) {
......
......@@ -18,6 +18,7 @@ using namespace std;
#include <vector>
#include <map>
#include "PushBackStreamBuf.h"
#include "Tokenizer.h"
class FileRecordTypeChecker {
public:
......@@ -87,8 +88,8 @@ private:
RECORD_TYPE _recordType;
QuickString _filename; //useful for reporting errors with file.
vector<QuickString> _lines;
vector<QuickString> _currLineElems;
Tokenizer _tokenizer;
int _firstValidDataLineIdx;
int _numBytesInBuffer; //this will hold the length of the buffer after the scan.
......
......@@ -9,8 +9,8 @@ INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp
OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o
SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp Tokenizer.h Tokenizer.cpp
OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o Tokenizer.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
all: $(BUILT_OBJECTS)
......@@ -23,6 +23,6 @@ $(BUILT_OBJECTS): $(SOURCES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o
@rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o $(OBJ_DIR)/Tokenizer.o
.PHONY: clean
......@@ -93,35 +93,7 @@ string vectorIntToStr(const vector<int> &vec) {
return str;
}
// TBD: Could be better optimized. I'm allocating 8KB for every call, then destroying it.
// That memory needs to stay in scope.
// Also, is this handling subsequent delimiters?
int Tokenize(const QuickString &str, vector<QuickString> &elems, char delimiter, int numExpectedItems) {
elems.reserve(numExpectedItems);
int strLen = (int)str.size();
int startPos = 0;
int currPos = 0;
char elemBuf[8192];
while (startPos < strLen) {
memset(elemBuf, 0, 8192);
while (str[currPos] != delimiter && currPos < strLen) {
currPos++;
}
if (currPos > startPos) {
memcpy(elemBuf, str.c_str() + startPos, min(currPos, strLen) - startPos);
elems.push_back(elemBuf);
}
startPos = currPos +1;
currPos = startPos;
}
return (int)elems.size();
}
bool isHeaderLine(QuickString &line) {
bool isHeaderLine(const QuickString &line) {
if (line[0] == '>') {
return true;
}
......@@ -143,4 +115,3 @@ bool isHeaderLine(QuickString &line) {
}
return false;
}
......@@ -86,7 +86,7 @@ void int2str(int number, T& buffer, bool appendToBuf = false)
}
bool isHeaderLine(QuickString &line);
bool isHeaderLine(const QuickString &line);
string vectorIntToStr(const vector<int> &vec);
......
/*
* Tokenizer.cpp
*
* Created on: Apr 15, 2014
* Author: nek3d
*/
#include "Tokenizer.h"
#include <cstring>
// Construct a Tokenizer with a small pre-allocated pool of element
// buffers. The pool grows on demand inside tokenize()/fetchElem(),
// so the initial size is only a starting point.
Tokenizer::Tokenizer()
: _numExpectedElems(0),
  _keepFinalIncElem(USE_NOW),
  _numValidElems(0)
{
	_elems.reserve(INITIAL_NUM_ELEMS);
	for (int slot = 0; slot < INITIAL_NUM_ELEMS; slot++) {
		_elems.push_back(new QuickString());
	}
}
// Free every pooled QuickString. resize(0) already performs the
// per-element deletes, so delegate to it instead of duplicating
// that loop here.
Tokenizer::~Tokenizer() {
	resize(0); //easy way to delete elems without repeating code.
}
// Pre-size the element pool when the caller knows how many fields to
// expect, avoiding incremental growth during tokenize().
// NOTE(review): resize() also shrinks — passing a value smaller than
// the current pool deletes pooled elements; confirm callers intend that.
void Tokenizer::setNumExpectedItems(int newSize) {
	_numExpectedElems = newSize;
	resize(newSize);
}
// Split str on delimiter into the reusable pool of QuickStrings.
// Runs of consecutive delimiters yield no elements (empties are
// skipped and not counted), matching the free Tokenize() helper this
// class replaces. If the input does not end with a delimiter, the
// trailing (possibly incomplete) element is kept or dropped per
// setKeepFinalIncompleteElem(). Returns the number of valid elements.
int Tokenizer::tokenize(const QuickString &str, char delimiter) {
	int strLen = (int)str.size();
	int startPos = 0;
	int currPos = 0;
	int currIdx = 0;
	while (startPos < strLen) {
		//check the bound BEFORE dereferencing so we never read one
		//past the end of the string's buffer.
		while (currPos < strLen && str[currPos] != delimiter) {
			currPos++;
		}
		if (currPos > startPos) {
			if (currPos == strLen && _keepFinalIncElem != USE_NOW) {
				//incomplete final element and we're ignoring
				//incomplete elems: do nothing with it.
			} else {
				QuickString *newStr = fetchElem(currIdx);
				newStr->assign(str.c_str() + startPos, min(currPos, strLen) - startPos);
				//only count slots that were actually assigned;
				//incrementing unconditionally (as before) counted
				//empty fields and left stale text in their slots.
				currIdx++;
			}
		}
		startPos = currPos + 1;
		currPos = startPos;
	}
	_numValidElems = currIdx;
	return currIdx;
}
// Choose whether tokenize() keeps (USE_NOW) or drops (IGNORE) a final
// element that is not terminated by the delimiter — i.e. when the
// buffer being parsed may end mid-element.
void Tokenizer::setKeepFinalIncompleteElem(lastElemCode code) {
	_keepFinalIncElem = code;
}
// Return the pooled QuickString for slot idx, growing the pool as
// needed. The returned object may still hold text from a previous
// tokenize() call; callers are expected to assign() over it.
QuickString *Tokenizer::fetchElem(int idx)
{
	if (idx >= (int)_elems.size()) {
		resize(idx +1);
	}
	return _elems[idx];
}
// Grow or shrink the pool of heap-allocated QuickStrings to exactly
// newSize entries: allocate fresh strings on growth, delete the
// excess on shrink, do nothing when the size is unchanged.
void Tokenizer::resize(int newSize) {
	const int oldSize = (int)_elems.size();
	if (newSize == oldSize) {
		return; //already the requested size.
	}
	if (newSize > oldSize) {
		//growing: make room, then fill the new slots.
		_elems.resize(newSize);
		for (int slot = oldSize; slot < newSize; slot++) {
			_elems[slot] = new QuickString();
		}
		return;
	}
	//shrinking: free the strings that fall off the end.
	for (int slot = newSize; slot < oldSize; slot++) {
		delete _elems[slot];
		_elems[slot] = NULL;
	}
	_elems.resize(newSize);
}
/*
* Tokenizer.h
*
* Created on: Apr 15, 2014
* Author: nek3d
*/
#ifndef TOKENIZER_H_
#define TOKENIZER_H_
using namespace std;
#include "QuickString.h"
#include <vector>
// Reusable string splitter. Owns a pool of heap-allocated QuickStrings
// that are recycled across calls to tokenize(), so repeated parsing
// does not reallocate per field (unlike the free Tokenize() helper it
// replaces).
class Tokenizer {
public:
	Tokenizer();
	~Tokenizer();

	// If you know the expected number of items, set this.
	// If not, don't worry about it.
	void setNumExpectedItems(int val);

	// Split str on delimiter; returns the number of valid elements.
	int tokenize(const QuickString &str, char delimiter = '\t');

	// If the final element ends before a delim char, that means
	// the buffer passed in ends mid-element. The last, incomplete
	// element found can either be:
	// 1) Used now. We want it whether it's complete or not.
	// 2) Ignored altogether.
	typedef enum { USE_NOW, IGNORE } lastElemCode;
	void setKeepFinalIncompleteElem(lastElemCode code);

	//final number of valid elems may be less than total number of elems,
	//because elems are not necessarily deleted between subsequent calls
	//to tokenizer.
	int getNumValidElems() const { return _numValidElems; }
	int getNumTotalElems() const { return (int)_elems.size(); }
	// NOTE(review): no bounds check — only indices below
	// getNumValidElems() are meaningful; higher slots may hold stale text.
	const QuickString &getElem(int i) const { return (*(_elems[i])); }

private:
	static const int DEFAULT_PARSE_BUFFER_SIZE = 4096; // 4KB
	static const int INITIAL_NUM_ELEMS = 10;
	vector<QuickString *> _elems; //owned; allocated/freed by resize().
	int _numExpectedElems;
	lastElemCode _keepFinalIncElem;
	int _numValidElems;
	QuickString *fetchElem(int idx);
	void resize(int newSize);
};
#endif /* TOKENIZER_H_ */
......@@ -691,10 +691,13 @@ rm obs exp
############################################################
echo " map.t46...\c"
echo \
"
*****
***** ERROR: There are 1 columns given, but there are 2 operations." > exp
$BT map -a ivls.bed -b values.bed -o count,sum 2>&1 > /dev/null | head -3 > obs
"chr1 0 100 3 30
chr1 100 200 1 1
chr2 0 100 0 .
chr2 100 200 0 .
chr3 0 100 3 6
chr3 100 200 1 4" > exp
$BT map -a ivls.bed -b values.bed -o count,sum > obs
check obs exp
rm obs exp
......
......@@ -18,7 +18,6 @@ check()
# chr1 45 100
###########################################################
# Test #1
# Test a basic merge; one interval should be un-merged,
# the other two should be merged.
###########################################################
......@@ -31,71 +30,49 @@ check obs exp
rm obs exp
###########################################################
#
# NOTE: Testing for sorted input is now deprecated, as the
# FileRecordMgr is already testing for that.
#
###########################################################
# Test #2
# Enforce coordinate sorted input.
###########################################################
#echo " merge.t2...\c"
#command -v tac 2>/dev/null || alias tac="sed '1!G;h;\$!d'"
#tac a.bed | $BT merge -i - 2> obs
#echo "ERROR: input file: (-) is not sorted by chrom then start.
# The start coordinate at line 3 is less than the start at line 2" > exp
#check obs exp
#rm obs exp
# Test that -n option is shown as deperecated
###########################################################
echo " merge.t2...\c"
echo "***** ERROR: -n option is deprecated. Please see the documentation for the -c and -o column operation options. *****" > exp
$BT merge -i a.bed -n 2>&1 > /dev/null | head -2 | tail -1 > obs
check obs exp
rm obs exp
###########################################################
# Test #3
# Test the counting of merged intervals. (-n)
###########################################################
echo " merge.t3...\c"
echo \
"chr1 10 20 1
chr1 30 100 3" > exp
$BT merge -i a.bed -n > obs
$BT merge -i a.bed -c 1 -o count > obs
check obs exp
rm obs exp
###########################################################
# Test #4
# Test the listing of names from merged intervals. (-nms)
# a.bed should fail, as there is no name field
# Test that -nms option is deprecated
###########################################################
echo " merge.t4...\c"
echo \
"*****
***** ERROR: Requested column 4, but database file a.bed only has fields 1 - 3." > exp
$BT merge -i a.bed -nms 2>&1 > /dev/null | head -3 | tail -2 > obs
echo "***** ERROR: -nms option is deprecated. Please see the documentation for the -c and -o column operation options. *****" > exp
$BT merge -i a.bed -nms 2>&1 > /dev/null | head -2 | tail -1 > obs
check obs exp
rm obs exp
###########################################################
# Test #5
# Test the listing of names from merged intervals. (-nms)
# a.named.bed should work, as there are name fields
#
# cat a.names.bed
# chr1 10 20 a1
# chr1 30 40 a2
# chr1 40 50 a3
# chr1 45 100 a4
# Test the listing of names from merged intervals.
###########################################################
echo " merge.t5...\c"
echo \
"chr1 10 20 a1
chr1 30 100 a2,a3,a4" > exp
$BT merge -i a.names.bed -nms > obs
$BT merge -i a.names.bed -c 4 -o collapse > obs
check obs exp
rm obs exp
###########################################################
# -nms and -scores sum
# collapsed list of the names, and sum of the scores
###########################################################
echo " merge.t6...\c"
echo \
......@@ -104,12 +81,12 @@ chr1 30 100 a2,a3,a4 9
chr2 10 20 a1 5
chr2 30 40 a2 6
chr2 42 100 a3,a4 15" > exp
$BT merge -i a.full.bed -nms -scores sum> obs
$BT merge -i a.full.bed -c 4,5 -o collapse,sum > obs
check obs exp
rm obs exp
###########################################################
# -n and -scores sum
# count intervals and sum of scores
###########################################################
echo " merge.t7...\c"
echo \
......@@ -118,12 +95,12 @@ chr1 30 100 3 9
chr2 10 20 1 5
chr2 30 40 1 6
chr2 42 100 2 15" > exp
$BT merge -i a.full.bed -n -scores sum> obs
$BT merge -i a.full.bed -c 5 -o count,sum> obs
check obs exp
rm obs exp
###########################################################
# -n, -nms, and -scores sum
# count, collapsed names, and sum of scores
###########################################################
echo " merge.t8...\c"
echo \
......@@ -132,12 +109,13 @@ chr1 30 100 a2,a3,a4 9 3
chr2 10 20 a1 5 1
chr2 30 40 a2 6 1
chr2 42 100 a3,a4 15 2" > exp
$BT merge -i a.full.bed -nms -scores sum -n> obs
$BT merge -i a.full.bed -c 4,5,4 -o collapse,sum,count > obs
check obs exp
rm obs exp
###########################################################
# -s, -n, -nms, and -scores sum
# stranded merge, show sign, collapsed names, sum of
# scores, and count
###########################################################
echo " merge.t9...\c"
echo \
......@@ -149,24 +127,17 @@ chr2 10 20 + a1 5 1
chr2 30 40 + a2 6 1
chr2 42 50 + a3 7 1
chr2 45 100 - a4 8 1" > exp
$BT merge -i a.full.bed -s -nms -scores sum -n> obs
$BT merge -i a.full.bed -s -c 6,4,5,6 -o distinct,collapse,sum,count > obs
check obs exp
rm obs exp
###########################################################
# Test #10
# Test the use of a custom delimiter for -nms
#
# cat a.names.bed
# chr1 10 20 a1
# chr1 30 40 a2
# chr1 40 50 a3
# chr1 45 100 a4
###########################################################
echo " merge.t10...\c"
echo \
"chr1 10 20 a1
chr1 30 100 a2|a3|a4" > exp
$BT merge -i a.names.bed -nms -delim "|" > obs
$BT merge -i a.names.bed -delim "|" -c 4 -o collapse > obs
check obs exp
rm obs exp
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment