diff --git a/src/nekSandbox1/nekSandboxMain.cpp b/src/nekSandbox1/nekSandboxMain.cpp index be5aa61b847f2d105a50a2797800f410c72f11ca..bfb86b4a2ee12a642e61b6ea1fafc271bc8a3d1e 100644 --- a/src/nekSandbox1/nekSandboxMain.cpp +++ b/src/nekSandbox1/nekSandboxMain.cpp @@ -31,6 +31,11 @@ int nek_sandbox1_main2(int argc,char** argv); int nek_sandbox1_main(int argc,char** argv) { +// for (int i=0; i < 4000; i++) { +// cout << "# This is line " << i << " of a file with a large header." << endl; +// } +// return 0; + if (argc < 2) { cerr << "Error: Need one input file. Use \"-\" for stdin." << endl; } @@ -165,7 +170,7 @@ int nek_sandbox1_main(int argc,char** argv) frm.deleteRecord(record); } - cout << "Final header is: " << frm.getHeader() << endl; +// cout << "Final header is: " << frm.getHeader() << endl; frm.close(); return 0; diff --git a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp index 6a4da6f73b82aa30c924e9469ba1fb424a1e9750..733fa218b431ea5448d970361d492ee5867d49a6 100644 --- a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp +++ b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp @@ -57,26 +57,30 @@ bool BufferedStreamMgr::init() _useBufSize = MAIN_BUF_READ_SIZE; } + size_t trueBufSize = max(_useBufSize, (int)_currScanBuffer.size()); + _useBufSize = trueBufSize; _mainBuf = new bufType[_useBufSize +1]; memset(_mainBuf, 0, _useBufSize +1); + memcpy(_mainBuf, _currScanBuffer.c_str(), _currScanBuffer.size()); + _mainBufCurrLen = _currScanBuffer.size(); return true; } bool BufferedStreamMgr::getTypeData() { - QuickString currScanBuffer; - _inputStreamMgr->getSavedData(currScanBuffer); + _currScanBuffer = _inputStreamMgr->getSavedData(); do { - if (!_typeChecker.scanBuffer(currScanBuffer.c_str(), currScanBuffer.size()) && !_typeChecker.needsMoreData()) { + if (!_typeChecker.scanBuffer(_currScanBuffer.c_str(), _currScanBuffer.size()) && !_typeChecker.needsMoreData()) { return 
false; } else if (_typeChecker.needsMoreData()) { _inputStreamMgr->populateScanBuffer(); - currScanBuffer.clear(); - _inputStreamMgr->getSavedData(currScanBuffer); + _currScanBuffer.append(_inputStreamMgr->getSavedData()); } } while (_typeChecker.needsMoreData()); - _inputStreamMgr->reset(); + if (_inputStreamMgr->resetStream()) { + _currScanBuffer.clear(); + } return true; } diff --git a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h index 6bdc0640cc113efcb88e664851c3ef7d9d249177..424766c5b3864c9f1e60a193a953610196ddc84b 100644 --- a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h +++ b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h @@ -42,6 +42,7 @@ private: bool _eof; int _useBufSize; bool _streamFinished; + QuickString _currScanBuffer; //The minus ones in these constants are for leaving room for a null terminator after reading into buffers. static const int MAIN_BUF_READ_SIZE = 67108863; //64 Mb minus 1 static const int TYPE_CHECK_READ_SIZE = 4095; // 4K diff --git a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp index f7d466cadee7f0b20f1168081971edc01a892890..073e48580a26a704d44cdcd5e506edc96fa4b1ba 100644 --- a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp +++ b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp @@ -23,7 +23,9 @@ InputStreamMgr::InputStreamMgr(const QuickString &filename, bool buildScanBuffer _isGzipped(false), _isBam(false), _isBgzipped(false), + _tmpZipBuf(NULL), _bamRuledOut(false), + _streamFinished(false), _numBytesInBuffer(0), _bamReader(NULL), _bgStream(NULL) @@ -61,6 +63,10 @@ InputStreamMgr::~InputStreamMgr() { delete _finalInputStream; _finalInputStream = NULL; } + if (_tmpZipBuf != NULL) { + delete [] _tmpZipBuf; + _tmpZipBuf = NULL; + } } bool InputStreamMgr::init() @@ -92,26 +98,54 @@ bool InputStreamMgr::init() //now we have a PushBackStreamBuf. 
Make a new stream. _finalInputStream = new istream(_pushBackStreamBuf); populateScanBuffer(); +// resetStream(); return true; } int InputStreamMgr::read(char *data, size_t dataSize) { + size_t origRead = 0; + if (!_saveDataStr.empty()) { + //must first copy contents of savedData into requested data read buffer. + if (dataSize >= _saveDataStr.size()) { + //They asked for the same amount of data or more than we saved. Give them all the saved data, + //then decrement the requested data size accordingly. + origRead = _saveDataStr.size(); + memcpy(data, _saveDataStr.c_str(), origRead); + data += origRead; + dataSize -= origRead; + _saveDataStr.clear(); + } else { + //This part is tricky. They want less data than we saved. Give them what they + //requested, then delete from the front of the saveDataStr by using its substr method. + memcpy(data, _saveDataStr.c_str(), dataSize); + QuickString newDataStr; + _saveDataStr.substr(newDataStr, dataSize, _saveDataStr.size() - dataSize); + _saveDataStr = newDataStr; + return dataSize; + } + } + if (_streamFinished) { + return origRead; + } if (_isBgzipped) { - return (int)(_bgStream->Read(data, dataSize)); + return (int)(origRead + _bgStream->Read(data, dataSize)); } _finalInputStream->read(data, dataSize); - return _finalInputStream->gcount(); + return origRead + _finalInputStream->gcount(); } void InputStreamMgr::populateScanBuffer() { _scanBuffer.clear(); + _saveDataStr.clear(); int numChars=0; int currChar = 0; - bool mustAppend = true; while (1) { - mustAppend = true; + if (_isGzipped && _bamRuledOut) { + readZipChunk(); + return; + } currChar = _pushBackStreamBuf->sbumpc(); //Stop when EOF hit.
if (currChar == EOF) { @@ -120,35 +154,28 @@ void InputStreamMgr::populateScanBuffer() numChars++; _scanBuffer.push_back(currChar); if (_isGzipped) { - if (!_bamRuledOut && detectBamOrBgzip(numChars, currChar, mustAppend)) { + if (!_bamRuledOut && detectBamOrBgzip(numChars, currChar)) { return; } if (numChars == 0) { continue; //this will only happen when we've just discovered that this //is definitely not BAM, and want to start over. } - if (mustAppend) { - _compressedSaveData.push_back(currChar); - } } - //For non-gzip, also stop if we have the minimum number of bytes and newline is hit. + //Stop if we have the minimum number of bytes and newline is hit. //For gzip, stop at SCAN_BUFFER_SIZE. - if ((!_isGzipped && (currChar == '\n' && numChars >= MIN_SCAN_BUFFER_SIZE )) || (_isGzipped && numChars >= SCAN_BUFFER_SIZE)) { + if (currChar == '\n' && numChars >= MIN_SCAN_BUFFER_SIZE ){ break; } } _numBytesInBuffer = _scanBuffer.size(); - //append it to the savedDataStr. If it's gzipped, decompress it first. - if (_isGzipped) { - decompressBuffer(); - } else { - _scanBuffer.toStr(_saveDataStr, true); - } + //append it to the savedDataStr. + _scanBuffer.toStr(_saveDataStr, true); } -bool InputStreamMgr::detectBamOrBgzip(int &numChars, int currChar, bool &mustAppend) +bool InputStreamMgr::detectBamOrBgzip(int &numChars, int currChar) { //Look for the BAM magic string "BAM\1" in the first fouur characters of the input stream. //In compressed form, the first char is the gzip signifier, which was already found. @@ -181,14 +208,12 @@ bool InputStreamMgr::detectBamOrBgzip(int &numChars, int currChar, bool &mustApp QuickString bamHeader(_bamReader->GetHeaderText()); if (bamHeader.empty()) { - //This is NOT a bam file. + //This is NOT a bam file, but it is bgzipped. _pushBackStreamBuf->clear(); - _compressedSaveData.clear(); //Put all bytes read so far back onto the scan buffer, then reset //everything so that we're effectively starting over. 
_pushBackStreamBuf->pushBack(_scanBuffer); _scanBuffer.clear(); - mustAppend = false; numChars = 0; _isBam = false; _isBgzipped = true; @@ -196,69 +221,81 @@ bool InputStreamMgr::detectBamOrBgzip(int &numChars, int currChar, bool &mustApp _numBytesInBuffer = 0; delete _bamReader; _bamReader = NULL; + + //Alter the finalInputStream to become a bgzfReader. + _bgStream = new BamTools::Internal::BgzfStream(); + _bgStream->OpenStream(_finalInputStream, BamTools::IBamIODevice::ReadOnly); + return false; } + //This is a BAM file. _isBam = true; _numBytesInBuffer = _scanBuffer.size(); return true; + } else if (numChars == 4) { + //This is a gzipped file, and it is not bgzipped or BAM. + _pushBackStreamBuf->clear(); + _pushBackStreamBuf->pushBack(_scanBuffer); + _scanBuffer.clear(); + numChars = 0; + _isBam = false; + _isBgzipped = false; + _bamRuledOut = true; + _numBytesInBuffer = 0; + _infStreamBuf = new InflateStreamBuf(_finalInputStream); + if (_oldInputStream != NULL) { + delete _oldInputStream; + } + _oldInputStream = _finalInputStream; + _finalInputStream = new istream(_infStreamBuf); + return false; } } return false; } -void InputStreamMgr::decompressBuffer() -{ - //allocate an array to hold uncompressed data. - uInt maxDecompressSize = 20 * _numBytesInBuffer; - unsigned char *newScanBuffer = new unsigned char[maxDecompressSize]; - memset(newScanBuffer, 0, maxDecompressSize); - - unsigned int numDecompressChars = inflateGzippedArray(_scanBuffer, newScanBuffer, maxDecompressSize, _numBytesInBuffer); - - // newScanBuffer should now contain uncompressed data. - //delete old buffer, point it at new buffer. - _saveDataStr.append((char *)newScanBuffer, numDecompressChars); +//void InputStreamMgr::decompressBuffer() +//{ +// //allocate an array to hold uncompressed data.
+// _saveDataStr.clear(); +// uInt maxDecompressSize = 20 * _numBytesInBuffer; +// unsigned char *newScanBuffer = new unsigned char[maxDecompressSize]; +// memset(newScanBuffer, 0, maxDecompressSize); +// +// unsigned int numDecompressChars = inflateGzippedArray(_scanBuffer, newScanBuffer, maxDecompressSize, _numBytesInBuffer); +// +// // newScanBuffer should now contain uncompressed data. +// //delete old buffer, point it at new buffer. +// _saveDataStr.append((char *)newScanBuffer, numDecompressChars); +// +// delete [] newScanBuffer; +//} - delete [] newScanBuffer; +void InputStreamMgr::readZipChunk() +{ + if (_tmpZipBuf == NULL) { + _tmpZipBuf = new char[SCAN_BUFFER_SIZE +1]; + } + memset(_tmpZipBuf, 0, SCAN_BUFFER_SIZE +1); + size_t numCharsRead = read(_tmpZipBuf, (size_t)SCAN_BUFFER_SIZE); + _saveDataStr.append(_tmpZipBuf); + _numBytesInBuffer = _saveDataStr.size(); + if (numCharsRead < SCAN_BUFFER_SIZE) { + _streamFinished = true; + } + return; } -void InputStreamMgr::reset() +bool InputStreamMgr::resetStream() { - if (_isBam) { - return; - } - if (!_isStdin) { - //For file input, just re-open the file. + _saveDataStr.clear(); + if (!_isBam && !_isStdin && !_isGzipped) { + //For non-compressed, non-stdin file input, just re-open the file. delete _finalInputStream; _finalInputStream = new ifstream(_filename.c_str()); - } else { - if (_isBgzipped) { - for (BTlist<int>::const_iterator_type iter = _pushBackStreamBuf->_buffer.begin(); - iter != _pushBackStreamBuf->_buffer.end(); iter = _pushBackStreamBuf->_buffer.next()) { - _compressedSaveData.push_back(iter->value()); - } - _pushBackStreamBuf->clear(); - _pushBackStreamBuf->pushBack(_compressedSaveData); - } else if (_isGzipped) { - _pushBackStreamBuf->pushBack(_compressedSaveData); - } else { - _pushBackStreamBuf->pushBack(BTlist<int>(_saveDataStr)); - } -// _finalInputStream = new istream(_pushBackStreamBuf); - } - if (_isBgzipped) { - //The file is bgzipped, but not BAM. 
- _bgStream = new BamTools::Internal::BgzfStream(); - _bgStream->OpenStream(_finalInputStream, BamTools::IBamIODevice::ReadOnly); - } else if (_isGzipped) { - //the file is gzipped, but is not bgzipped or BAM. - _infStreamBuf = new InflateStreamBuf(_finalInputStream); - if (_oldInputStream != NULL) { - delete _oldInputStream; - } - _oldInputStream = _finalInputStream; - _finalInputStream = new istream(_infStreamBuf); + return true; } + return false; } diff --git a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h index 7a11d4e655770a39bd60665f3eecd5e5dd897cb3..1c463d37ebf5769c5b4ddf90c3181579fcee1fc3 100644 --- a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h +++ b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h @@ -31,13 +31,14 @@ public: const BTlist<int> &getScanBuffer() const { return _scanBuffer; } int getBufferLength() const { return _numBytesInBuffer; } void populateScanBuffer(); - void reset(); const QuickString &getSavedData() const { return _saveDataStr; } bool isGzipped() const { return _isGzipped; } PushBackStreamBuf *getPushBackStreamBuf() const {return _pushBackStreamBuf; } - void getSavedData(QuickString &str) const { str = _saveDataStr; } +// void getSavedData(QuickString &str) const { str = _saveDataStr; } bool isBam() const { return _isBam; } BamTools::BamReader *getBamReader() { return _bamReader; } + bool resetStream(); + private: QuickString _filename; @@ -45,7 +46,6 @@ private: ifstream *_inputFileStream; BTlist<int> _scanBuffer; QuickString _saveDataStr; - BTlist<int> _compressedSaveData; InflateStreamBuf *_infStreamBuf; istream * _finalInputStream; istream *_oldInputStream; @@ -53,7 +53,9 @@ private: bool _isGzipped; bool _isBam; bool _isBgzipped; + char *_tmpZipBuf; bool _bamRuledOut; + bool _streamFinished; vector<int> _possibleBamCode; static const int SCAN_BUFFER_SIZE = 4096; // 4 K buffer static const int BAM_SCAN_BUFFER_SIZE = 32768; // 32K @@ -63,8 +65,9 @@ 
private: BamTools::Internal::BgzfStream *_bgStream; static const char *FIFO_STRING_LITERAL; - bool detectBamOrBgzip(int &numChars, int currChar, bool &mustAppend); - void decompressBuffer(); + void readZipChunk(); + bool detectBamOrBgzip(int &numChars, int currChar); +// void decompressBuffer(); }; diff --git a/src/utils/general/QuickString.cpp b/src/utils/general/QuickString.cpp index 1229a245547ade6ab08f491d9fc7062c2e226845..831f84ab9eaa3574ab316c9c0256531a4b3608f1 100644 --- a/src/utils/general/QuickString.cpp +++ b/src/utils/general/QuickString.cpp @@ -160,6 +160,7 @@ void QuickString::set(const char *inBuf, size_t newLen) { } void QuickString::reserve(size_t newLen) { + newLen++; //always leave room for a null terminator. if (_currCapacity <= newLen) { while (_currCapacity <= newLen) { _currCapacity = _currCapacity << 1; diff --git a/test/intersect/a.bam b/test/intersect/a.bam index bf9f1fc23f810b8d8904396b991ca91c9dcc602e..f56cccfd481592b05ccbd3b3c0eff295b88a61e9 100644 Binary files a/test/intersect/a.bam and b/test/intersect/a.bam differ diff --git a/test/intersect/aVSb.bam b/test/intersect/aVSb.bam index 04eb46945a6a1e4a5a1147390ba7f9f31aafaea6..911fa82e48030b838cd8b7188c2b78c4528f79a0 100644 Binary files a/test/intersect/aVSb.bam and b/test/intersect/aVSb.bam differ diff --git a/test/intersect/new_test-intersect.sh b/test/intersect/new_test-intersect.sh index b9cd3ac7ef46e60f511da6073f63dc1b4287360d..72f07c7ba5f38ba30872a382e3699e1b1b96336b 100755 --- a/test/intersect/new_test-intersect.sh +++ b/test/intersect/new_test-intersect.sh @@ -289,4 +289,148 @@ check obs exp rm obs exp +########################################################### +# Test intersection of a with large header as bed from file vs b as bed from file +############################################################ +echo " intersect.new.t24...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a a_withLargeHeader.bed -b b.bed > obs +check obs exp +rm obs exp + +
+########################################################### +# Test intersection of a with large header as bed from redirect vs b as bed from file +############################################################ +echo " intersect.new.t25...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a - -b b.bed < a_withLargeHeader.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bed from pipe vs b as bed from file +############################################################ +echo " intersect.new.t26...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +cat a_withLargeHeader.bed | $BT intersect -a - -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bed from fifo vs b as bed from file +############################################################ +echo " intersect.new.t27...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a <(cat a_withLargeHeader.bed) -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as gzipped from file vs b as bed from file +############################################################ +echo " intersect.new.t28...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a a_withLargeHeader_gzipped.bed.gz -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as gzipped from redirect vs b as bed from file +############################################################ +echo " intersect.new.t29...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a - -b b.bed < a_withLargeHeader_gzipped.bed.gz > obs +check obs exp +rm obs exp + + 
+########################################################### +# Test intersection of a with large header as gzipped from pipe vs b as bed from file +############################################################ +echo " intersect.new.t30...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +cat a_withLargeHeader_gzipped.bed.gz | $BT intersect -a - -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as gzipped from fifo vs b as bed from file +############################################################ +echo " intersect.new.t31...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a <(cat a_withLargeHeader_gzipped.bed.gz) -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bgzipped from file vs b as bed from file +############################################################ +echo " intersect.new.t32...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a a_withLargeHeader_bgzipped.bed.gz -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bgzipped from redirect vs b as bed from file +############################################################ +echo " intersect.new.t33...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a - -b b.bed < a_withLargeHeader_bgzipped.bed.gz > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bgzipped from pipe vs b as bed from file +############################################################ +echo " intersect.new.t34...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +cat a_withLargeHeader_bgzipped.bed.gz | $BT intersect -a - -b b.bed > obs +check 
obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bgzipped from fifo vs b as bed from file +############################################################ +echo " intersect.new.t35...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a <(cat a_withLargeHeader_bgzipped.bed.gz) -b b.bed > obs +check obs exp +rm obs exp + +