Skip to content
Snippets Groups Projects
Commit 05a364e4 authored by nkindlon's avatar nkindlon
Browse files

added more utils

parent 1435a95a
No related branches found
No related tags found
No related merge requests found
/*****************************************************************************
GenomeFile.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licensed under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "GenomeFile.h"
GenomeFile::GenomeFile(const string &genomeFile) {
_genomeFile = genomeFile;
loadGenomeFileIntoMap();
}
GenomeFile::GenomeFile(const RefVector &genome) {
for (size_t i = 0; i < genome.size(); ++i) {
string chrom = genome[i].RefName;
int length = genome[i].RefLength;
_chromSizes[chrom] = length;
_chromList.push_back(chrom);
}
}
// Destructor
GenomeFile::~GenomeFile(void) {
}
void GenomeFile::loadGenomeFileIntoMap() {
string genomeLine;
int lineNum = 0;
vector<string> genomeFields; // vector for a GENOME entry
// open the GENOME file for reading
ifstream genome(_genomeFile.c_str(), ios::in);
if ( !genome ) {
cerr << "Error: The requested genome file (" << _genomeFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
while (getline(genome, genomeLine)) {
Tokenize(genomeLine,genomeFields); // load the fields into the vector
lineNum++;
// ignore a blank line
if (genomeFields.size() > 0) {
if (genomeFields[0].find("#") != 0) {
// we need at least 2 columns
if (genomeFields.size() >= 2) {
char *p2End;
long c2;
// make sure the second column is numeric.
c2 = strtol(genomeFields[1].c_str(), &p2End, 10);
// strtol will set p2End to the start of the string if non-integral, base 10
if (p2End != genomeFields[1].c_str()) {
string chrom = genomeFields[0];
int size = atoi(genomeFields[1].c_str());
_chromSizes[chrom] = size;
_chromList.push_back(chrom);
_startOffsets.push_back(_genomeLength);
_genomeLength += size;
}
}
else {
cerr << "Less than the req'd two fields were encountered in the genome file (" << _genomeFile << ")";
cerr << " at line " << lineNum << ". Exiting." << endl;
exit (1);
}
}
}
genomeFields.clear();
}
}
pair<string, uint32_t> GenomeFile::projectOnGenome(uint32_t genome_pos) {
// search for the chrom that the position belongs on.
// add 1 to genome position b/c of zero-based, half open.
vector<uint32_t>::const_iterator low =
lower_bound(_startOffsets.begin(), _startOffsets.end(), genome_pos + 1);
// use the iterator to identify the appropriate index
// into the chrom name and start vectors
int i = int(low-_startOffsets.begin());
string chrom = _chromList[i - 1];
uint32_t start = genome_pos - _startOffsets[i - 1];
return make_pair(chrom, start);
}
uint32_t GenomeFile::getChromSize(const string &chrom) {
chromToSizes::const_iterator chromIt = _chromSizes.find(chrom);
if (chromIt != _chromSizes.end())
return _chromSizes[chrom];
else
return -1; // chrom not found.
}
uint32_t GenomeFile::getGenomeSize(void) {
return _genomeLength;
}
vector<string> GenomeFile::getChromList() {
return _chromList;
}
int GenomeFile::getNumberOfChroms() {
return _chromList.size();
}
string GenomeFile::getGenomeFileName() {
return _genomeFile;
}
/*****************************************************************************
GenomeFile.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licensed under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef GENOMEFILE_H
#define GENOMEFILE_H
#include <map>
#include <string>
#include <iostream>
#include <sstream>
#include <fstream>
#include <cstring>
#include <cstdio>
#include <algorithm> // for bsearch lower_bound()
#include "api/BamReader.h"
#include "api/BamAux.h"
using namespace BamTools;
using namespace std;
// typedef for mapping b/w chrom name and it's size in b.p.
typedef map<string, int, std::less<string> > chromToSizes;
class GenomeFile {
public:
// Constructor using a file
GenomeFile(const string &genomeFile);
// Constructor using a vector of BamTools RefVector
GenomeFile(const RefVector &genome);
// Destructor
~GenomeFile(void);
// load a GENOME file into a map keyed by chrom. value is size of chrom.
void loadGenomeFileIntoMap();
pair<string, uint32_t> projectOnGenome(uint32_t genome_pos);
uint32_t getChromSize(const string &chrom); // return the size of a chromosome
uint32_t getGenomeSize(void); // return the total size of the geonome
vector<string> getChromList(); // return a list of chrom names
int getNumberOfChroms(); // return the number of chroms
string getGenomeFileName(); // return the name of the genome file
private:
string _genomeFile;
chromToSizes _chromSizes;
vector<string> _chromList;
// projecting chroms into a single coordinate system
uint32_t _genomeLength;
vector<uint32_t> _startOffsets;
};
#endif /* GENOMEFILE_H */
OBJ_DIR = ../../../obj/
BIN_DIR = ../../../bin/
UTILITIES_DIR = ../
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/BamTools/include/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= NewGenomeFile.cpp NewGenomeFile.h GenomeFile.cpp GenomeFile.h
OBJECTS= NewGenomeFile.o GenomeFile.o
#_EXT_OBJECTS=lineFileUtilities.o fileType.o
#EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
$(BUILT_OBJECTS): $(SOURCES) $(SUBDIRS)
@echo " * compiling GenomeFile.cpp"
@$(CXX) -c -o $(OBJ_DIR)/GenomeFile.o GenomeFile.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
@echo " * compiling NewGenomeFile.cpp"
@$(CXX) -c -o $(OBJ_DIR)/NewGenomeFile.o NewGenomeFile.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
#$(EXT_OBJECTS):
# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/NewGenomeFile.o $(OBJ_DIR)/GenomeFile.o
.PHONY: clean
/*****************************************************************************
NewGenomeFile.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licensed under the GNU General Public License 2.0 license.
******************************************************************************/
#include "NewGenomeFile.h"
NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename)
: _maxId(-1)
{
_genomeFileName = genomeFilename;
loadGenomeFileIntoMap();
}
NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector)
: _maxId(-1)
{
for (size_t i = 0; i < refVector.size(); ++i) {
QuickString chrom = refVector[i].RefName;
CHRPOS length = refVector[i].RefLength;
_maxId++;
_chromSizeIds[chrom] = pair<CHRPOS, CHRPOS>(length, _maxId);
}
}
// Destructor
NewGenomeFile::~NewGenomeFile(void) {
}
void NewGenomeFile::loadGenomeFileIntoMap() {
FILE *fp = fopen(_genomeFileName.c_str(), "r");
if (fp == NULL) {
fprintf(stderr, "Error: Can't open genome file %s. Exiting...\n", _genomeFileName.c_str());
exit(1);
}
char sLine[2048];
char chrName[2048];
CHRPOS chrSize = 0;
while (!feof(fp)) {
memset(sLine, 0, 2048);
memset(chrName, 0, 2048);
chrSize = 0;
fgets(sLine, 2048, fp);
sscanf(sLine, "%s %d", chrName, &chrSize);
if (strlen(sLine) == 0) {
continue;
}
_maxId++;
_chromSizeIds[chrName] = pair<CHRPOS, CHRPOS>(chrSize, _maxId);
_startOffsets.push_back(_genomeLength);
_genomeLength += chrSize;
_chromList.push_back(chrName);
}
_startOffsets.push_back(_genomeLength); //insert the final length as the last element
//to help with the lower_bound call in the projectOnGenome method.
fclose(fp);
}
bool NewGenomeFile::projectOnGenome(CHRPOS genome_pos, QuickString &chrom, CHRPOS &start) {
// search for the chrom that the position belongs on.
// add 1 to genome position b/c of zero-based, half open.
vector<CHRPOS>::const_iterator low =
lower_bound(_startOffsets.begin(), _startOffsets.end(), genome_pos + 1);
// use the iterator to identify the appropriate index
// into the chrom name and start vectors
CHRPOS i = CHRPOS(low-_startOffsets.begin());
if (i < 0 || i >= _chromList.size()) {
return false; //position not on genome
}
chrom = _chromList[i - 1];
start = genome_pos - _startOffsets[i - 1];
return true;
}
CHRPOS NewGenomeFile::getChromSize(const QuickString &chrom) {
if (chrom == _currChromName) {
return _currChromSize;
}
lookupType::const_iterator iter= _chromSizeIds.find(chrom);
if (iter != _chromSizeIds.end()) {
_currChromName = iter->first;
_currChromSize = iter->second.first;
_currChromId = iter->second.second;
return _currChromSize;
}
return INT_MAX;
}
CHRPOS NewGenomeFile::getChromId(const QuickString &chrom) {
if (chrom == _currChromName) {
return _currChromId;
}
lookupType::const_iterator iter= _chromSizeIds.find(chrom);
if (iter != _chromSizeIds.end()) {
_currChromName = iter->first;
_currChromSize = iter->second.first;
_currChromId = iter->second.second;
return _currChromId;
}
return INT_MAX;
}
/*****************************************************************************
NewGenomeFile.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licensed under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef NEW_GENOMEFILE_H
#define NEW_GENOMEFILE_H
#include "BedtoolsTypes.h"
#include "api/BamReader.h"
#include "api/BamAux.h"
class NewGenomeFile {
public:
NewGenomeFile(const QuickString &genomeFileName);
NewGenomeFile(const BamTools::RefVector &genome);
~NewGenomeFile(void);
// load a GENOME file into a map keyed by chrom. value is a pair<int, int> of id and size.
void loadGenomeFileIntoMap();
bool projectOnGenome(CHRPOS genome_pos, QuickString &chrom, CHRPOS &start);
CHRPOS getGenomeSize(void) const { return _genomeLength; } // return the total size of the genome
CHRPOS getChromSize(const QuickString &chrom); // return the size of a chromosome
CHRPOS getChromId(const QuickString &chrom); // return chromosome's sort order
const vector<QuickString> &getChromList() const { return _chromList; } // return a list of chrom names
CHRPOS getNumberOfChroms() const { return _chromList.size(); }
const QuickString &getGenomeFileName() const { return _genomeFileName; }
private:
QuickString _genomeFileName;
typedef map<QuickString, pair<CHRPOS, CHRPOS> > lookupType;
lookupType _chromSizeIds;
vector<QuickString> _chromList;
CHRPOS _maxId;
// projecting chroms onto a single coordinate system
CHRPOS _genomeLength;
vector<CHRPOS> _startOffsets;
//cache members for quick lookup
QuickString _currChromName;
CHRPOS _currChromSize;
CHRPOS _currChromId;
};
#endif /* GENOMEFILE_H */
OBJ_DIR = ../../../obj/
BIN_DIR = ../../../bin/
UTILITIES_DIR = ../../utils/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/BamTools/include
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= NewChromsweep.cpp NewChromsweep.h
OBJECTS= NewChromsweep.o
_EXT_OBJECTS=
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/NewChromsweep.o $(BIN_DIR)/NewChromsweep.o
.PHONY: clean
\ No newline at end of file
/*****************************************************************************
NewChromsweep.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "NewChromsweep.h"
#include "Context.h"
#include "FileRecordMgr.h"
NewChromSweep::NewChromSweep(Context *context,
bool useMergedIntervals)
: _context(context),
_queryFRM(NULL),
_databaseFRM(NULL),
_useMergedIntervals(false),
_queryRecordsTotalLength(0),
_databaseRecordsTotalLength(0),
_wasInitialized(false),
_currQueryRec(NULL),
_currDatabaseRec(NULL)
{
}
bool NewChromSweep::init() {
//Create new FileRecordMgrs for the two input files.
//Open them, and get the first record from each.
//if any of that goes wrong, return false;
//otherwise, return true.
_queryFRM = new FileRecordMgr(_context->getQueryFileIdx(), _context);
_databaseFRM = new FileRecordMgr(_context->getDatabaseFileIdx(), _context);
if (!_queryFRM->open()) {
return false;
}
if (!_databaseFRM->open()) {
return false;
}
_context->determineOutputType();
nextRecord(false);
if (_currDatabaseRec == NULL) {
return false;
}
_wasInitialized = true;
return true;
}
void NewChromSweep::closeOut() {
while (!_queryFRM->eof()) {
nextRecord(true);
}
while (!_databaseFRM->eof()) {
nextRecord(false);
}
}
NewChromSweep::~NewChromSweep(void) {
if (!_wasInitialized) {
return;
}
if (_currQueryRec != NULL) {
_queryFRM->deleteRecord(_currQueryRec);
}
if (_currDatabaseRec != NULL) {
_databaseFRM->deleteRecord(_currDatabaseRec);
}
_queryFRM->close();
_databaseFRM->close();
delete _queryFRM;
delete _databaseFRM;
}
void NewChromSweep::scanCache() {
recListIterType cacheIter = _cache.begin();
while (cacheIter != _cache.end())
{
const Record *cacheRec = cacheIter->value();
if (_currQueryRec->sameChrom(cacheRec) && !(_currQueryRec->after(cacheRec))) {
if (intersects(_currQueryRec, cacheRec)) {
_hits.push_back(cacheRec);
}
cacheIter = _cache.next();
}
else {
cacheIter = _cache.deleteCurrent();
_databaseFRM->deleteRecord(cacheRec);
}
}
}
void NewChromSweep::clearCache()
{
//delete all objects pointed to by cache
for (recListIterType iter = _cache.begin(); iter != _cache.end(); iter = _cache.next()) {
_databaseFRM->deleteRecord(iter->value());
}
_cache.clear();
}
bool NewChromSweep::chromChange()
{
// the files are on the same chrom
if (_currDatabaseRec == NULL || _currQueryRec->sameChrom(_currDatabaseRec)) {
return false;
}
// the query is ahead of the database. fast-forward the database to catch-up.
if (_currQueryRec->chromAfter(_currDatabaseRec)) {
while (_currDatabaseRec != NULL &&
_currQueryRec->chromAfter(_currDatabaseRec)) {
_databaseFRM->deleteRecord(_currDatabaseRec);
nextRecord(false);
}
clearCache();
return false;
}
// the database is ahead of the query.
else {
// 1. scan the cache for remaining hits on the query's current chrom.
if (_currQueryRec->getChrName() == _currChromName)
{
scanCache();
}
// 2. fast-forward until we catch up and report 0 hits until we do.
else if (_currQueryRec->chromBefore(_currDatabaseRec))
{
clearCache();
}
return true;
}
}
bool NewChromSweep::next(RecordKeyList &next) {
if (_currQueryRec != NULL) {
_queryFRM->deleteRecord(_currQueryRec);
}
nextRecord(true);
if (_currQueryRec == NULL) { //eof hit!
return false;
}
if (_currDatabaseRec == NULL && _cache.empty()) {
return false;
}
_hits.clear();
_currChromName = _currQueryRec->getChrName();
// have we changed chromosomes?
if (!chromChange()) {
// scan the database cache for hits
scanCache();
//skip if we hit the end of the DB
// advance the db until we are ahead of the query. update hits and cache as necessary
while (_currDatabaseRec != NULL &&
_currQueryRec->sameChrom(_currDatabaseRec) &&
!(_currDatabaseRec->after(_currQueryRec))) {
if (intersects(_currQueryRec, _currDatabaseRec)) {
_hits.push_back(_currDatabaseRec);
}
if (_currQueryRec->after(_currDatabaseRec)) {
_databaseFRM->deleteRecord(_currDatabaseRec);
_currDatabaseRec = NULL;
} else {
_cache.push_back(_currDatabaseRec);
_currDatabaseRec = NULL;
}
nextRecord(false);
}
}
next.setKey(_currQueryRec);
next.setListNoCopy(_hits);
return true;
}
void NewChromSweep::nextRecord(bool query) {
if (query) {
if (!_context->getUseMergedIntervals()) {
_currQueryRec = _queryFRM->allocateAndGetNextRecord();
} else {
_currQueryRec = _queryFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND);
}
if (_currQueryRec != NULL) {
_queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos());
}
} else { //database
if (!_context->getUseMergedIntervals()) {
_currDatabaseRec = _databaseFRM->allocateAndGetNextRecord();
} else {
_currDatabaseRec = _databaseFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND);
}
if (_currDatabaseRec != NULL) {
_databaseRecordsTotalLength += (unsigned long)(_currDatabaseRec->getEndPos() - _currDatabaseRec->getStartPos());
}
}
}
bool NewChromSweep::intersects(const Record *rec1, const Record *rec2) const
{
return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(),
_context->getOverlapFraction(), _context->getReciprocal());
}
/*****************************************************************************
NewChromsweepBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef NEW_CHROMSWEEP_H
#define NEW_CHROMSWEEP_H
using namespace std;
#include <string>
#include "BTlist.h"
#include "RecordKeyList.h"
#include <queue>
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include "QuickString.h"
class Record;
class FileRecordMgr;
class Context;
class NewChromSweep {
public:
NewChromSweep(Context *context, bool useMergedIntervals = false);
~NewChromSweep(void);
bool init();
typedef BTlist<const Record *> recListType;
typedef const BTlistNode<const Record *> * recListIterType;
// loads next (a pair) with the current query and it's overlaps
bool next(RecordKeyList &next);
//MUST call this method after sweep if you want the
//getTotalRecordLength methods to return the whole length of the
//record files, rather than just what was used by sweep.
void closeOut();
unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; }
unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; }
// Usage:
// NewChromSweep sweep = NewChromSweep(queryFileName, databaseFileName);
// RecordKeyList hits;
// while (sweep.next(hits))
// {
// processHits(hits);
// }
// getQueryTotalRecordLength()
// getDatabaseTotalRecordLength()
private:
Context *_context;
FileRecordMgr *_queryFRM;
FileRecordMgr *_databaseFRM;
bool _useMergedIntervals;
unsigned long _queryRecordsTotalLength;
unsigned long _databaseRecordsTotalLength;
bool _wasInitialized;
// a cache of still active features from the database file
recListType _cache;
// the set of hits in the database for the current query
recListType _hits;
// the current query and db features.
Record * _currQueryRec;
Record *_currDatabaseRec;
// a cache of the current chrom from the query. used to handle chrom changes.
QuickString _currChromName;
void nextRecord(bool query); //true fetches next query record, false fetches next db record.
void nextDatabase();
void scanCache();
void clearCache();
bool chromChange();
bool intersects(const Record *rec1, const Record *rec2) const;
};
#endif /* NewChromSweep_H */
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment