Skip to content
Snippets Groups Projects
Commit accf0a13 authored by Timothee Flutre's avatar Timothee Flutre
Browse files

add option -fullHeader for fasta files

parent d5ecb195
No related branches found
No related tags found
No related merge requests found
......@@ -17,14 +17,15 @@
Bed2Fa::Bed2Fa(bool useName, const string &dbFile,
const string &bedFile, const string &fastaOutFile,
bool useFasta, bool useStrand,
bool useBlocks) :
bool useBlocks, bool useFullHeader) :
_useName(useName),
_dbFile(dbFile),
_bedFile(bedFile),
_fastaOutFile(fastaOutFile),
_useFasta(useFasta),
_useStrand(useStrand),
_useBlocks(useBlocks)
_useBlocks(useBlocks),
_useFullHeader(useFullHeader)
{
_bed = new BedFile(_bedFile);
......@@ -110,7 +111,7 @@ void Bed2Fa::ReportDNA(const BED &bed, string &dna) {
//******************************************************************************
void Bed2Fa::ExtractDNA() {
/* Make sure that we can oen all of the files successfully*/
/* Make sure that we can open all of the files successfully*/
// open the fasta database for reading
ifstream faDb(_dbFile.c_str(), ios::in);
......@@ -124,7 +125,7 @@ void Bed2Fa::ExtractDNA() {
// open and memory-map genome file
FastaReference *fr = new FastaReference;
bool memmap = true;
fr->open(_dbFile, memmap);
fr->open(_dbFile, memmap, _useFullHeader);
BED bed, nullBed;
string sequence;
......
......@@ -33,7 +33,7 @@ public:
Bed2Fa(bool useName, const string &dbFile,
const string &bedFile, const string &fastaOutFile,
bool useFasta, bool useStrand,
bool useBlocks);
bool useBlocks, bool useFullHeader);
// destructor
~Bed2Fa(void);
......@@ -52,6 +52,7 @@ private:
bool _useStrand; // should the extracted sequence obey strandedness?
bool _useBlocks; // should the extracted sequence obey BED blocks
// (for example, exons?)
bool _useFullHeader;
// instance of a bed file class.
BedFile *_bed;
......
......@@ -44,6 +44,7 @@ int fastafrombed_main(int argc, char* argv[]) {
bool useFasta = true;
bool useStrand = false;
bool useBlocks = false;
bool useFullHeader = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
......@@ -97,6 +98,9 @@ int fastafrombed_main(int argc, char* argv[]) {
else if(PARAMETER_CHECK("-s", 2, parameterLength)) {
useStrand = true;
}
else if(PARAMETER_CHECK("-fullHeader", 11, parameterLength)) {
useFullHeader = true;
}
else {
cerr << "*****ERROR: Unrecognized parameter: "
<< argv[i]
......@@ -115,7 +119,7 @@ int fastafrombed_main(int argc, char* argv[]) {
Bed2Fa *b2f = new Bed2Fa(useNameOnly, fastaDbFile,
bedFile, fastaOutFile,
useFasta, useStrand,
useBlocks);
useBlocks, useFullHeader);
delete b2f;
}
else {
......@@ -148,6 +152,9 @@ void fastafrombed_help(void) {
<< endl;
cerr << "\t\tstrand, the sequence will be reverse complemented." << endl;
cerr << "\t\t- By default, strand information is ignored." << endl << endl;
cerr << "\t-fullHeader\tUse full fasta header." << endl;
cerr << "\t\t- By default, only the word before the first space or tab "
<< "is used." << endl << endl;
// end the program here
exit(1);
......
......@@ -15,16 +15,17 @@
NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq,
bool hasPattern, const string &pattern,
bool forceStrand, bool ignoreCase)
bool forceStrand, bool ignoreCase, bool useFullHeader)
{
_dbFile = dbFile;
_bedFile = bedFile;
_printSeq = printSeq;
_hasPattern = hasPattern;
_pattern = pattern;
_forceStrand = forceStrand;
_ignoreCase = ignoreCase;
_dbFile = dbFile;
_bedFile = bedFile;
_printSeq = printSeq;
_hasPattern = hasPattern;
_pattern = pattern;
_forceStrand = forceStrand;
_ignoreCase = ignoreCase;
_useFullHeader = useFullHeader;
_bed = new BedFile(_bedFile);
// Compute the DNA content in each BED/GFF/VCF interval
......@@ -109,7 +110,7 @@ void NucBed::ProfileDNA() {
// open and memory-map genome file
FastaReference fr;
bool memmap = true;
fr.open(_dbFile, memmap);
fr.open(_dbFile, memmap, _useFullHeader);
bool headerReported = false;
BED bed;
......
......@@ -31,7 +31,7 @@ public:
// constructor
NucBed(string &dbFile, string &bedFile, bool printSeq,
bool hasPattern, const string &pattern,
bool forceStrand, bool ignoreCase);
bool forceStrand, bool ignoreCase, bool useFullHeader);
// destructor
~NucBed(void);
......@@ -46,6 +46,7 @@ private:
string _pattern;
bool _forceStrand;
bool _ignoreCase;
bool _useFullHeader;
// instance of a bed file class.
BedFile *_bed;
......
......@@ -35,12 +35,13 @@ int nuc_main(int argc, char* argv[]) {
string pattern;
// checks for existence of parameters
bool haveFastaDb = false;
bool haveBed = false;
bool printSeq = false;
bool hasPattern = false;
bool forceStrand = false;
bool ignoreCase = false;
bool haveFastaDb = false;
bool haveBed = false;
bool printSeq = false;
bool hasPattern = false;
bool forceStrand = false;
bool ignoreCase = false;
bool useFullHeader = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
......@@ -90,6 +91,9 @@ int nuc_main(int argc, char* argv[]) {
pattern = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-useFullHeader", 11, parameterLength)) {
useFullHeader = true;
}
else {
cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
......@@ -104,7 +108,8 @@ int nuc_main(int argc, char* argv[]) {
if (!showHelp) {
NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq,
hasPattern, pattern, forceStrand, ignoreCase);
hasPattern, pattern, forceStrand, ignoreCase,
useFullHeader);
delete nuc;
}
else {
......@@ -134,8 +139,12 @@ void nuc_help(void) {
cerr << "\t-pattern\tReport the number of times a user-defined sequence" << endl;
cerr << "\t\t\tis observed (case-sensitive)." << endl << endl;
cerr << "\t-C\tIgore case when matching -pattern. By defaulty, case matters." << endl << endl;
cerr << "\t-C\tIgnore case when matching -pattern. By defaulty, case matters." << endl << endl;
cerr << "\t-fullHeader\tUse full fasta header." << endl;
cerr << "\t\t- By default, only the word before the first space or tab "
<< "is used." << endl << endl;
cerr << "Output format: " << endl;
cerr << "\tThe following information will be reported after each BED entry:" << endl;
cerr << "\t 1) %AT content" << endl;
......
......@@ -8,12 +8,13 @@
#include "Fasta.h"
FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len)
FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len, bool useFullHeader)
: name(name)
, length(length)
, offset(offset)
, line_blen(line_blen)
, line_len(line_len)
, useFullHeader(useFullHeader)
{}
FastaIndexEntry::FastaIndexEntry(void) // empty constructor
......@@ -30,16 +31,19 @@ void FastaIndexEntry::clear(void)
// check if we have already recorded a real offset
line_blen = NULL;
line_len = NULL;
useFullHeader = false;
}
ostream& operator<<(ostream& output, const FastaIndexEntry& e) {
// just write the first component of the name, for compliance with other tools
output << split(e.name, ' ').at(0) << "\t" << e.length << "\t" << e.offset << "\t" <<
e.line_blen << "\t" << e.line_len;
output << (e.useFullHeader? e.name : split(e.name, ' ').at(0))
<< "\t" << e.length << "\t" << e.offset << "\t"
<< e.line_blen << "\t" << e.line_len;
return output; // for multiple << operators.
}
FastaIndex::FastaIndex(void)
FastaIndex::FastaIndex(bool useFullHeader)
: useFullHeader(useFullHeader)
{}
void FastaIndex::readIndexFile(string fname) {
......@@ -55,12 +59,14 @@ void FastaIndex::readIndexFile(string fname) {
if (fields.size() == 5) { // if we don't get enough fields then there is a problem with the file
// note that fields[0] is the sequence name
char* end;
string name = split(fields[0], " \t").at(0); // key by first token of name
string name = useFullHeader ? fields[0] :
split(fields[0], " \t").at(0); // key by first token of name
sequenceNames.push_back(name);
this->insert(make_pair(name, FastaIndexEntry(fields[0], atoi(fields[1].c_str()),
strtoll(fields[2].c_str(), &end, 10),
atoi(fields[3].c_str()),
atoi(fields[4].c_str()))));
strtoll(fields[2].c_str(), &end, 10),
atoi(fields[3].c_str()),
atoi(fields[4].c_str()),
useFullHeader)));
} else {
cerr << "Warning: malformed fasta index file " << fname <<
"does not have enough fields @ line " << linenum << endl;
......@@ -181,12 +187,13 @@ void FastaIndex::indexReference(string refname) {
}
void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) {
string name = split(entry.name, " \t").at(0); // key by first token of name
string name = useFullHeader? entry.name :
split(entry.name, " \t").at(0); // key by first token of name
sequenceNames.push_back(name);
this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length,
entry.offset, entry.line_blen,
entry.line_len)));
entry.offset, entry.line_blen,
entry.line_len,
useFullHeader)));
}
void FastaIndex::writeIndexFile(string fname) {
......@@ -225,13 +232,15 @@ bool FastaIndex::chromFound(string name) {
string FastaIndex::indexFileExtension() { return ".fai"; }
void FastaReference::open(string reffilename, bool usemmap) {
void FastaReference::open(string reffilename, bool usemmap, bool useFullHeader) {
filename = reffilename;
if (!(file = fopen(filename.c_str(), "r"))) {
cerr << "could not open " << filename << endl;
exit(1);
}
index = new FastaIndex();
if (useFullHeader)
usingfullheader = true;
index = new FastaIndex(useFullHeader);
struct stat stFileInfo;
string indexFileName = filename + index->indexFileExtension();
// if we can find an index file, use it
......
......@@ -29,7 +29,8 @@ using namespace std;
class FastaIndexEntry {
friend ostream& operator<<(ostream& output, const FastaIndexEntry& e);
public:
FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len);
FastaIndexEntry(string name, int length, long long offset,
int line_blen, int line_len, bool useFullHeader);
FastaIndexEntry(void);
~FastaIndexEntry(void);
string name; // sequence name
......@@ -37,14 +38,16 @@ class FastaIndexEntry {
long long offset; // bytes offset of sequence from start of file
int line_blen; // line length in bytes, sequence characters
int line_len; // line length including newline
bool useFullHeader;
void clear(void);
};
class FastaIndex : public map<string, FastaIndexEntry> {
friend ostream& operator<<(ostream& output, FastaIndex& i);
public:
FastaIndex(void);
FastaIndex(bool useFullHeader);
~FastaIndex(void);
bool useFullHeader;
vector<string> sequenceNames;
void indexReference(string refName);
void readIndexFile(string fname);
......@@ -58,10 +61,12 @@ class FastaIndex : public map<string, FastaIndexEntry> {
class FastaReference {
public:
void open(string reffilename, bool usemmap = false);
void open(string reffilename, bool usemmap = false,
bool useFullHeader = false);
bool usingmmap;
string filename;
FastaReference(void) : usingmmap(false) { }
bool usingfullheader;
FastaReference(void) : usingmmap(false), usingfullheader(false) { }
~FastaReference(void);
FILE* file;
void* filemm;
......
......@@ -59,3 +59,12 @@ if [ "$SEQ" != "tag" ]; then
else
echo ok
fi
# test -fullHeader
echo " getfasta.t06...\c"
LINES=$(echo $'chr1 assembled by consortium X\t1\t10' | $BT getfasta -fullHeader -fi t_fH.fa -bed stdin -fo - | awk 'END{ print NR }')
if [ "$LINES" != "2" ]; then
echo fail
else
echo ok
fi
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment