Skip to content
Snippets Groups Projects
Commit e7dec439 authored by Aaron's avatar Aaron
Browse files

header and custom pattern search for nucBed. Thanks to Can Alkan.

parent b6ae6f5e
No related branches found
No related tags found
No related merge requests found
......@@ -13,12 +13,14 @@
#include "nucBed.h"
NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq) {
NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, bool hasPattern, const string &pattern) {
_dbFile = dbFile;
_bedFile = bedFile;
_printSeq = printSeq;
_hasPattern = hasPattern;
_pattern = pattern;
_bed = new BedFile(_bedFile);
// Compute the DNA content in each BED/GFF/VCF interval
......@@ -32,11 +34,15 @@ NucBed::~NucBed(void)
void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLength)
{
int a,c,g,t,n,other;
a = c = g = t = n = other = 0;
int a,c,g,t,n,other,userPatternCount;
a = c = g = t = n = other = userPatternCount = 0;
getDnaContent(sequence,a,c,g,t,n,other);
if (_hasPattern)
userPatternCount = countPattern(sequence, _pattern);
// report the original interval
_bed->reportBedTab(bed);
// report AT and GC content
......@@ -44,12 +50,41 @@ void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLen
// report raw nucleotide counts
printf("%d\t%d\t%d\t%d\t%d\t%d\t%d",a,c,g,t,n,other,seqLength);
// add the original sequence if requested.
if (_printSeq) {
printf("\t%s\n",sequence.c_str());
}
else {
printf("\n");
if (_printSeq)
printf("\t%s",sequence.c_str());
if (_hasPattern)
printf("\t%d",userPatternCount);
printf("\n");
}
void NucBed::PrintHeader(void) {
printf("#");
int numOrigColumns = (int) _bed->bedType;
for (int i = 1; i <= numOrigColumns; ++i) {
printf("%d_usercol\t", i);
}
printf("%d_pct_at\t", numOrigColumns + 1);
printf("%d_pct_gc\t", numOrigColumns + 2);
printf("%d_num_A\t", numOrigColumns + 3);
printf("%d_num_C\t", numOrigColumns + 4);
printf("%d_num_G\t", numOrigColumns + 5);
printf("%d_num_T\t", numOrigColumns + 6);
printf("%d_num_N\t", numOrigColumns + 7);
printf("%d_num_oth\t", numOrigColumns + 8);
printf("%d_seq_len\t", numOrigColumns + 9);
if (_printSeq)
printf("%d_seq", numOrigColumns + 10);
if (_hasPattern && !_printSeq)
printf("%d_user_patt_count\n", numOrigColumns + 10);
else if (_hasPattern && _printSeq)
printf("\t%d_user_patt_count\n", numOrigColumns + 11);
printf("\n");
}
......@@ -69,9 +104,10 @@ void NucBed::ProfileDNA() {
// open and memory-map genome file
FastaReference fr;
bool memmap = true;
bool memmap = true;
fr.open(_dbFile, memmap);
bool headerReported = false;
BED bed, nullBed;
int lineNum = 0;
BedLineStatus bedStatus;
......@@ -80,6 +116,10 @@ void NucBed::ProfileDNA() {
_bed->Open();
while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) {
if (bedStatus == BED_VALID) {
if (headerReported == false) {
PrintHeader();
headerReported = true;
}
// make sure we are extracting >= 1 bp
if (bed.zeroLength == false) {
size_t seqLength = fr.sequenceLength(bed.chrom);
......
......@@ -29,8 +29,7 @@ class NucBed {
public:
// constructor
NucBed(string &dbFile, string &bedFile, bool printSeq);
NucBed(string &dbFile, string &bedFile, bool printSeq, bool hasPattern, const string &pattern);
// destructor
~NucBed(void);
......@@ -41,10 +40,12 @@ private:
string _dbFile;
string _bedFile;
bool _printSeq;
bool _hasPattern;
string _pattern;
// instance of a bed file class.
BedFile *_bed;
void PrintHeader(void);
void ReportDnaProfile(const BED& bed, const string &sequence, int seqLength);
};
......
......@@ -32,11 +32,13 @@ int main(int argc, char* argv[]) {
// input files
string fastaDbFile;
string bedFile;
string pattern;
// checks for existence of parameters
bool haveFastaDb = false;
bool haveBed = false;
bool printSeq = false;
bool hasPattern = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
......@@ -74,6 +76,13 @@ int main(int argc, char* argv[]) {
else if(PARAMETER_CHECK("-seq", 4, parameterLength)) {
printSeq = true;
}
else if(PARAMETER_CHECK("-pattern", 8, parameterLength)) {
if ((i+1) < argc) {
hasPattern = true;
pattern = argv[i + 1];
i++;
}
}
else {
cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
......@@ -86,7 +95,7 @@ int main(int argc, char* argv[]) {
if (!showHelp) {
NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq);
NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq, hasPattern, pattern);
delete nuc;
return 0;
......@@ -107,8 +116,11 @@ void ShowHelp(void) {
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -fi <fasta> -bed <bed/gff/vcf>" << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-fi\tInput FASTA file" << endl;
cerr << "\t-fi\tInput FASTA file" << endl << endl;
cerr << "\t-bed\tBED/GFF/VCF file of ranges to extract from -fi" << endl << endl;
cerr << "\t-seq\tPrint the extracted sequence" << endl << endl;
cerr << "\t-pattern\tReport the number of times a user-defined sequence is observed (case-insensitive)." << endl << endl;
cerr << "Output format: " << endl;
cerr << "\tThe following information will be reported after each original BED entry:" << endl;
......@@ -120,8 +132,9 @@ void ShowHelp(void) {
cerr << "\t 6) Number of Ts observed" << endl;
cerr << "\t 7) Number of Ns observed" << endl;
cerr << "\t 8) Number of other bases observed" << endl;
cerr << "\t 9) The length of the explored sequence/interval." << endl << endl;
cerr << "\t 9) The length of the explored sequence/interval." << endl;
cerr << "\t 10) The sequence extracted from the FASTA file. (optional, if -seq is used)" << endl;
cerr << "\t 11) The number of times a user defined pattern was observed. (optional, if -pattern is used.)" << endl;
// end the program here
exit(1);
......
......@@ -108,3 +108,19 @@ void getDnaContent(const string &seq, int &a, int &c, int &g, int &t, int &n, in
}
}
}
int countPattern(const string &seq, const string &pattern)
{
// swap the bases
int patternLength = pattern.size();
int patternCount = 0;
for(unsigned int i = 0; i < seq.length(); i++) {
if (seq.substr(i,patternLength) == pattern) {
patternCount++;
}
}
return patternCount;
}
......@@ -22,4 +22,6 @@ void toUpperCase(string &seq);
// Calculates the number of a, c, g, t, n, and other bases found in a sequence
void getDnaContent(const string &seq, int &a, int &c, int &g, int &t, int &n, int &other);
int countPattern(const string &seq, const string &pattern);
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment