From 097834ca03b5d6ad7ca613a520ed6a8254e9727a Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Tue, 30 Oct 2012 13:51:50 -0400 Subject: [PATCH] refactor VCF auto-detection --- src/intersectBed/intersectBed.cpp | 6 +- src/utils/bedFile/bedFile.cpp | 10 ++ src/utils/bedFile/bedFile.h | 181 ++++++++++++++---------------- 3 files changed, 97 insertions(+), 100 deletions(-) diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index 4f25c1b3..2f9783fa 100644 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -279,13 +279,13 @@ void BedIntersect::IntersectBed() { hit_set.second.reserve(10000); while (sweep.Next(hit_set)) { if (_obeySplits == false) { - processHits(hit_set.first, hit_set.second); - } + processHits(hit_set.first, hit_set.second); + } else { bedVector a_blocks; GetBedBlocks(hit_set.first, a_blocks); FindBlockedOverlaps(hit_set.first, a_blocks, - hit_set.second, false); + hit_set.second, false); } } } diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 681fafe0..2939b52f 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -154,6 +154,13 @@ void BedFile::GetHeader(void) { ) { _header += _bedLine + '\n'; + + if (_bedLine.find("##fileformat=VCF") == 0) { + _typeIsKnown = true; + setFileType(VCF_FILETYPE); + setGff(false); + setVcf(true); + } } // we are done with the header. stop looking // and indicate that the first data line has been read @@ -187,8 +194,11 @@ bool BedFile::GetNextBed(BED &bed, bool forceSorted) { // of reading the header. Tokenize(_bedLine, _bedFields); _firstLine = false; + setBedType(_bedFields.size()); } // load the BED struct as long as it's a valid BED entry. + + _numFields = _bedFields.size(); _status = parseLine(bed, _bedFields); if (_status == BED_INVALID) return false; diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index ac65eaab..31581031 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -492,6 +492,7 @@ private: string _header; bool _firstLine; vector<string> _bedFields; + unsigned int _numFields; int _merged_start; int _merged_end; string _merged_chrom; @@ -517,42 +518,38 @@ private: parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) */ template <typename T> - inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector) { + inline BedLineStatus parseLine (T &bed, const vector<string> &fields) { // clear out the data from the last line. bed = _nullBed; - unsigned int numFields = lineVector.size(); - // bail out if we have a blank line - if (numFields == 0) { + if (_numFields == 0) { return BED_BLANK; } // bail out if we have a comment line - if ( (lineVector[0].find("#") == 0) || - (lineVector[0].find("browser") == 0) || - (lineVector[0].find("track") == 0) + if ( (fields[0].find("#") == 0) || + (fields[0].find("browser") == 0) || + (fields[0].find("track") == 0) ) { return BED_HEADER; } - if (numFields >= 3) { + if (_numFields >= 3) { // line parsing for all lines after the first non-header line if (_typeIsKnown == true) { switch(_fileType) { case BED_FILETYPE: - if (parseBedLine(bed, lineVector, - _lineNum, numFields) == true) - {return BED_VALID;} - + if (parseBedLine(bed, fields) == true) + return BED_VALID; case VCF_FILETYPE: - if (parseVcfLine(bed, lineVector, - _lineNum, numFields) == true) - {return BED_VALID;} + if (parseVcfLine(bed, fields) == true) + { + return BED_VALID; + } case GFF_FILETYPE: - if (parseGffLine(bed, lineVector, - _lineNum, numFields) == true) - {return BED_VALID;} + if (parseGffLine(bed, fields) == true) + return BED_VALID; default: printf("ERROR: file type encountered. Exiting\n"); exit(1); @@ -561,61 +558,54 @@ private: // line parsing for first non-header line: figure out file contents else { // it's BED format if columns 2 and 3 are integers - if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { + if (isInteger(fields[1]) && isInteger(fields[2])) { setGff(false); setZeroBased(true); setFileType(BED_FILETYPE); // we now expect numFields columns in each line - setBedType(numFields); + setBedType(_numFields); // test to see if the file has true blocked BED12 records - if (numFields == 12) { - int cdsStart = atoi(lineVector[6].c_str()); - int cdsEnd = atoi(lineVector[7].c_str()); - int numExons = atoi(lineVector[9].c_str()); + if (_numFields == 12) { + int cdsStart = atoi(fields[6].c_str()); + int cdsEnd = atoi(fields[7].c_str()); + int numExons = atoi(fields[9].c_str()); if (cdsStart > 0 && cdsEnd > 0&& numExons > 0 && - lineVector[10].find(",") == 0 && - lineVector[11].find(",") == 0) + fields[10].find(",") == 0 && + fields[11].find(",") == 0) { setBed12(true); } else setBed12(false); } - if (parseBedLine(bed, lineVector, - _lineNum, numFields) == true) - { + if (parseBedLine(bed, fields) == true) return BED_VALID; - } } // it's VCF, assuming the second column is numeric and // there are at least 8 fields. - else if (isInteger(lineVector[1]) && numFields >= 8) { + else if (isInteger(fields[1]) && _numFields >= 8) { setGff(false); setVcf(true); setZeroBased(false); setFileType(VCF_FILETYPE); // we now expect numFields columns in each line - setBedType(numFields); - if (parseVcfLine(bed, lineVector, - _lineNum, numFields) == true) - { + setBedType(_numFields); + if (parseVcfLine(bed, fields) == true) return BED_VALID; - } } // it's GFF, assuming columns columns 4 and 5 are numeric // and we have 9 fields total. - else if ((numFields >= 8) && - isInteger(lineVector[3]) && - isInteger(lineVector[4])) + else if ((_numFields >= 8) && + isInteger(fields[3]) && + isInteger(fields[4])) { setGff(true); setZeroBased(false); setFileType(GFF_FILETYPE); // we now expect numFields columns in each line - setBedType(numFields); - if (parseGffLine(bed, lineVector, - _lineNum, numFields) == true) + setBedType(_numFields); + if (parseGffLine(bed, fields) == true) { return BED_VALID; } @@ -648,16 +638,15 @@ private: parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) */ template <typename T> - inline bool parseBedLine (T &bed, const vector<string> &lineVector, - int _lineNum, unsigned int numFields) + inline bool parseBedLine (T &bed, const vector<string> &fields) { // process as long as the number of fields in this // line matches what we expect for this file. - if (numFields == this->bedType) { - bed.fields = lineVector; - bed.chrom = lineVector[0]; + if (_numFields == this->bedType) { + bed.fields = fields; + bed.chrom = fields[0]; int i; - i = atoi(lineVector[1].c_str()); + i = atoi(fields[1].c_str()); if (i<0) { cerr << "Error: malformed BED entry at line " << _lineNum @@ -666,7 +655,7 @@ private: exit(1); } bed.start = (CHRPOS)i; - i = atoi(lineVector[2].c_str()); + i = atoi(fields[2].c_str()); if (i<0) { cerr << "Error: malformed BED entry at line " << _lineNum @@ -684,22 +673,22 @@ private: } if (this->bedType == 4) { - bed.name = lineVector[3]; + bed.name = fields[3]; } else if (this->bedType == 5) { - bed.name = lineVector[3]; - bed.score = lineVector[4]; + bed.name = fields[3]; + bed.score = fields[4]; } else if (this->bedType == 6) { - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = lineVector[5]; + bed.name = fields[3]; + bed.score = fields[4]; + bed.strand = fields[5]; } else if (this->bedType > 6) { - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = lineVector[5]; - for (unsigned int i = 6; i < lineVector.size(); ++i) { + bed.name = fields[3]; + bed.score = fields[4]; + bed.strand = fields[5]; + for (unsigned int i = 6; i < fields.size(); ++i) { bed.other_idxs.push_back(i); } } @@ -724,21 +713,21 @@ private: exit(1); } } - else if (numFields == 1) { + else if (_numFields == 1) { cerr << "Only one BED field detected: " << _lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; exit(1); } - else if ((numFields != this->bedType) && (numFields != 0)) { + else if ((_numFields != this->bedType) && (_numFields != 0)) { cerr << "Differing number of BED fields encountered at line: " << _lineNum << ". Exiting..." << endl; exit(1); } - else if ((numFields < 3) && (numFields != 0)) { + else if ((_numFields < 3) && (_numFields != 0)) { cerr << "TAB delimited BED file with at least 3 fields" << " (chrom, start, end) is required at line: " << _lineNum @@ -754,26 +743,25 @@ private: parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) */ template <typename T> - inline bool parseVcfLine (T &bed, const vector<string> &lineVector, - int _lineNum, unsigned int numFields) + inline bool parseVcfLine (T &bed, const vector<string> &fields) { - if (numFields == this->bedType) { - bed.fields = lineVector; - bed.chrom = lineVector[0]; + if (_numFields >= this->bedType) { + bed.fields = fields; + bed.chrom = fields[0]; // VCF is one-based - bed.start = atoi(lineVector[1].c_str()) - 1; + bed.start = atoi(fields[1].c_str()) - 1; // VCF 4.0 stores the size of the affected REF allele. - bed.end = bed.start + lineVector[3].size(); + bed.end = bed.start + fields[3].size(); bed.strand = "+"; // construct the name from the ref and alt alleles. // if it's an annotated variant, add the rsId as well. - bed.name = lineVector[3] + "/" + lineVector[4]; - if (lineVector[2] != ".") { - bed.name += "_" + lineVector[2]; + bed.name = fields[3] + "/" + fields[4]; + if (fields[2] != ".") { + bed.name += "_" + fields[2]; } if (this->bedType > 2) { - for (unsigned int i = 2; i < numFields; ++i) + for (unsigned int i = 2; i < _numFields; ++i) bed.other_idxs.push_back(i); } @@ -795,7 +783,7 @@ private: exit(1); } } - else if (numFields == 1) { + else if (_numFields == 1) { cerr << "Only one VCF field detected: " << _lineNum << ". Verify that your files are TAB-delimited. " @@ -803,14 +791,14 @@ private: << endl; exit(1); } - else if ((numFields != this->bedType) && (numFields != 0)) { + else if ((_numFields != this->bedType) && (_numFields != 0)) { cerr << "Differing number of VCF fields encountered at line: " << _lineNum << ". Exiting..." << endl; exit(1); } - else if ((numFields < 2) && (numFields != 0)) { + else if ((_numFields < 2) && (_numFields != 0)) { cerr << "TAB delimited VCF file with at least 2 fields " << "(chrom, pos) is required at line: " << _lineNum @@ -827,20 +815,19 @@ private: parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) */ template <typename T> - inline bool parseGffLine (T &bed, const vector<string> &lineVector, - int lineNum, unsigned int numFields) + inline bool parseGffLine (T &bed, const vector<string> &fields) { - if (numFields == this->bedType) { - bed.fields = lineVector; + if (_numFields == this->bedType) { + bed.fields = fields; if (this->bedType >= 8 && _isGff) { - bed.chrom = lineVector[0]; - if (isInteger(lineVector[3])) - bed.start = atoi(lineVector[3].c_str()); - if (isInteger(lineVector[4])) - bed.end = atoi(lineVector[4].c_str()); - bed.name = lineVector[2]; - bed.score = lineVector[5]; - bed.strand = lineVector[6].c_str(); + bed.chrom = fields[0]; + if (isInteger(fields[3])) + bed.start = atoi(fields[3].c_str()); + if (isInteger(fields[4])) + bed.end = atoi(fields[4].c_str()); + bed.name = fields[2]; + bed.score = fields[5]; + bed.strand = fields[6].c_str(); // add GFF "source". unused in BED bed.other_idxs.push_back(1); // add GFF "fname". unused in BED @@ -853,7 +840,7 @@ private: } else { cerr << "Error: unexpected number of fields at line: " - << lineNum + << _lineNum << ". Verify that your files are TAB-delimited and that " << "your GFF file has 8 or 9 fields. Exiting..." << endl; @@ -861,38 +848,38 @@ private: } if (bed.start > bed.end) { cerr << "Error: malformed GFF entry at line " - << lineNum + << _lineNum << ". Start was greater than end. Exiting." << endl; exit(1); } if ( (bed.start < 0) || (bed.end < 0) ) { cerr << "Error: malformed GFF entry at line " - << lineNum + << _lineNum << ". Coordinate detected that is < 1. Exiting." << endl; exit(1); } return true; } - else if (numFields == 1) { + else if (_numFields == 1) { cerr << "Only one GFF field detected: " - << lineNum + << _lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; exit(1); } - else if ((numFields != this->bedType) && (numFields != 0)) { + else if ((_numFields != this->bedType) && (_numFields != 0)) { cerr << "Differing number of GFF fields encountered at line: " - << lineNum + << _lineNum << ". Exiting..." << endl; exit(1); } - else if ((numFields < 8) && (numFields != 0)) { + else if ((_numFields < 8) && (_numFields != 0)) { cerr << "TAB delimited GFF file with 8 or 9 fields is required" << " at line: " - << lineNum + << _lineNum << ". Exiting..." << endl; exit(1); -- GitLab