Skip to content
Snippets Groups Projects
Commit 2f90e6a7 authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Merge pull request #139 from nkindlon/master

fixes for bugs 100, 133, 136
parents 31878f3c 858cc4bd
No related branches found
No related tags found
No related merge requests found
Showing
with 187 additions and 15 deletions
......@@ -32,7 +32,7 @@ ContextBase::ContextBase()
_writeOverlap(false),
_writeAllOverlap(false),
_haveFraction(false),
_overlapFraction(1E-9),
_overlapFraction(0.0),
_reciprocal(false),
_sameStrand(false),
_diffStrand(false),
......
......@@ -137,3 +137,20 @@ bool SingleLineDelimTextFileReader::findDelimiters() {
}
return true;
}
int SingleLineDelimTextFileReader::getVcfSVlen() {
int startPos = _delimPositions[VCF_TAG_FIELD] +1;
const char *startPtr = strstr(_sLine.c_str() + startPos, "SVLEN=") +6;
const char *endPtr = strchr(startPtr, ';');
const char *midPtr = strchr(startPtr, ',');
int endCoord = -1;
if (midPtr != NULL && midPtr < endPtr) {
//comma found in the number, that means there are two numbers
int num1 = str2chrPos(startPtr, midPtr - startPtr);
int num2 = str2chrPos(midPtr +1, endPtr - (midPtr +1));
endCoord = max(abs(num1), abs(num2));
} else {
endCoord = abs(str2chrPos(startPtr, endPtr - startPtr));
}
return endCoord;
}
......@@ -13,6 +13,10 @@
class SingleLineDelimTextFileReader : public FileReader {
public:
//Allow VCF records to access a specialized private method.
//See end of class declaration for details.
friend class VcfRecord;
SingleLineDelimTextFileReader(int numFields, char delimChar = '\t');
~SingleLineDelimTextFileReader();
......@@ -24,6 +28,8 @@ public:
virtual void appendField(int fieldNum, QuickString &str) const;
virtual const QuickString &getHeader() const { return _header; }
virtual bool hasHeader() const { return _fullHeaderFound; }
protected:
int _numFields;
char _delimChar;
......@@ -38,6 +44,12 @@ protected:
bool detectAndHandleHeader();
bool findDelimiters();
//This is actually a very specialized function strictly for VCF
//records to read and process extra information about Structural Variants.
static const int VCF_TAG_FIELD = 7;
int getVcfSVlen();
};
......
......@@ -160,15 +160,27 @@ bool Record::sameChromIntersects(const Record *record,
int otherStart = record->getStartPos();
int otherEnd = record->getEndPos();
bool otherZeroLen = (otherStart - otherEnd == 0);
int maxStart = max(_startPos, otherStart);
int minEnd = min(_endPos, otherEnd);
bool localZeroLen = (_endPos - _startPos == 0);
//rule out all cases of no intersection at all
if (minEnd < maxStart) {
return false;
}
if (overlapFraction == 0.0) {
//don't care about amount of overlap.
//however, if minEnd and maxStart are equal, and
//neither record is zeroLen, return false.
if (minEnd == maxStart && !otherZeroLen && !localZeroLen) {
return false;
}
return true;
}
int overlapBases = minEnd - maxStart;
int len = _endPos - _startPos;
int otherLen = otherEnd - otherStart;
......
......@@ -18,13 +18,17 @@ bool VcfRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
_startPos--; // VCF is one-based. Here we intentionally don't decrement the string version,
//because we'll still want to output the one-based number in the print methods, even though
//internally we decrement the integer to comply with the 0-based format common to other records.
fileReader->getField(3, _varAlt);
//endPos is just the startPos plus the length of the variant
_endPos = _startPos + _varAlt.size();
fileReader->getField(4, _varAlt);
fileReader->getField(3, _varRef);
if (_varAlt[0] == '<') {
//this is a structural variant. Need to parse the tags to find the endpos.
_endPos = _startPos + fileReader->getVcfSVlen();
} else {
//endPos is just the startPos plus the length of the variant
_endPos = _startPos + _varRef.size();
}
int2str(_endPos, _endPosStr);
fileReader->getField(2, _name);
fileReader->getField(4, _varRef);
fileReader->getField(5, _score);
return initOtherFieldsFromFile(fileReader);
......@@ -33,8 +37,8 @@ bool VcfRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
void VcfRecord::clear()
{
BedPlusInterval::clear();
_varAlt.release();
_varRef.release();
_varAlt.release();
}
void VcfRecord::print(QuickString &outBuf) const {
......@@ -70,10 +74,10 @@ void VcfRecord::printOtherFields(QuickString &outBuf) const {
outBuf.append('\t');
outBuf.append(_name);
outBuf.append('\t');
outBuf.append(_varAlt);
outBuf.append('\t');
outBuf.append(_varRef);
outBuf.append('\t');
outBuf.append(_varAlt);
outBuf.append('\t');
outBuf.append(_score);
for (int i= 0; i < (int)_otherIdxs.size(); i++) {
outBuf.append('\t');
......@@ -98,10 +102,10 @@ const QuickString &VcfRecord::getField(int fieldNum) const
case 3:
return _name;
case 4:
return _varAlt;
return _varRef;
break;
case 5:
return _varRef;
return _varAlt;
break;
case 6:
return _score;
......
......@@ -32,8 +32,8 @@ public:
virtual const QuickString &getField(int fieldNum) const;
protected:
QuickString _varAlt;
QuickString _varRef;
QuickString _varAlt;
void printOtherFields(QuickString &outBuf) const;
};
......
......@@ -210,6 +210,7 @@ void RecordOutputMgr::printRecord(RecordKeyVector &keyList, RecordKeyVector *blo
if ((static_cast<ContextIntersect *>(_context))->getWriteAllOverlap()) {
// -wao the user wants to force the reporting of 0 overlap
if (printKeyAndTerminate(keyList)) {
_currBamBlockList = NULL;
return;
}
......@@ -223,6 +224,7 @@ void RecordOutputMgr::printRecord(RecordKeyVector &keyList, RecordKeyVector *blo
else if ((static_cast<ContextIntersect *>(_context))->getLeftJoin()) {
if (printKeyAndTerminate(keyList)) {
_currBamBlockList = NULL;
return;
}
tab();
......@@ -230,11 +232,13 @@ void RecordOutputMgr::printRecord(RecordKeyVector &keyList, RecordKeyVector *blo
newline();
if (needsFlush()) flush();
_currBamBlockList = NULL;
return;
}
} else {
if (printBamRecord(keyList, true) == BAM_AS_BAM) {
_currBamBlockList = NULL;
return;
}
int hitIdx = 0;
......@@ -252,14 +256,17 @@ void RecordOutputMgr::printRecord(RecordKeyVector &keyList, RecordKeyVector *blo
newline();
}
_currBamBlockList = NULL;
return;
} else if (_context->getProgram() == ContextBase::MAP) {
printKeyAndTerminate(keyList);
_currBamBlockList = NULL;
return;
} else if (_context->getProgram() == ContextBase::MERGE) {
printKeyAndTerminate(keyList);
_currBamBlockList = NULL;
return;
}
}
......@@ -380,6 +387,7 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record
newline();
if (needsFlush()) flush();
}
const_cast<Record *>(keyRecord)->adjustZeroLength();
}
void RecordOutputMgr::reportOverlapSummary(RecordKeyVector &keyList)
......
......@@ -2,6 +2,9 @@
#include <climits>
#include <cctype>
#include <cstring>
#include <cstdio>
#include <cstdlib>
#include <sstream>
//This functions recognizes only numbers with digits, plus sign, minus sign, decimal point, e, or E. Hexadecimal and pointers not currently supported.
bool isNumeric(const QuickString &str) {
......@@ -19,12 +22,26 @@ int str2chrPos(const QuickString &str) {
}
int str2chrPos(const char *str, size_t ulen) {
if (ulen == 0) {
ulen = strlen(str);
}
//first test for exponents / scientific notation
bool hasExponent = false;
for (size_t i=0; i < ulen; i++) {
if (str[i] == 'e' || str[i] == 'E') {
std::istringstream ss(str);
double retVal;
ss >> retVal;
return (int)retVal;
}
}
int len=(int)ulen;
if (len < 1 || len > 10) {
return INT_MIN; //can't do more than 9 digits and a minus sign
fprintf(stderr, "***** ERROR: too many digits/characters for integer conversion in string %s. Exiting...\n", str);
exit(1);
}
register int sum=0;
......@@ -39,9 +56,15 @@ int str2chrPos(const char *str, size_t ulen) {
for (int i=startPos; i < len; i++) {
char currChar = str[i];
if (currChar == 'e' || currChar == 'E') {
//default to atoi for scientific notation
return atoi(str);
}
if (!isdigit(currChar)) {
return INT_MIN;
fprintf(stderr, "***** ERROR: illegal character '%c' found in integer conversion of string %s. Exiting...\n", currChar, str);
exit(1);
}
int dig = currChar - 48; //ascii code for zero.
int power = len -i -1;
......@@ -77,7 +100,7 @@ int str2chrPos(const char *str, size_t ulen) {
sum += dig *1000000000;
break;
default:
return INT_MIN;
return 0;
break;
}
}
......
chr1 5 15 r1
chr1 7 12 r3
chr1 20 25 r2
##fileformat=VCFv4.1
19 252806 791255 G <DEL> 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 260365 791256 C <DEL> 33.71 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-680;END=261045;STR=+-:4;IMPRECISE;CIPOS=-1,257;CIEND=0,0;EVENT=791256;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=upstream_gene_variant|||ENSG00000271846|CTD-3113P16.9|ENST00000607399|||||processed_pseudogene
19 265134 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 265986 791258 A <DEL> 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
chr1 9 9 m3
chr1 50 150 m1
chr2 20 25 m2
##fileformat=VCFv4.1
19 256900 791255 G T 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 260800 791256 C <INS> 33.71 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=680;END=261045;STR=+-:4;IMPRECISE;CIPOS=-1,257;CIEND=0,0;EVENT=791256;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=upstream_gene_variant|||ENSG00000271846|CTD-3113P16.9|ENST00000607399|||||processed_pseudogene
19 265500 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 266003 791258 A C 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
......@@ -764,3 +764,53 @@ $BT intersect -a oneRecordNoNewline.bed -b oneRecordNoNewline.bed > obs
check obs exp
rm obs exp
###########################################################
# Test zero length intersections in non-bam files.
############################################################
echo " intersect.new.t65...\c"
echo \
"chr1 5 15 r1 chr1 9 9 m3 0
chr1 7 12 r3 chr1 9 9 m3 0" > exp
$BT intersect -a a_testZeroLen.bed -b b_testZeroLen.bed -wo > obs
check exp obs
rm exp obs
###########################################################
# Test zero length intersections in non-bam files, -sorted
############################################################
echo " intersect.new.t66...\c"
echo \
"chr1 5 15 r1 chr1 9 9 m3 0
chr1 7 12 r3 chr1 9 9 m3 0" > exp
$BT intersect -a a_testZeroLen.bed -b b_testZeroLen.bed -wo -sorted> obs
check exp obs
rm exp obs
###########################################################
# Test vcf struct var intersection
############################################################
echo " intersect.new.t67...\c"
echo \
"19 252806 791255 G <DEL> 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant|||||||||| 19 256900 791255 G T 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 260365 791256 C <DEL> 33.71 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-680;END=261045;STR=+-:4;IMPRECISE;CIPOS=-1,257;CIEND=0,0;EVENT=791256;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=upstream_gene_variant|||ENSG00000271846|CTD-3113P16.9|ENST00000607399|||||processed_pseudogene 19 260800 791256 C <INS> 33.71 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=680;END=261045;STR=+-:4;IMPRECISE;CIPOS=-1,257;CIEND=0,0;EVENT=791256;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=upstream_gene_variant|||ENSG00000271846|CTD-3113P16.9|ENST00000607399|||||processed_pseudogene
19 265134 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant|||||||||| 19 265500 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 265986 791258 A <DEL> 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant|||||||||| 19 265500 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 265986 791258 A <DEL> 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant|||||||||| 19 266003 791258 A C 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||" > exp
$BT intersect -a a_vcfSVtest.vcf -b b_vcfSVtest.vcf -wa -wb >obs
check exp obs
rm exp obs
###########################################################
# Test vcf struct var intersection, sorted
############################################################
echo " intersect.new.t68...\c"
echo \
"19 252806 791255 G <DEL> 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant|||||||||| 19 256900 791255 G T 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 260365 791256 C <DEL> 33.71 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-680;END=261045;STR=+-:4;IMPRECISE;CIPOS=-1,257;CIEND=0,0;EVENT=791256;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=upstream_gene_variant|||ENSG00000271846|CTD-3113P16.9|ENST00000607399|||||processed_pseudogene 19 260800 791256 C <INS> 33.71 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=680;END=261045;STR=+-:4;IMPRECISE;CIPOS=-1,257;CIEND=0,0;EVENT=791256;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=upstream_gene_variant|||ENSG00000271846|CTD-3113P16.9|ENST00000607399|||||processed_pseudogene
19 265134 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant|||||||||| 19 265500 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 265986 791258 A <DEL> 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant|||||||||| 19 265500 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 265986 791258 A <DEL> 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant|||||||||| 19 266003 791258 A C 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||" > exp
$BT intersect -a a_vcfSVtest.vcf -b b_vcfSVtest.vcf -wa -wb -sorted >obs
check exp obs
rm exp obs
chr1 8e02 830
......@@ -518,3 +518,27 @@ chr1 30 100" > exp
$BT merge -i a.bed -iobuf 8192 > obs
check exp obs
rm exp obs
###########################################################
# Test that scientific notation is allowed for coordinates
###########################################################
echo " merge.t43...\c"
echo \
"chr1 800 830" > exp
$BT merge -i expFormat.bed > obs
check exp obs
rm obs exp
###########################################################
# Test that struct vars in VCF get correct length
###########################################################
echo " merge.t44...\c"
echo \
"19 252805 257416
19 260364 261044
19 265133 265691
19 265985 266386" > exp
$BT merge -i vcfSVtest.vcf > obs
check exp obs
rm obs exp
##fileformat=VCFv4.1
19 252806 791255 G <DEL> 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 260365 791256 C <DEL> 33.71 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-680;END=261045;STR=+-:4;IMPRECISE;CIPOS=-1,257;CIEND=0,0;EVENT=791256;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=upstream_gene_variant|||ENSG00000271846|CTD-3113P16.9|ENST00000607399|||||processed_pseudogene
19 265134 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 265986 791258 A <DEL> 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment