Skip to content
Snippets Groups Projects
Commit 9b8e84ba authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Merge pull request #245 from nkindlon/master

Addressed >2 numbers in VCF SVLEN, can appear at endl w/ tab, semi-co…
parents 33f2bef8 9724fbbe
No related branches found
No related tags found
No related merge requests found
...@@ -139,6 +139,11 @@ bool SingleLineDelimTextFileReader::findDelimiters() { ...@@ -139,6 +139,11 @@ bool SingleLineDelimTextFileReader::findDelimiters() {
} }
int SingleLineDelimTextFileReader::getVcfSVlen() { int SingleLineDelimTextFileReader::getVcfSVlen() {
// The SVLEN field can appear anywhere in the info tag, and its value may be followed by a semicolon, tab, or newline, or may run to the end of the line (the terminating NULL).
// It can contain one, two, or more numbers, separated by commas.
// We want the smallest absolute value among those numbers.
int startPos = _delimPositions[VCF_TAG_FIELD] +1; int startPos = _delimPositions[VCF_TAG_FIELD] +1;
const char *startPtr = strstr(_sLine.c_str() + startPos, "SVLEN="); const char *startPtr = strstr(_sLine.c_str() + startPos, "SVLEN=");
if (startPtr == NULL) { if (startPtr == NULL) {
...@@ -146,16 +151,30 @@ int SingleLineDelimTextFileReader::getVcfSVlen() { ...@@ -146,16 +151,30 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
return 1; return 1;
} }
startPtr +=6; // length of label "SVLEN=" startPtr +=6; // length of label "SVLEN="
const char *endPtr = strchr(startPtr, ';'); const char *currPtr = startPtr;
const char *midPtr = strchr(startPtr, ','); const char *endPtr = _sLine.c_str() + _sLine.size();
int endCoord = -1;
if (midPtr != NULL && midPtr < endPtr) { int minVal = INT_MAX;
//comma found in the number, that means there are two numbers int currVal = 0;
int num1 = str2chrPos(startPtr, midPtr - startPtr); QuickString currValStr;
int num2 = str2chrPos(midPtr +1, endPtr - (midPtr +1)); while (1) {
endCoord = min(abs(num1), abs(num2)); if (currPtr == endPtr || *currPtr == ';' || *currPtr == '\t' || *currPtr == '\n' || *currPtr == ',') {
} else { if (currPtr > startPtr) {
endCoord = abs(str2chrPos(startPtr, endPtr - startPtr)); currValStr.assign(startPtr, currPtr - startPtr);
} currVal = abs(str2chrPos(currValStr));
return endCoord; if (currVal < minVal) minVal = currVal;
startPtr = currPtr;
}
if (currPtr == endPtr || *currPtr != ',') {
//if end of line or non-comma delimiter, break.
break;
} else {
//skip the comma
startPtr++;
}
}
currPtr++;
};
return minVal;
} }
##fileformat=VCFv4.1
chr1 1 a G <DEL> 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
chr1 4 a G <DEL> 70.90 . TOOL=LUMPY;SVTYPE=DEL;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||;SVLEN=-45,800,2
\ No newline at end of file
##fileformat=VCFv4.1
chr1 1 a G <DEL> 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
chr1 4 a G <DEL> 70.90 . TOOL=LUMPY;SVTYPE=DEL;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||;SVLEN=-10
\ No newline at end of file
...@@ -582,6 +582,49 @@ $BT intersect -a bug223_sv1_a.vcf -b bug223_sv1_b.vcf | cut -f1-6 > obs ...@@ -582,6 +582,49 @@ $BT intersect -a bug223_sv1_a.vcf -b bug223_sv1_b.vcf | cut -f1-6 > obs
check exp obs check exp obs
rm exp obs rm exp obs
##################################################################
# Check that SVLEN in VCF files is handled when it holds multiple
# comma-separated numbers and sits at the end of the line,
# followed by NULL (i.e. nothing comes after the value).
##################################################################
# Print the test label. NOTE(review): the "\c" is presumably meant to
# suppress echo's trailing newline; that depends on the echo in use.
echo " intersect.t48...\c"
# Expected result: columns 1-6 of every record the intersect reports.
echo \
"chr1 1 a G <DEL> 70.90
chr1 1 a G <DEL> 70.90
chr1 4 a G <DEL> 70.90
chr1 4 a G <DEL> 70.90" > exp
# Intersect the input file with itself and keep only columns 1-6.
$BT intersect -a bug223_d.vcf -b bug223_d.vcf | cut -f1-6 > obs
# `check` is defined outside this excerpt; presumably it compares exp to obs.
check exp obs
# Remove the temporary expected/observed files.
rm exp obs
##################################################################
# Check that SVLEN in VCF files is handled when it holds multiple
# comma-separated numbers and sits at the end of the line,
# followed by a tab.
##################################################################
# Print the test label. NOTE(review): the "\c" is presumably meant to
# suppress echo's trailing newline; that depends on the echo in use.
echo " intersect.t49...\c"
# Expected result: columns 1-6 of every record the intersect reports.
echo \
"chr1 1 a G <DEL> 70.90
chr1 1 a G <DEL> 70.90
chr1 4 a G <DEL> 70.90
chr1 4 a G <DEL> 70.90" > exp
# Intersect the input file with itself and keep only columns 1-6.
$BT intersect -a bug223_e.vcf -b bug223_e.vcf | cut -f1-6 > obs
# `check` is defined outside this excerpt; presumably it compares exp to obs.
check exp obs
# Remove the temporary expected/observed files.
rm exp obs
##################################################################
# Check that SVLEN in VCF files is handled when it holds a single
# number and sits at the end of the line, followed by NULL
# (i.e. nothing comes after the value).
##################################################################
# Print the test label. NOTE(review): the "\c" is presumably meant to
# suppress echo's trailing newline; that depends on the echo in use.
echo " intersect.t50...\c"
# Expected result: columns 1-6 of every record the intersect reports.
echo \
"chr1 1 a G <DEL> 70.90
chr1 1 a G <DEL> 70.90
chr1 4 a G <DEL> 70.90
chr1 4 a G <DEL> 70.90" > exp
# Intersect the input file with itself and keep only columns 1-6.
$BT intersect -a bug223_f.vcf -b bug223_f.vcf | cut -f1-6 > obs
# `check` is defined outside this excerpt; presumably it compares exp to obs.
check exp obs
# Remove the temporary expected/observed files.
rm exp obs
cd multi_intersect cd multi_intersect
bash test-multi_intersect.sh bash test-multi_intersect.sh
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment