Skip to content
Snippets Groups Projects
Commit fab56175 authored by Piotr Gawron's avatar Piotr Gawron
Browse files

parsing of drugbank targets improved

When a target didn't have references, some targets might get skipped.
parent 6424d98b
No related branches found
No related tags found
1 merge request!97Resolve "reverse search for drug interaction"
......@@ -300,7 +300,7 @@ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalServi
List<Target> getTargetsForDrug(String page) throws DrugSearchException {
List<Target> result = new ArrayList<>();
try {
int i, j;
int i;
Target target = new Target();
target.setType(TargetType.SINGLE_PROTEIN);
......@@ -316,63 +316,18 @@ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalServi
i = page.indexOf("Details</a></div>", i);
while (i > 0 && i < end) {
int kindIndex = page.indexOf("<dt>Kind</dt><dd>", i);
kindIndex += "<dt>Kind</dt><dd>".length();
int endKindIndex = page.indexOf("</dd>", kindIndex);
String type = "";
if (kindIndex > 0 && endKindIndex > kindIndex) {
type = page.substring(kindIndex, endKindIndex);
int targetStart = page.indexOf("</div><strong>", i);
int nextTargetStart = page.indexOf("</div><strong>", targetStart + 1);
if (nextTargetStart < 0) {
nextTargetStart = end;
}
if (type.trim().equalsIgnoreCase("Protein")) {
i = page.indexOf("/biodb/polypeptides/", i);
i += "/biodb/polypeptides/".length();
target = new Target();
target.setType(TargetType.SINGLE_PROTEIN);
// Getting ID && Name
j = page.indexOf('"', i);
String uniprotId = page.substring(i, j);
MiriamData uniprotTarget = new MiriamData(MiriamType.UNIPROT, uniprotId);
MiriamData hgncTarget = uniprotAnnotator.uniProtToHgnc(uniprotTarget);
if (hgncTarget != null) {
target.addGene(hgncTarget);
} else {
target.addGene(uniprotTarget);
}
i = j + 2;
j = page.indexOf("</", i);
String name = StringEscapeUtils.unescapeHtml4(page.substring(i, j));
target.setName(name);
// Getting Organism
i = page.indexOf("Organism</dt><dd>", i);
i = i + "Organism</dt><dd>".length();
j = page.indexOf("</dd>", i);
target.setOrganism(getTaxonomyBackend().getByName(page.substring(i, j)));
// Getting References
i = page.indexOf("<strong>References</strong>", i);
if (i > 0 && i < end) {
int nextI = Math.min(page.indexOf("Details</a></div>", i), end);
if (nextI < 0) {
nextI = end;
}
target.addReferences(getPubmedFromRef(page.substring(i, nextI)));
i = nextI;
} else {
i = end;
}
target = parseTarget(page.substring(targetStart, nextTargetStart));
if (target != null) {
result.add(target);
} else {
i = kindIndex;
logger.warn("Unknown target type: " + type + ". Skipping.");
int nextI = Math.min(page.indexOf("Details</a></div>", i), end);
if (nextI < 0) {
nextI = end;
}
i = nextI;
}
i = nextTargetStart;
}
} catch (TaxonomySearchException e) {
throw new DrugSearchException("Problem with finidng information about organism", e);
......@@ -382,6 +337,67 @@ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalServi
return result;
}
/**
 * Parse html info about target into {@link Target} structure.
 * <p>
 * Only fragments whose "Kind" field equals "Protein" (case-insensitive) are
 * converted. For such a fragment the uniprot identifier and the name are read
 * from the first {@code /biodb/polypeptides/} link, the organism from the
 * "Organism" field and the references from the "References" section (when
 * present).
 *
 * @param htmlPage
 *          string with html content
 * @return {@link Target} for given html content, or {@code null} when the
 *         kind is not "Protein" or the polypeptide link cannot be located
 * @throws UniprotSearchException
 *           thrown when there is a problem with accessing uniprot db
 * @throws TaxonomySearchException
 *           thrown when there is a problem with accessing taxonomy db
 */
protected Target parseTarget(String htmlPage) throws UniprotSearchException, TaxonomySearchException {
  final String kindMarker = "<dt>Kind</dt><dd>";
  final String polypeptideMarker = "/biodb/polypeptides/";
  final String organismMarker = "Organism</dt><dd>";
  final String fieldEnd = "</dd>";

  // Check the raw indexOf result BEFORE adding the marker length, otherwise a
  // missing marker (-1) turns into a positive offset and is never detected.
  String type = "";
  int kindMarkerIndex = htmlPage.indexOf(kindMarker);
  if (kindMarkerIndex >= 0) {
    int kindStart = kindMarkerIndex + kindMarker.length();
    // Look for the closing tag after the value, not from the start of the page.
    int kindEnd = htmlPage.indexOf(fieldEnd, kindStart);
    if (kindEnd > kindStart) {
      type = htmlPage.substring(kindStart, kindEnd);
    }
  }
  if (!type.trim().equalsIgnoreCase("Protein")) {
    logger.warn("Unknown target type: " + type + ". Skipping.");
    return null;
  }

  // Getting ID && Name
  int linkIndex = htmlPage.indexOf(polypeptideMarker);
  if (linkIndex < 0) {
    logger.warn("Cannot find polypeptide link in target description. Skipping.");
    return null;
  }
  int uniprotIdStart = linkIndex + polypeptideMarker.length();
  int uniprotIdEnd = htmlPage.indexOf('"', uniprotIdStart);
  if (uniprotIdEnd < 0) {
    logger.warn("Cannot find polypeptide link in target description. Skipping.");
    return null;
  }
  // Skip the closing quote of the href attribute and the '>' that ends the tag.
  int nameStart = uniprotIdEnd + 2;
  int nameEnd = htmlPage.indexOf("</", nameStart);
  if (nameEnd < nameStart) {
    logger.warn("Cannot find polypeptide link in target description. Skipping.");
    return null;
  }

  Target result = new Target();
  result.setType(TargetType.SINGLE_PROTEIN);

  String uniprotId = htmlPage.substring(uniprotIdStart, uniprotIdEnd);
  MiriamData uniprotTarget = new MiriamData(MiriamType.UNIPROT, uniprotId);
  MiriamData hgncTarget = uniprotAnnotator.uniProtToHgnc(uniprotTarget);
  // Prefer the HGNC annotation; fall back to the uniprot one when no mapping is returned.
  if (hgncTarget != null) {
    result.addGene(hgncTarget);
  } else {
    result.addGene(uniprotTarget);
  }
  result.setName(StringEscapeUtils.unescapeHtml4(htmlPage.substring(nameStart, nameEnd)));

  // Getting Organism (left unset when the field is not present in the fragment)
  int searchFrom = nameEnd;
  int organismMarkerIndex = htmlPage.indexOf(organismMarker, nameEnd);
  if (organismMarkerIndex >= 0) {
    int organismStart = organismMarkerIndex + organismMarker.length();
    int organismEnd = htmlPage.indexOf(fieldEnd, organismStart);
    if (organismEnd >= organismStart) {
      result.setOrganism(getTaxonomyBackend().getByName(htmlPage.substring(organismStart, organismEnd)));
      searchFrom = organismEnd;
    }
  }

  // Getting References
  int referencesStart = htmlPage.indexOf("<strong>References</strong>", searchFrom);
  if (referencesStart >= 0) {
    // The section ends at the next "Details" button or, failing that, at the end of the fragment.
    int referencesEnd = htmlPage.indexOf("Details</a></div>", referencesStart);
    if (referencesEnd < 0) {
      referencesEnd = htmlPage.length();
    }
    result.addReferences(getPubmedFromRef(htmlPage.substring(referencesStart, referencesEnd)));
  }
  return result;
}
/**
* Finds information about drug in drugbank database.
*
......
......@@ -711,4 +711,13 @@ public class DrugbankHTMLParserTest extends AnnotationTestFunctions {
assertEquals(uniprotAnnotator, parser.getUniprotAnnotator());
}
@Test
public void parseTarget() throws Exception {
  // Parse a stored html fragment and check the references count and the name of the result.
  final String targetHtml = super.readFile("testFiles/drugbank/target-html-part.html");
  final Target parsed = drugBankHTMLParser.parseTarget(targetHtml);

  assertNotNull(parsed);

  final int referenceCount = parsed.getReferences().size();
  assertEquals(2, referenceCount);

  final String parsedName = parsed.getName();
  assertEquals("Galactoside O-acetyltransferase", parsedName);
}
}
</div><strong>1. <a href="/biodb/polypeptides/P07464">Galactoside O-acetyltransferase</a></strong></div><div class="panel-body"><dl class="dl-horizontal"><dt>Kind</dt><dd>Protein</dd><dt>Organism</dt><dd>Escherichia coli (strain K12)</dd><dt>Pharmacological action</dt><dd><strong class="label label-warning">unknown</strong></dd></dl><dl class="dl-horizontal"><dt>General Function:</dt><dd>Galactoside o-acetyltransferase activity</dd><dt>Specific Function:</dt><dd>May assist cellular detoxification by acetylating non-metabolizable pyranosides, thereby preventing their reentry into the cell.</dd><dt>Gene Name:</dt><dd>lacA</dd><dt>Uniprot ID:</dt><dd><a target="_blank" class="wishart-link-out" href="http://www.uniprot.org/uniprot/P07464">P07464 <span class="glyphicon glyphicon-new-window"> </span></a></dd><dt>Uniprot Name:</dt><dd>Galactoside O-acetyltransferase</dd><dt>Molecular Weight:</dt><dd>22798.89 Da</dd></dl><h5><strong>References</strong></h5><blockquote class="references"><ol class="cite-this-references"><li id="reference-A1713">Overington JP, Al-Lazikani B, Hopkins AL: How many drug targets are there? Nat Rev Drug Discov. 2006 Dec;5(12):993-6. [<a target="_blank" class="wishart-link-out" href="http://www.ncbi.nlm.nih.gov/pubmed/17139284">PubMed:17139284 <span class="glyphicon glyphicon-new-window"> </span></a>] </li><li id="reference-A1715">Imming P, Sinning C, Meyer A: Drugs, their targets and the nature and number of drug targets. Nat Rev Drug Discov. 2006 Oct;5(10):821-34. [<a target="_blank" class="wishart-link-out" href="http://www.ncbi.nlm.nih.gov/pubmed/17016423">PubMed:17016423 <span class="glyphicon glyphicon-new-window"> </span></a>] </li></ol></blockquote></div></div><div class="panel panel-default" id="BE0000574"><div class="panel-heading"><div class="pull-right"><a class="btn btn-primary btn-xs" href="/biodb/polypeptides/P04035"><span class="glyphicon glyphicon-list"> </span> Details</a>
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment