From fab561751530ed01d7db843250d88941e3663152 Mon Sep 17 00:00:00 2001 From: Piotr Gawron <piotr.gawron@uni.lu> Date: Mon, 31 Jul 2017 15:17:57 +0200 Subject: [PATCH] parsing of drugbank targets improved when target didn't have references some targets might get skipped --- .../services/DrugbankHTMLParser.java | 126 ++++++++++-------- .../services/DrugbankHTMLParserTest.java | 9 ++ .../testFiles/drugbank/target-html-part.html | 1 + 3 files changed, 81 insertions(+), 55 deletions(-) create mode 100644 annotation/testFiles/drugbank/target-html-part.html diff --git a/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java b/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java index fcdf9c0225..537492673d 100644 --- a/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java +++ b/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java @@ -300,7 +300,7 @@ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalServi List<Target> getTargetsForDrug(String page) throws DrugSearchException { List<Target> result = new ArrayList<>(); try { - int i, j; + int i; Target target = new Target(); target.setType(TargetType.SINGLE_PROTEIN); @@ -316,63 +316,18 @@ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalServi i = page.indexOf("Details</a></div>", i); while (i > 0 && i < end) { - int kindIndex = page.indexOf("<dt>Kind</dt><dd>", i); - kindIndex += "<dt>Kind</dt><dd>".length(); - int endKindIndex = page.indexOf("</dd>", kindIndex); - String type = ""; - if (kindIndex > 0 && endKindIndex > kindIndex) { - type = page.substring(kindIndex, endKindIndex); + int targetStart = page.indexOf("</div><strong>", i); + int nextTargetStart = page.indexOf("</div><strong>", targetStart + 1); + if (nextTargetStart < 0) { + nextTargetStart = end; } - if (type.trim().equalsIgnoreCase("Protein")) { - i = page.indexOf("/biodb/polypeptides/", i); 
- i += "/biodb/polypeptides/".length(); - target = new Target(); - target.setType(TargetType.SINGLE_PROTEIN); - - // Getting ID && Name - j = page.indexOf('"', i); - String uniprotId = page.substring(i, j); - MiriamData uniprotTarget = new MiriamData(MiriamType.UNIPROT, uniprotId); - MiriamData hgncTarget = uniprotAnnotator.uniProtToHgnc(uniprotTarget); - if (hgncTarget != null) { - target.addGene(hgncTarget); - } else { - target.addGene(uniprotTarget); - } - - i = j + 2; - j = page.indexOf("</", i); - String name = StringEscapeUtils.unescapeHtml4(page.substring(i, j)); - target.setName(name); - - // Getting Organism - i = page.indexOf("Organism</dt><dd>", i); - i = i + "Organism</dt><dd>".length(); - j = page.indexOf("</dd>", i); - target.setOrganism(getTaxonomyBackend().getByName(page.substring(i, j))); - - // Getting References - i = page.indexOf("<strong>References</strong>", i); - if (i > 0 && i < end) { - int nextI = Math.min(page.indexOf("Details</a></div>", i), end); - if (nextI < 0) { - nextI = end; - } - target.addReferences(getPubmedFromRef(page.substring(i, nextI))); - i = nextI; - } else { - i = end; - } + + target = parseTarget(page.substring(targetStart, nextTargetStart)); + if (target != null) { result.add(target); - } else { - i = kindIndex; - logger.warn("Unknown target type: " + type + ". Skipping."); - int nextI = Math.min(page.indexOf("Details</a></div>", i), end); - if (nextI < 0) { - nextI = end; - } - i = nextI; } + i = nextTargetStart; + } } catch (TaxonomySearchException e) { throw new DrugSearchException("Problem with finidng information about organism", e); @@ -382,6 +337,67 @@ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalServi return result; } + /** + * Parse html info about target into {@link Target} structure. 
+ * + * @param htmlPage + * string with html content + * @return {@link Target} for given html content, or {@code null} when the target kind is not "Protein" + * @throws UniprotSearchException + * thrown when there is a problem with accessing uniprot db + * @throws TaxonomySearchException + * thrown when there is a problem with accessing taxonomy db + */ + protected Target parseTarget(String htmlPage) throws UniprotSearchException, TaxonomySearchException { + int kindIndex = htmlPage.indexOf("<dt>Kind</dt><dd>"); + kindIndex += "<dt>Kind</dt><dd>".length(); + int endKindIndex = htmlPage.indexOf("</dd>", kindIndex); + String type = ""; + if (kindIndex > 0 && endKindIndex > kindIndex) { + type = htmlPage.substring(kindIndex, endKindIndex); + } + if (type.trim().equalsIgnoreCase("Protein")) { + int uniprotIdStart = htmlPage.indexOf("/biodb/polypeptides/") + "/biodb/polypeptides/".length(); + Target result = new Target(); + result.setType(TargetType.SINGLE_PROTEIN); + + // Getting ID && Name + int uniprotIdEnd = htmlPage.indexOf('"', uniprotIdStart); + String uniprotId = htmlPage.substring(uniprotIdStart, uniprotIdEnd); + MiriamData uniprotTarget = new MiriamData(MiriamType.UNIPROT, uniprotId); + MiriamData hgncTarget = uniprotAnnotator.uniProtToHgnc(uniprotTarget); + if (hgncTarget != null) { + result.addGene(hgncTarget); + } else { + result.addGene(uniprotTarget); + } + + int nameStart = uniprotIdEnd + 2; + int nameEnd = htmlPage.indexOf("</", nameStart); + String name = StringEscapeUtils.unescapeHtml4(htmlPage.substring(nameStart, nameEnd)); + result.setName(name); + + // Getting Organism + int organismStart = htmlPage.indexOf("Organism</dt><dd>", nameEnd) + "Organism</dt><dd>".length(); + int organismEnd = htmlPage.indexOf("</dd>", organismStart); + result.setOrganism(getTaxonomyBackend().getByName(htmlPage.substring(organismStart, organismEnd))); + + // Getting References + int referencesStart = htmlPage.indexOf("<strong>References</strong>", organismEnd); + if (referencesStart > 0) { + int referencesEnd = 
Math.min(htmlPage.indexOf("Details</a></div>", referencesStart), htmlPage.length()); + if (referencesEnd < 0) { + referencesEnd = htmlPage.length(); + } + result.addReferences(getPubmedFromRef(htmlPage.substring(referencesStart, referencesEnd))); + } + return result; + } else { + logger.warn("Unknown target type: " + type + ". Skipping."); + return null; + } + } + /** * Finds information about drug in drugbank database. * diff --git a/annotation/src/test/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParserTest.java b/annotation/src/test/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParserTest.java index 5b0dafb3ff..3e53239be1 100644 --- a/annotation/src/test/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParserTest.java +++ b/annotation/src/test/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParserTest.java @@ -711,4 +711,13 @@ public class DrugbankHTMLParserTest extends AnnotationTestFunctions { assertEquals(uniprotAnnotator, parser.getUniprotAnnotator()); } + @Test + public void parseTarget() throws Exception { + String content = super.readFile("testFiles/drugbank/target-html-part.html"); + Target target = drugBankHTMLParser.parseTarget(content); + assertNotNull(target); + assertEquals(2, target.getReferences().size()); + assertEquals("Galactoside O-acetyltransferase", target.getName()); + } + } diff --git a/annotation/testFiles/drugbank/target-html-part.html b/annotation/testFiles/drugbank/target-html-part.html new file mode 100644 index 0000000000..904b85d584 --- /dev/null +++ b/annotation/testFiles/drugbank/target-html-part.html @@ -0,0 +1 @@ +</div><strong>1. 
<a href="/biodb/polypeptides/P07464">Galactoside O-acetyltransferase</a></strong></div><div class="panel-body"><dl class="dl-horizontal"><dt>Kind</dt><dd>Protein</dd><dt>Organism</dt><dd>Escherichia coli (strain K12)</dd><dt>Pharmacological action</dt><dd><strong class="label label-warning">unknown</strong></dd></dl><dl class="dl-horizontal"><dt>General Function:</dt><dd>Galactoside o-acetyltransferase activity</dd><dt>Specific Function:</dt><dd>May assist cellular detoxification by acetylating non-metabolizable pyranosides, thereby preventing their reentry into the cell.</dd><dt>Gene Name:</dt><dd>lacA</dd><dt>Uniprot ID:</dt><dd><a target="_blank" class="wishart-link-out" href="http://www.uniprot.org/uniprot/P07464">P07464 <span class="glyphicon glyphicon-new-window"> </span></a></dd><dt>Uniprot Name:</dt><dd>Galactoside O-acetyltransferase</dd><dt>Molecular Weight:</dt><dd>22798.89 Da</dd></dl><h5><strong>References</strong></h5><blockquote class="references"><ol class="cite-this-references"><li id="reference-A1713">Overington JP, Al-Lazikani B, Hopkins AL: How many drug targets are there? Nat Rev Drug Discov. 2006 Dec;5(12):993-6. [<a target="_blank" class="wishart-link-out" href="http://www.ncbi.nlm.nih.gov/pubmed/17139284">PubMed:17139284 <span class="glyphicon glyphicon-new-window"> </span></a>] </li><li id="reference-A1715">Imming P, Sinning C, Meyer A: Drugs, their targets and the nature and number of drug targets. Nat Rev Drug Discov. 2006 Oct;5(10):821-34. [<a target="_blank" class="wishart-link-out" href="http://www.ncbi.nlm.nih.gov/pubmed/17016423">PubMed:17016423 <span class="glyphicon glyphicon-new-window"> </span></a>] </li></ol></blockquote></div></div><div class="panel panel-default" id="BE0000574"><div class="panel-heading"><div class="pull-right"><a class="btn btn-primary btn-xs" href="/biodb/polypeptides/P04035"><span class="glyphicon glyphicon-list"> </span> Details</a> \ No newline at end of file -- GitLab