From fab561751530ed01d7db843250d88941e3663152 Mon Sep 17 00:00:00 2001
From: Piotr Gawron <piotr.gawron@uni.lu>
Date: Mon, 31 Jul 2017 15:17:57 +0200
Subject: [PATCH] parsing of drugbank targets improved

Previously, when a target had no references, some of the targets that followed it could be skipped.
---
 .../services/DrugbankHTMLParser.java          | 126 ++++++++++--------
 .../services/DrugbankHTMLParserTest.java      |   9 ++
 .../testFiles/drugbank/target-html-part.html  |   1 +
 3 files changed, 81 insertions(+), 55 deletions(-)
 create mode 100644 annotation/testFiles/drugbank/target-html-part.html

diff --git a/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java b/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java
index fcdf9c0225..537492673d 100644
--- a/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java
+++ b/annotation/src/main/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParser.java
@@ -300,7 +300,7 @@ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalServi
 	List<Target> getTargetsForDrug(String page) throws DrugSearchException {
 		List<Target> result = new ArrayList<>();
 		try {
-			int i, j;
+			int i;
 			Target target = new Target();
 			target.setType(TargetType.SINGLE_PROTEIN);
 
@@ -316,63 +316,18 @@ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalServi
 
 			i = page.indexOf("Details</a></div>", i);
 			while (i > 0 && i < end) {
-				int kindIndex = page.indexOf("<dt>Kind</dt><dd>", i);
-				kindIndex += "<dt>Kind</dt><dd>".length();
-				int endKindIndex = page.indexOf("</dd>", kindIndex);
-				String type = "";
-				if (kindIndex > 0 && endKindIndex > kindIndex) {
-					type = page.substring(kindIndex, endKindIndex);
+				int targetStart = page.indexOf("</div><strong>", i);
+				int nextTargetStart = page.indexOf("</div><strong>", targetStart + 1);
+				if (nextTargetStart < 0) {
+					nextTargetStart = end;
 				}
-				if (type.trim().equalsIgnoreCase("Protein")) {
-					i = page.indexOf("/biodb/polypeptides/", i);
-					i += "/biodb/polypeptides/".length();
-					target = new Target();
-					target.setType(TargetType.SINGLE_PROTEIN);
-
-					// Getting ID && Name
-					j = page.indexOf('"', i);
-					String uniprotId = page.substring(i, j);
-					MiriamData uniprotTarget = new MiriamData(MiriamType.UNIPROT, uniprotId);
-					MiriamData hgncTarget = uniprotAnnotator.uniProtToHgnc(uniprotTarget);
-					if (hgncTarget != null) {
-						target.addGene(hgncTarget);
-					} else {
-						target.addGene(uniprotTarget);
-					}
-
-					i = j + 2;
-					j = page.indexOf("</", i);
-					String name = StringEscapeUtils.unescapeHtml4(page.substring(i, j));
-					target.setName(name);
-
-					// Getting Organism
-					i = page.indexOf("Organism</dt><dd>", i);
-					i = i + "Organism</dt><dd>".length();
-					j = page.indexOf("</dd>", i);
-					target.setOrganism(getTaxonomyBackend().getByName(page.substring(i, j)));
-
-					// Getting References
-					i = page.indexOf("<strong>References</strong>", i);
-					if (i > 0 && i < end) {
-						int nextI = Math.min(page.indexOf("Details</a></div>", i), end);
-						if (nextI < 0) {
-							nextI = end;
-						}
-						target.addReferences(getPubmedFromRef(page.substring(i, nextI)));
-						i = nextI;
-					} else {
-						i = end;
-					}
+
+				target = parseTarget(page.substring(targetStart, nextTargetStart));
+				if (target != null) {
 					result.add(target);
-				} else {
-					i = kindIndex;
-					logger.warn("Unknown target type: " + type + ". Skipping.");
-					int nextI = Math.min(page.indexOf("Details</a></div>", i), end);
-					if (nextI < 0) {
-						nextI = end;
-					}
-					i = nextI;
 				}
+				i = nextTargetStart;
+
 			}
 		} catch (TaxonomySearchException e) {
 			throw new DrugSearchException("Problem with finidng information about organism", e);
@@ -382,6 +337,67 @@ public class DrugbankHTMLParser extends DrugAnnotation implements IExternalServi
 		return result;
 	}
 
+	/**
+	 * Parse html info about a single target into {@link Target} structure.
+	 * 
+	 * @param htmlPage
+	 *          string with html content describing exactly one target
+	 * @return {@link Target} for given html content, or <code>null</code> when
+	 *         the "Kind" entry of the target is missing or is not "Protein"
+	 * @throws UniprotSearchException
+	 *           thrown when there is a problem with accessing uniprot db
+	 * @throws TaxonomySearchException
+	 *           thrown when there is a problem with accessing taxonomy db
+	 */
+	protected Target parseTarget(String htmlPage) throws UniprotSearchException, TaxonomySearchException {
+		String kindTag = "<dt>Kind</dt><dd>";
+		String type = "";
+		int kindIndex = htmlPage.indexOf(kindTag);
+		if (kindIndex >= 0) {
+			// test the raw indexOf result: once the tag length is added, "not found" (-1) looks like a valid index
+			kindIndex += kindTag.length();
+			int endKindIndex = htmlPage.indexOf("</dd>", kindIndex);
+			if (endKindIndex > kindIndex) {
+				type = htmlPage.substring(kindIndex, endKindIndex);
+			}
+		}
+		if (!type.trim().equalsIgnoreCase("Protein")) {
+			logger.warn("Unknown target type: " + type + ". Skipping.");
+			return null;
+		}
+		Target result = new Target();
+		result.setType(TargetType.SINGLE_PROTEIN);
+
+		// Getting ID: text between the polypeptide link prefix and the quote closing the href
+		int uniprotIdStart = htmlPage.indexOf("/biodb/polypeptides/") + "/biodb/polypeptides/".length();
+		int uniprotIdEnd = htmlPage.indexOf('"', uniprotIdStart);
+		String uniprotId = htmlPage.substring(uniprotIdStart, uniprotIdEnd);
+		MiriamData uniprotTarget = new MiriamData(MiriamType.UNIPROT, uniprotId);
+		MiriamData hgncTarget = uniprotAnnotator.uniProtToHgnc(uniprotTarget);
+		result.addGene(hgncTarget != null ? hgncTarget : uniprotTarget);
+
+		// Getting Name: starts after the two characters following the id and runs to the next closing tag
+		int nameStart = uniprotIdEnd + 2;
+		int nameEnd = htmlPage.indexOf("</", nameStart);
+		result.setName(StringEscapeUtils.unescapeHtml4(htmlPage.substring(nameStart, nameEnd)));
+
+		// Getting Organism
+		int organismStart = htmlPage.indexOf("Organism</dt><dd>", nameEnd) + "Organism</dt><dd>".length();
+		int organismEnd = htmlPage.indexOf("</dd>", organismStart);
+		result.setOrganism(getTaxonomyBackend().getByName(htmlPage.substring(organismStart, organismEnd)));
+
+		// Getting References (the section is optional)
+		int referencesStart = htmlPage.indexOf("<strong>References</strong>", organismEnd);
+		if (referencesStart > 0) {
+			int referencesEnd = htmlPage.indexOf("Details</a></div>", referencesStart);
+			if (referencesEnd < 0) {
+				referencesEnd = htmlPage.length();
+			}
+			result.addReferences(getPubmedFromRef(htmlPage.substring(referencesStart, referencesEnd)));
+		}
+		return result;
+	}
+
 	/**
 	 * Finds information about drug in drugbank database.
 	 * 
diff --git a/annotation/src/test/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParserTest.java b/annotation/src/test/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParserTest.java
index 5b0dafb3ff..3e53239be1 100644
--- a/annotation/src/test/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParserTest.java
+++ b/annotation/src/test/java/lcsb/mapviewer/annotation/services/DrugbankHTMLParserTest.java
@@ -711,4 +711,13 @@ public class DrugbankHTMLParserTest extends AnnotationTestFunctions {
 		assertEquals(uniprotAnnotator, parser.getUniprotAnnotator());
 	}
 
+	@Test
+	public void parseTarget() throws Exception {
+		// the fixture is an html fragment of one target whose reference list links two PubMed entries
+		Target parsed = drugBankHTMLParser.parseTarget(super.readFile("testFiles/drugbank/target-html-part.html"));
+		assertNotNull(parsed);
+		assertEquals(2, parsed.getReferences().size());
+		assertEquals("Galactoside O-acetyltransferase", parsed.getName());
+	}
+	
 }
diff --git a/annotation/testFiles/drugbank/target-html-part.html b/annotation/testFiles/drugbank/target-html-part.html
new file mode 100644
index 0000000000..904b85d584
--- /dev/null
+++ b/annotation/testFiles/drugbank/target-html-part.html
@@ -0,0 +1 @@
+</div><strong>1. <a href="/biodb/polypeptides/P07464">Galactoside O-acetyltransferase</a></strong></div><div class="panel-body"><dl class="dl-horizontal"><dt>Kind</dt><dd>Protein</dd><dt>Organism</dt><dd>Escherichia coli (strain K12)</dd><dt>Pharmacological action</dt><dd><strong class="label label-warning">unknown</strong></dd></dl><dl class="dl-horizontal"><dt>General Function:</dt><dd>Galactoside o-acetyltransferase activity</dd><dt>Specific Function:</dt><dd>May assist cellular detoxification by acetylating non-metabolizable pyranosides, thereby preventing their reentry into the cell.</dd><dt>Gene Name:</dt><dd>lacA</dd><dt>Uniprot ID:</dt><dd><a target="_blank" class="wishart-link-out" href="http://www.uniprot.org/uniprot/P07464">P07464 <span class="glyphicon glyphicon-new-window"> </span></a></dd><dt>Uniprot Name:</dt><dd>Galactoside O-acetyltransferase</dd><dt>Molecular Weight:</dt><dd>22798.89 Da</dd></dl><h5><strong>References</strong></h5><blockquote class="references"><ol class="cite-this-references"><li id="reference-A1713">Overington JP, Al-Lazikani B, Hopkins AL: How many drug targets are there? Nat Rev Drug Discov. 2006 Dec;5(12):993-6. [<a target="_blank" class="wishart-link-out" href="http://www.ncbi.nlm.nih.gov/pubmed/17139284">PubMed:17139284 <span class="glyphicon glyphicon-new-window"> </span></a>] </li><li id="reference-A1715">Imming P, Sinning C, Meyer A: Drugs, their targets and the nature and number of drug targets. Nat Rev Drug Discov. 2006 Oct;5(10):821-34. [<a target="_blank" class="wishart-link-out" href="http://www.ncbi.nlm.nih.gov/pubmed/17016423">PubMed:17016423 <span class="glyphicon glyphicon-new-window"> </span></a>] </li></ol></blockquote></div></div><div class="panel panel-default" id="BE0000574"><div class="panel-heading"><div class="pull-right"><a class="btn btn-primary btn-xs" href="/biodb/polypeptides/P04035"><span class="glyphicon glyphicon-list"> </span> Details</a>
\ No newline at end of file
-- 
GitLab