anHALytics · lfoppiano · Aug 24, 2020 · Aug 25, 2020 · Aug 25, 2020
diff --git a/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/HalTEIConverter.java b/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/HalTEIConverter.java
@@ -204,19 +204,21 @@ private Element createMetadataTEIHeader(NodeList stuffToTake, Document doc) {
     private void parseOrgsAddress(Document doc, NodeList orgs) {
         Node org = null;
         GrobidService gs = new GrobidService();
+        DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory
+                .newInstance();
         for (int i = orgs.getLength() - 1; i >= 0; i--) {
             org = orgs.item(i);
             if (org.getNodeType() == Node.ELEMENT_NODE) {
                 Element orgElt = (Element) orgs.item(i);
                 NodeList addressNodes = orgElt.getElementsByTagName("addrLine");
 
                 NodeList orgNameNodes = orgElt.getElementsByTagName("orgName");
-                String orgNameStr = "";
+                StringBuilder orgNameStr = new StringBuilder();
                 Node orgNameNode = null;
                 for (int y = orgNameNodes.getLength() - 1; y >= 0; y--) {
                     orgNameNode = orgNameNodes.item(y);
                     if (orgNameNode.getNodeType() == Node.ELEMENT_NODE) {
-                        orgNameStr += !orgNameStr.isEmpty() ? " "+orgNameNode.getTextContent():orgNameNode.getTextContent();
+                        orgNameStr.append((orgNameStr.length() > 0) ? " " + orgNameNode.getTextContent() : orgNameNode.getTextContent());
                     }
                 }
 
@@ -231,8 +233,7 @@ private void parseOrgsAddress(Document doc, NodeList orgs) {
                     if (addrLine != null && isNotBlank(addrLine.getTextContent())) {
                         grobidResponse = gs.processAffiliation(orgNameStr + " " + addrLine.getTextContent() + " " + countryCode);
                         try {
-                            Element node = DocumentBuilderFactory
-                                    .newInstance()
+                            Element node = documentBuilderFactory
                                     .newDocumentBuilder()
                                     .parse(new ByteArrayInputStream(grobidResponse.getBytes()))
                                     .getDocumentElement();

diff --git a/...ytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/IstexTEIConverter.java b/...ytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/IstexTEIConverter.java
@@ -112,6 +112,8 @@ private void updateKeywords(Document metadata) {
     private void parseAffiliationString(Document doc, NodeList affs) {
         Node aff = null;
         GrobidService gs = new GrobidService();
+        DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory
+                .newInstance();
         for (int i = affs.getLength() - 1; i >= 0; i--) {
             aff = affs.item(i);
             if (aff.getNodeType() == Node.ELEMENT_NODE) {
@@ -125,8 +127,7 @@ private void parseAffiliationString(Document doc, NodeList affs) {
                     try {
                         // (HACK)Grobid may split affiliation string into two affiliation elements, which is considered not well-formed.
                         grobidResponse = "<wrap>" + grobidResponse + "</wrap>";
-                        Element node = DocumentBuilderFactory
-                                .newInstance()
+                        Element node = documentBuilderFactory
                                 .newDocumentBuilder()
                                 .parse(new ByteArrayInputStream(grobidResponse.getBytes()))
                                 .getDocumentElement();

diff --git a/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/crossref/CrossRef.java b/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/crossref/CrossRef.java
@@ -1,25 +1,26 @@
 package fr.inria.anhalytics.harvest.crossref;
 
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import fr.inria.anhalytics.commons.exceptions.ServiceException;
 import fr.inria.anhalytics.commons.exceptions.SystemException;
 import fr.inria.anhalytics.commons.managers.MongoFileManager;
 import fr.inria.anhalytics.commons.properties.HarvestProperties;
-import com.fasterxml.jackson.databind.*;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NodeList;
+
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathFactory;
-
-import java.io.*;
-
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.net.HttpURLConnection;
 import java.net.URL;
 
@@ -65,26 +66,16 @@ public class CrossRef {
 
     private MongoFileManager mm;
 
-    private DocumentBuilder docBuilder;
-
     public CrossRef() {
         this.mm = MongoFileManager.getInstance(false);
-
-        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
-        docFactory.setValidating(false);
-        //docFactory.setNamespaceAware(true);
-        try {
-            docBuilder = docFactory.newDocumentBuilder();
-        } catch (ParserConfigurationException e) {
-            throw new SystemException("Cannot instantiate CrossRef parser", e);
-        }
     }
 
     /**
      * Try to consolidate some uncertain bibliographical data with crossref web
      * service based on core metadata
      */
     public void findDois() {
+//        XPath xPath = XPathFactory.newInstance().newXPath()
 //        String doi = "";
 //        String aut = "";
 //        String title = "";
@@ -277,7 +268,7 @@ private String getMetadataByDoi(String doi) throws Exception {
     }
 
     private HttpURLConnection openConnection(URL url) {
-        HttpURLConnection  urlConn;
+        HttpURLConnection urlConn;
         try {
             urlConn = (HttpURLConnection) url.openConnection();
         } catch (IOException e) {
@@ -293,10 +284,20 @@ private HttpURLConnection openConnection(URL url) {
     /**
      * Try to consolidate some uncertain bibliographical data with crossref web
      * service based on title and first author.
-     *
      */
     private String queryCrossref(String query) throws Exception {
 
+        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+        docFactory.setValidating(false);
+        //docFactory.setNamespaceAware(true);
+        DocumentBuilder docBuilder = null;
+
+        try {
+            docBuilder = docFactory.newDocumentBuilder();
+        } catch (ParserConfigurationException e) {
+            throw new SystemException("Cannot instantiate CrossRef parser", e);
+        }
+
         String doi = "";
         // we check if the entry is not already in the DB
 

diff --git a/anhalytics-kb/src/main/java/fr/inria/anhalytics/kb/datamine/KnowledgeBaseFeeder.java b/anhalytics-kb/src/main/java/fr/inria/anhalytics/kb/datamine/KnowledgeBaseFeeder.java
@@ -71,6 +71,17 @@ public void initKnowledgeBase() {
             initResult = mm.initObjects(null, MongoFileManager.ONLY_NOT_MINED_INIT_KB_PROCESS);
         }
 
+        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+        docFactory.setValidating(false);
+        //docFactory.setNamespaceAware(true);
+        // The builder is created once here and used to parse each TEI stream in the loop below.
+        DocumentBuilder docBuilder = null;
+        try {
+            docBuilder = docFactory.newDocumentBuilder();
+        } catch (ParserConfigurationException e) {
+            logger.error("Cannot instantiate the TEI document builder. ", e);
+        }
+
         if (initResult) {
             while (mm.hasMore()) {
                 BiblioObject biblioObject = mm.nextBiblioObject();
@@ -81,18 +92,18 @@ public void initKnowledgeBase() {
                 adf.openTransaction();
                 Document teiDoc = null;
                 try {
-                    InputStream teiStream = new ByteArrayInputStream(mm.getTEICorpus(biblioObject).getBytes());
-                    DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
-                    docFactory.setValidating(false);
-                    //docFactory.setNamespaceAware(true);
-                    DocumentBuilder docBuilder = null;
+                    InputStream teiStream = null;
                     try {
-                        docBuilder = docFactory.newDocumentBuilder();
+                        teiStream = new ByteArrayInputStream(mm.getTEICorpus(biblioObject).getBytes());
                         teiDoc = docBuilder.parse(teiStream);
                     } catch (Exception e) {
                         logger.error("Error when parsing TEI stream. ", e);
+                    } finally {
+                        if (teiStream != null) {
+                            teiStream.close();
+                        }
+
                     }
-                    teiStream.close();
 
                     Publication pub = new Publication();
 
@@ -138,7 +149,7 @@ public void initKnowledgeBase() {
                     processPersons(editors, "editor", pub, teiDoc, authorsFromfulltextTeiHeader);
 
                     logger.info("#################################################################");
-                } catch(NumberOfCoAuthorsExceededException e) {
+                } catch (NumberOfCoAuthorsExceededException e) {
                     logger.warn("Skipping publication, number of coauthors are exceeding 30", e);
                     adf.rollback();
                     teiDoc = null;