From 1688ec28c8bceb2419d8904117bd1ee10b28ef0b Mon Sep 17 00:00:00 2001 From: Mathijs de Bruin Date: Thu, 28 Dec 2017 17:08:12 +0000 Subject: [PATCH] Language detection. --- pom.xml | 33 +++++++++++++++++++ .../java/com/ipfssearch/ipfstika/App.java | 15 ++++++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index f094eb6..a95f8b0 100644 --- a/pom.xml +++ b/pom.xml @@ -61,6 +61,12 @@ tika-parsers 1.17 + + + org.apache.tika + tika-langdetect + 1.17 + com.google.code.gson @@ -75,5 +81,32 @@ 1.7.25 compile + + + org.xerial + sqlite-jdbc + 3.21.0.1 + + + + com.levigo.jbig2 + levigo-jbig2-imageio + 2.0 + test + + + com.github.jai-imageio + jai-imageio-core + 1.3.1 + test + + + com.github.jai-imageio + jai-imageio-jpeg2000 + 1.3.0 + test + diff --git a/src/main/java/com/ipfssearch/ipfstika/App.java b/src/main/java/com/ipfssearch/ipfstika/App.java index 4954771..3fdc712 100644 --- a/src/main/java/com/ipfssearch/ipfstika/App.java +++ b/src/main/java/com/ipfssearch/ipfstika/App.java @@ -20,6 +20,8 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.language.detect.LanguageHandler; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.langdetect.OptimaizeLangDetector; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.Link; import org.apache.tika.exception.TikaException; @@ -114,9 +116,11 @@ private String getResponse(String path) throws IOException { AutoDetectParser parser = new AutoDetectParser(); LinkContentHandler link_handler = new LinkContentHandler(); BodyContentHandler body_handler = new BodyContentHandler(10*1024*1024); - // This causes weird crashes - // LanguageHandler language_handler = new LanguageHandler(); - TeeContentHandler handler = new TeeContentHandler(link_handler, body_handler); + + LanguageDetector language_detector = new OptimaizeLangDetector().loadModels(); + LanguageHandler language_handler = new LanguageHandler(language_detector); + + TeeContentHandler handler = new TeeContentHandler(link_handler, body_handler, language_handler); Metadata metadata = new Metadata(); // Set filename from path string @@ -141,16 +145,17 @@ private String getResponse(String path) throws IOException { /* Now return JSON with: { + "metadata": metadata, "language": language_handler.getLanguage(), "content": body_handler.toString(), - "links": links, - "metadata": metadata + "urls": links, } */ Gson gson = new Gson(); JsonObject output_json = gson.toJsonTree(metadata).getAsJsonObject(); output_json.add("content", gson.toJsonTree(body_handler.toString().trim())); output_json.add("urls", gson.toJsonTree(links)); + output_json.add("language", gson.toJsonTree(language_handler.getLanguage())); return output_json.toString(); }