Skip to content
This repository has been archived by the owner on Dec 31, 2021. It is now read-only.

Commit

Permalink
Language detection.
Browse files Browse the repository at this point in the history
  • Loading branch information
dokterbob committed Dec 28, 2017
1 parent 0d95017 commit 1688ec2
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 5 deletions.
33 changes: 33 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@
<artifactId>tika-parsers</artifactId>
<version>1.17</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-langdetect -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-langdetect</artifactId>
<version>1.17</version>
</dependency>
<!-- Gson: Java to Json conversion -->
<dependency>
<groupId>com.google.code.gson</groupId>
Expand All @@ -75,5 +81,32 @@
<version>1.7.25</version>
<scope>compile</scope>
</dependency>
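<!-- SQLite JDBC driver: required at runtime by Tika (presumably for its SQLite3 parser, which does not bundle the driver; TODO confirm)
https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc
-->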
<dependency>
<groupId>org.xerial</groupId>
<artifactId>sqlite-jdbc</artifactId>
<version>3.21.0.1</version>
</dependency>
<!-- Optional image-format dependencies (JBIG2, JAI ImageIO core, JPEG 2000); test scope only -->
<dependency>
<groupId>com.levigo.jbig2</groupId>
<artifactId>levigo-jbig2-imageio</artifactId>
<version>2.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId>
<version>1.3.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-jpeg2000</artifactId>
<version>1.3.0</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
15 changes: 10 additions & 5 deletions src/main/java/com/ipfssearch/ipfstika/App.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.language.detect.LanguageHandler;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.langdetect.OptimaizeLangDetector;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.Link;
import org.apache.tika.exception.TikaException;
Expand Down Expand Up @@ -114,9 +116,11 @@ private String getResponse(String path) throws IOException {
AutoDetectParser parser = new AutoDetectParser();
LinkContentHandler link_handler = new LinkContentHandler();
BodyContentHandler body_handler = new BodyContentHandler(10*1024*1024);
// This causes weird crashes
// LanguageHandler language_handler = new LanguageHandler();
TeeContentHandler handler = new TeeContentHandler(link_handler, body_handler);

LanguageDetector language_detector = new OptimaizeLangDetector().loadModels();
LanguageHandler language_handler = new LanguageHandler(language_detector);

TeeContentHandler handler = new TeeContentHandler(link_handler, body_handler, language_handler);
Metadata metadata = new Metadata();

// Set filename from path string
Expand All @@ -141,16 +145,17 @@ private String getResponse(String path) throws IOException {

/* Now return JSON with:
{
"metadata": metadata,
"language": language_handler.getLanguage(),
"content": body_handler.toString(),
"links": links,
"metadata": metadata
"urls": links,
}
*/
Gson gson = new Gson();
JsonObject output_json = gson.toJsonTree(metadata).getAsJsonObject();
output_json.add("content", gson.toJsonTree(body_handler.toString().trim()));
output_json.add("urls", gson.toJsonTree(links));
output_json.add("language", gson.toJsonTree(language_handler.getLanguage()));

return output_json.toString();
}
Expand Down

0 comments on commit 1688ec2

Please sign in to comment.