Update to Elasticsearch 8.3.3

This also meant removing the lingua implementation due to issues with the security manager.
spinscale · Aug 22, 2022 · 4f893a1 · 4f893a1
1 parent 5dbec5e
commit 4f893a1
Show file tree

Hide file tree

Showing 8 changed files with 27 additions and 118 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM docker.elastic.co/elasticsearch/elasticsearch:8.3.2
+FROM docker.elastic.co/elasticsearch/elasticsearch:8.3.3
 
 ADD build/distribution/elasticsearch-ingest-langdetect.zip /elasticsearch-ingest-langdetect.zip
 RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch file:///elasticsearch-ingest-langdetect.zip

diff --git a/README.md b/README.md
@@ -1,15 +1,21 @@
 # Elasticsearch Langdetect Ingest Processor
 
-Uses the [langdetect](https://github.com/YouCruit/language-detection/) plugin (or alternatively [lingua](https://github.com/pemistahl/lingua/)) to try to find out the language used in a field.
+Uses the [langdetect](https://github.com/YouCruit/language-detection/) plugin.
 
 Note that Elasticsearch has native support for langdetection nowadays using the
 `inference` ingest processor. See more in
 [the documentation](https://www.elastic.co/guide/en/machine-learning/current/ml-lang-ident.html)
 
+**Note**: As of Elasticsearch 8.3.3, the lingua implementation has been
+removed again due to issues with the security manager. Feel free to check
+out previous commits and create a PR if you get it working, so that it can
+be included again.
+
 ## Installation
 
 | ES    | Command |
 | ----- | ------- |
+| 8.3.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-langdetect/releases/download/8.3.3.1/ingest-langdetect-8.3.3.1.zip` |
 | 8.3.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-langdetect/releases/download/8.3.2.1/ingest-langdetect-8.3.2.1.zip` |
 | 8.3.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-langdetect/releases/download/8.3.1.1/ingest-langdetect-8.3.1.1.zip` |
 | 8.3.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-langdetect/releases/download/8.3.0.1/ingest-langdetect-8.3.0.1.zip` |
@@ -227,12 +233,6 @@ GET my-index/doc/2
 | target_field   | Field name to write the language to |
 | max_length     | Max length of characters to read, defaults to 10kb, requires a byte size value, like 1mb |
 | ignore_missing | Ignore a missing source field instead of throwing an exception. Expects a boolean value, defaults to false. |
-| implementation | **Exists only from 8.0 onwards**: Can be 'lingua' to use the [lingua](https://github.com/pemistahl/lingua/) language detector library, everything else defaults to use the [langdetect](https://github.com/YouCruit/language-detection/) code. |
-
-**Note**: The `lingua` implementation requires **a lot more** more memory
-for your nodes having the ingest role. Please test this before using in
-production. The memory is only used, once the `lingua` processor is actually
-used in a pipeline.
 
 ## Setup
 

diff --git a/build.gradle b/build.gradle
@@ -4,7 +4,7 @@ import org.apache.tools.ant.filters.ReplaceTokens
 plugins {
   // the old co.riiid.gradle is not gradle 7.0 compatible
   id "com.github.humblerookie.gradle" version "0.4.4"
-  id "com.github.ben-manes.versions" version '0.41.0'
+  id "com.github.ben-manes.versions" version '0.42.0'
 }
 
 repositories {
@@ -35,6 +35,7 @@ task copyDependencies(type: Copy) {
   from configurations.default
   from 'NOTICE.txt'
   from 'LICENSE.txt'
+  from 'src/main/resources/plugin-security.policy'
 }
 
 task packageDistribution(type: Zip) {
@@ -109,21 +110,20 @@ githubRelease.doFirst {
 githubRelease.dependsOn 'packageDistribution'
 
 dependencies {
-  def junitVersion = '5.8.2'
+  def junitVersion = '5.9.0'
 
   implementation 'com.youcruit.com.cybozu.labs:langdetect:1.1.2-20151117'
-  implementation 'com.github.pemistahl:lingua:1.1.1'
   implementation 'net.arnx:jsonic:1.3.10'
   compileOnly "org.elasticsearch:elasticsearch:$elasticsearchVersion"
 
   testImplementation "org.elasticsearch:elasticsearch:$elasticsearchVersion"
   testImplementation "co.elastic.clients:elasticsearch-java:$elasticsearchVersion"
-  testImplementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.2'
-  testImplementation('org.testcontainers:elasticsearch:1.17.1') {
+  testImplementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
+  testImplementation('org.testcontainers:elasticsearch:1.17.3') {
     exclude group: 'junit', module: 'junit'
   }
-  testImplementation 'org.testcontainers:junit-jupiter:1.17.1'
-  testImplementation "org.assertj:assertj-core:3.22.0"
+  testImplementation 'org.testcontainers:junit-jupiter:1.17.3'
+  testImplementation 'org.assertj:assertj-core:3.23.1'
   testImplementation "org.slf4j:slf4j-simple:1.7.36"
   testImplementation "org.junit.jupiter:junit-jupiter-api:${junitVersion}"
   testImplementation "org.junit.jupiter:junit-jupiter-params:${junitVersion}"

diff --git a/gradle.properties b/gradle.properties
@@ -1,2 +1 @@
-elasticsearchVersion = 8.3.2
-org.gradle.jvmargs=-Xmx4g
+elasticsearchVersion = 8.3.3
diff --git a/src/main/java/org/elasticsearch/plugin/ingest/langdetect/IngestLangDetectPlugin.java b/src/main/java/org/elasticsearch/plugin/ingest/langdetect/IngestLangDetectPlugin.java
@@ -19,8 +19,6 @@
 
 import com.cybozu.labs.langdetect.LangDetectException;
 import com.cybozu.labs.langdetect.SecureDetectorFactory;
-import com.github.pemistahl.lingua.api.LanguageDetector;
-import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ingest.Processor;
 import org.elasticsearch.plugins.IngestPlugin;
@@ -30,13 +28,9 @@
 import java.net.URISyntaxException;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.concurrent.atomic.AtomicReference;
-import java.util.function.Supplier;
 
 public class IngestLangDetectPlugin extends Plugin implements IngestPlugin {
 
-    private AtomicReference<LanguageDetector> languageDetector = new AtomicReference<>();
-
     @Override
     public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
         try {
@@ -45,32 +39,8 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
             throw new ElasticsearchException(e);
         }
 
-        // this lazy loads the lingua supplier, as it needs crazy amounts of memory, which should only be used, if the user uses
-        // the lingua implementation in one of the processors
-        Supplier<LanguageDetector> supplier = () -> {
-            final LanguageDetector languageDetector = this.languageDetector.get();
-            if (languageDetector == null) {
-                final LanguageDetector detector = LanguageDetectorBuilder.fromAllLanguages().withPreloadedLanguageModels().build();
-                final boolean updatedSuccessfully = this.languageDetector.compareAndSet(null, detector);
-                if (updatedSuccessfully == false) {
-                    detector.destroy();
-                }
-                return this.languageDetector.get();
-            }
-            return languageDetector;
-        };
-
         Map<String, Processor.Factory> factoryMap = new HashMap<>(1);
-        factoryMap.put(LangDetectProcessor.TYPE, new LangDetectProcessor.Factory(supplier));
+        factoryMap.put(LangDetectProcessor.TYPE, new LangDetectProcessor.Factory());
         return factoryMap;
     }
-
-    @Override
-    public void close() throws IOException {
-        super.close();
-        final LanguageDetector detector = this.languageDetector.get();
-        if (detector != null) {
-            detector.destroy();
-        }
-    }
 }
diff --git a/src/main/java/org/elasticsearch/plugin/ingest/langdetect/LangDetectProcessor.java b/src/main/java/org/elasticsearch/plugin/ingest/langdetect/LangDetectProcessor.java
@@ -19,8 +19,6 @@
 
 import com.cybozu.labs.langdetect.Detector;
 import com.cybozu.labs.langdetect.DetectorFactory;
-import com.github.pemistahl.lingua.api.Language;
-import com.github.pemistahl.lingua.api.LanguageDetector;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
@@ -29,13 +27,9 @@
 import org.elasticsearch.ingest.IngestDocument;
 import org.elasticsearch.ingest.Processor;
 
-import java.util.Locale;
 import java.util.Map;
-import java.util.function.Supplier;
 
-import static org.elasticsearch.ingest.ConfigurationUtils.readBooleanProperty;
-import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalStringProperty;
-import static org.elasticsearch.ingest.ConfigurationUtils.readStringProperty;
+import static org.elasticsearch.ingest.ConfigurationUtils.*;
 
 public class LangDetectProcessor extends AbstractProcessor {
 
@@ -84,40 +78,22 @@ public String getType() {
     public static final class Factory implements Processor.Factory {
 
         private static final ByteSizeValue DEFAULT_MAX_LENGTH = new ByteSizeValue(10, ByteSizeUnit.KB);
-        private final Supplier<LanguageDetector> languageDetector;
-
-        public Factory(Supplier<LanguageDetector> languageDetector) {
-            this.languageDetector = languageDetector;
-        }
 
         @Override
         public Processor create(Map<String, Processor.Factory> processorFactories, String tag, String description,
                                 Map<String, Object> config) throws Exception {
             String field = readStringProperty(TYPE, tag, config, "field");
             String targetField = readStringProperty(TYPE, tag, config, "target_field");
             String maxLengthStr = readOptionalStringProperty(TYPE, tag, config, "max_length");
-            String implementation = readOptionalStringProperty(TYPE, tag, config, "implementation");
             ByteSizeValue maxLength = ByteSizeValue.parseBytesSizeValue(maxLengthStr, DEFAULT_MAX_LENGTH, "max_length");
             boolean ignoreMissing = readBooleanProperty(TYPE, tag, config, "ignore_missing", false);
 
-            CheckedFunction<String, String, Exception> langDetector;
-            if ("lingua".equals(implementation)) {
-                langDetector = input -> {
-                    if (maxLength != null && input.length() > maxLength.getBytes()) {
-                        input = input.substring(0, Long.valueOf(maxLength.getBytes()).intValue());
-                    }
-
-                    Language detectedLanguage = languageDetector.get().detectLanguageOf(input);
-                    return detectedLanguage.getIsoCode639_1().name().toLowerCase(Locale.ROOT);
-                };
-            } else {
-                langDetector = input -> {
-                    Detector detector = DetectorFactory.create();
-                    detector.setMaxTextLength(Long.valueOf(maxLength.getBytes()).intValue());
-                    detector.append(input);
-                    return detector.detect();
-                };
-            }
+            CheckedFunction<String, String, Exception> langDetector = input -> {
+                Detector detector = DetectorFactory.create();
+                detector.setMaxTextLength(Long.valueOf(maxLength.getBytes()).intValue());
+                detector.append(input);
+                return detector.detect();
+            };
 
             return new LangDetectProcessor(tag, description, field, targetField, ignoreMissing, langDetector);
         }

diff --git a/.../java/org/elasticsearch/plugin/ingest/langdetect/LangDetectProcessorIntegrationTests.java b/.../java/org/elasticsearch/plugin/ingest/langdetect/LangDetectProcessorIntegrationTests.java
@@ -11,11 +11,7 @@
 import co.elastic.clients.transport.rest_client.RestClientTransport;
 import org.apache.http.HttpHost;
 import org.elasticsearch.client.RestClient;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Tag;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
 import org.slf4j.LoggerFactory;
 import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.containers.output.Slf4jLogConsumer;
@@ -47,7 +43,7 @@ public static void startContainer() {
         container.withEnv("xpack.security.enabled", "false");
         container.withEnv("ES_JAVA_OPTS", "-Xms4g -Xmx4g");
         container.addExposedPorts(9200);
-        container.setWaitStrategy(new LogMessageWaitStrategy().withRegEx(".*(\"message\":\\s?\"started[\"| ].*|] started\n$)"));
+        container.setWaitStrategy(new LogMessageWaitStrategy().withRegEx(".*(\"message\":\\s?\"started[\\s?|\"].*|] started\n$)"));
 
         container.start();
         container.followOutput(new Slf4jLogConsumer(LoggerFactory.getLogger(LangDetectProcessorIntegrationTests.class)));
@@ -92,13 +88,6 @@ public void testLangDetectProcessorInPipeline() throws Exception {
               "field" : "field1",
               "target_field" : "field1_language"
             }
-          },
-          {
-            "langdetect" : {
-              "field" : "field1",
-              "target_field" : "field1_lingua",
-              "implementation" : "lingua"
-            }
           }
         ]
       }
@@ -122,6 +111,5 @@ public void testLangDetectProcessorInPipeline() throws Exception {
         GetResponse<Map> getResponse = client.get(b -> b.index("test").id("1"), Map.class);
         Map<String, Object> source = getResponse.source();
         assertThat(source).containsEntry("field1_language", "en");
-        assertThat(source).containsEntry("field1_lingua", "en");
     }
 }
diff --git a/src/test/java/org/elasticsearch/plugin/ingest/langdetect/LangDetectProcessorTests.java b/src/test/java/org/elasticsearch/plugin/ingest/langdetect/LangDetectProcessorTests.java
@@ -19,14 +19,10 @@
 
 import com.cybozu.labs.langdetect.LangDetectException;
 import com.cybozu.labs.langdetect.SecureDetectorFactory;
-import com.github.pemistahl.lingua.api.Language;
-import com.github.pemistahl.lingua.api.LanguageDetector;
-import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.ingest.IngestDocument;
 import org.elasticsearch.ingest.Processor;
-import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
@@ -41,8 +37,6 @@
 
 public class LangDetectProcessorTests {
 
-    private static LanguageDetector languageDetector;
-
     @TempDir
     public static Path folder;
 
@@ -51,14 +45,6 @@ public static void loadProfiles() throws Exception {
         Settings settings = Settings.builder().put("path.home", folder).build();
         Environment environment = new Environment(settings, folder);
         SecureDetectorFactory.loadProfileFromClassPath(environment);
-
-        // instead of loading all languages, reduce this to the minimum to keep the test fast!
-        languageDetector = LanguageDetectorBuilder.fromLanguages(Language.ENGLISH, Language.GERMAN).build();
-    }
-
-    @AfterAll
-    public static void stopLanguageDetector() {
-        languageDetector.destroy();
     }
 
     @Test
@@ -69,16 +55,6 @@ public void testThatProcessorWorks() throws Exception {
         assertThat(data).containsEntry("language", "en");
     }
 
-    @Test
-    public void testThatLinguaImplementationWorks() throws Exception {
-        final Map<String, Object> config = config("source_field", "language", false);
-        config.put("implementation", "lingua");
-        Map<String, Object> data = ingestDocument(config,
-                "source_field", "This is hopefully an english text, that will be detected.");
-
-        assertThat(data).containsEntry("language", "en");
-    }
-
     @Test
     public void testMaxLengthConfiguration() throws Exception {
         Map<String, Object> config = config("source_field", "language", false);
@@ -126,7 +102,7 @@ private Map<String, Object> ingestDocument(Map<String, Object> config, String fi
         document.put(field, value);
         IngestDocument ingestDocument = new IngestDocument(document, Collections.emptyMap());
 
-        Processor processor = new LangDetectProcessor.Factory(() -> languageDetector)
+        Processor processor = new LangDetectProcessor.Factory()
                 .create(Collections.emptyMap(), "my-tag", "desc", config);
         return processor.execute(ingestDocument).getSourceAndMetadata();
     }