From 0a9503521364c8ebaf318559cdd8222cec5ee10c Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:02:30 +0200
Subject: [PATCH 1/5] first implementation of EstNLTK analyzer support

---
 annif/analyzer/__init__.py     |  7 +++++
 annif/analyzer/estnltk.py      | 28 +++++++++++++++++++
 pyproject.toml                 |  2 ++
 tests/test_analyzer_estnltk.py | 49 ++++++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+)
 create mode 100644 annif/analyzer/estnltk.py
 create mode 100644 tests/test_analyzer_estnltk.py

diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py
index 27a2cd792..6efa12ab1 100644
--- a/annif/analyzer/__init__.py
+++ b/annif/analyzer/__init__.py
@@ -46,6 +46,13 @@ def get_analyzer(analyzerspec: str) -> Analyzer:
 except ImportError:
     annif.logger.debug("voikko not available, not enabling voikko analyzer")
 
+try:
+    from . import estnltk
+
+    register_analyzer(estnltk.EstNLTKAnalyzer)
+except ImportError:
+    annif.logger.debug("EstNLTK not available, not enabling estnltk analyzer")
+
 try:
     from . import spacy
 
diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
new file mode 100644
index 000000000..1836af70e
--- /dev/null
+++ b/annif/analyzer/estnltk.py
@@ -0,0 +1,28 @@
+"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization"""
+
+from __future__ import annotations
+
+import annif.util
+from annif.exception import OperationFailedException
+
+from . import analyzer
+
+
+class EstNLTKAnalyzer(analyzer.Analyzer):
+    name = "estnltk"
+
+    def __init__(self, param: str, **kwargs) -> None:
+        self.param = param
+        super().__init__(**kwargs)
+
+    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
+        import estnltk
+
+        txt = estnltk.Text(text.strip())
+        txt.tag_layer()
+        lemmas = [
+            lemma
+            for lemma in [l[0] for l in txt.lemma]
+            if (not filter or self.is_valid_token(lemma))
+        ]
+        return lemmas
diff --git a/pyproject.toml b/pyproject.toml
index de8410e67..924ae9ba1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ huggingface-hub = "~0.25.1"
 
 fasttext-wheel = { version = "0.9.2", optional = true }
 voikko = { version = "0.5.*", optional = true }
+estnltk = { version = "1.7.3", optional = true }
 tensorflow-cpu = { version = "~2.17.0", optional = true }
 lmdb = { version = "~1.5.1", optional = true }
 omikuji = { version = "0.5.*", optional = true }
@@ -73,6 +74,7 @@ schemathesis = "3.*.*"
 [tool.poetry.extras]
 fasttext = ["fasttext-wheel"]
 voikko = ["voikko"]
+estnltk = ["estnltk"]
 nn = ["tensorflow-cpu", "lmdb"]
 omikuji = ["omikuji"]
 yake = ["yake"]
diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py
new file mode 100644
index 000000000..611688089
--- /dev/null
+++ b/tests/test_analyzer_estnltk.py
@@ -0,0 +1,49 @@
+"""Unit tests for EstNLTK analyzer in Annif"""
+
+import pytest
+
+import annif.analyzer
+
+estnltk = pytest.importorskip("annif.analyzer.estnltk")
+
+
+def test_estnltk_tokenize_words():
+    analyzer = annif.analyzer.get_analyzer("estnltk")
+    words = analyzer.tokenize_words(
+        """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """
+    )
+    assert words == [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+    ]
+
+
+def test_estnltk_tokenize_words_no_filter():
+    analyzer = annif.analyzer.get_analyzer("estnltk")
+    words = analyzer.tokenize_words(
+        """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """,
+        filter=False,
+    )
+    assert words == [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        ".",
+        "ka",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+        ".",
+    ]

From 68c2a6ecc8742716f873557ca6c22f587d0f8ebd Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:05:59 +0200
Subject: [PATCH 2/5] add estnltk dependency to CI/CD tests for Python 3.10

---
 .github/workflows/cicd.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml
index 8172bcc29..e044ec954 100644
--- a/.github/workflows/cicd.yml
+++ b/.github/workflows/cicd.yml
@@ -85,7 +85,7 @@ jobs:
           fi
           # For Python 3.10:
           if [[ ${{ matrix.python-version }} == '3.10' ]]; then
-            poetry install -E "fasttext spacy";
+            poetry install -E "fasttext spacy estnltk";
             # download the small English pretrained spaCy model needed by spacy analyzer
             poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
           fi

From 74dd477126d0d44071b12e95164af04c49b4a91a Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:13:22 +0200
Subject: [PATCH 3/5] remove unused imports

---
 annif/analyzer/estnltk.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
index 1836af70e..c320d1701 100644
--- a/annif/analyzer/estnltk.py
+++ b/annif/analyzer/estnltk.py
@@ -2,9 +2,6 @@
 
 from __future__ import annotations
 
-import annif.util
-from annif.exception import OperationFailedException
-
 from . import analyzer
 
 

From a076607f6fd64c0d915cc728f814f1310e0a462b Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:13:40 +0200
Subject: [PATCH 4/5] fix test for estnltk install

---
 tests/test_analyzer_estnltk.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py
index 611688089..5449efcfc 100644
--- a/tests/test_analyzer_estnltk.py
+++ b/tests/test_analyzer_estnltk.py
@@ -4,7 +4,7 @@
 
 import annif.analyzer
 
-estnltk = pytest.importorskip("annif.analyzer.estnltk")
+estnltk = pytest.importorskip("estnltk")
 
 
 def test_estnltk_tokenize_words():

From d2a005199ff33b8e328f54eba19505baa4f7e46a Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:14:41 +0200
Subject: [PATCH 5/5] refactor code to avoid flake8 warning

---
 annif/analyzer/estnltk.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
index c320d1701..9829496ed 100644
--- a/annif/analyzer/estnltk.py
+++ b/annif/analyzer/estnltk.py
@@ -17,9 +17,8 @@ def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
 
         txt = estnltk.Text(text.strip())
         txt.tag_layer()
-        lemmas = [
+        return [
             lemma
-            for lemma in [l[0] for l in txt.lemma]
+            for lemma in [lemmas[0] for lemmas in txt.lemma]
             if (not filter or self.is_valid_token(lemma))
         ]
-        return lemmas
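
Usage sketch (not part of the patch series itself): a minimal example of
exercising the new analyzer, mirroring the unit tests added in PATCH 1/5.
It assumes the optional dependency is installed, e.g. via
"poetry install -E estnltk"; the expected lemmas are taken from the
assertions in test_estnltk_tokenize_words.

    import annif.analyzer

    # The spec "estnltk" resolves to EstNLTKAnalyzer through the
    # register_analyzer() call added in annif/analyzer/__init__.py.
    analyzer = annif.analyzer.get_analyzer("estnltk")

    # tokenize_words() lemmatizes via EstNLTK; with the default filter=True,
    # tokens rejected by is_valid_token() (such as punctuation) are dropped.
    words = analyzer.tokenize_words("Aga kõik juhtus iseenesest.")
    assert words == ["aga", "kõik", "juhtuma", "iseenesest"]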