NatLibFi · osma · Nov 12, 2024 · Nov 12, 2024 · Nov 12, 2024 · Nov 12, 2024
diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml
@@ -85,7 +85,7 @@ jobs:
         fi
         # For Python 3.10:
         if [[ ${{ matrix.python-version }} == '3.10' ]]; then
-          poetry install -E "fasttext spacy";
+          poetry install -E "fasttext spacy estnltk";
           # download the small English pretrained spaCy model needed by spacy analyzer
           poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
         fi

diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py
@@ -46,6 +46,19 @@
 except ImportError:
     annif.logger.debug("voikko not available, not enabling voikko analyzer")
 
+try:
+    import importlib.util
+
+    # annif/analyzer/estnltk.py only imports EstNLTK inside tokenize_words, so
+    # importing that module cannot detect a missing library; probe explicitly.
+    if importlib.util.find_spec("estnltk") is None:
+        raise ImportError("EstNLTK is not installed")
+    from . import estnltk
+
+    register_analyzer(estnltk.EstNLTKAnalyzer)
+except ImportError:
+    annif.logger.debug("EstNLTK not available, not enabling estnltk analyzer")
+
 try:
     from . import spacy
 

diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
@@ -0,0 +1,24 @@
+"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization"""
+
+from __future__ import annotations
+
+from . import analyzer
+
+
+class EstNLTKAnalyzer(analyzer.Analyzer):
+    """Annif analyzer that lemmatizes text using EstNLTK."""
+
+    name = "estnltk"
+
+    def __init__(self, param: str, **kwargs) -> None:
+        self.param = param
+        super().__init__(**kwargs)
+
+    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
+        """Lemmatize text with EstNLTK, keeping the first lemma of each token."""
+        import estnltk  # function-level import: EstNLTK is an optional extra
+
+        tagged = estnltk.Text(text.strip())
+        tagged.tag_layer()
+        first_lemmas = (candidates[0] for candidates in tagged.lemma)
+        return [w for w in first_lemmas if not filter or self.is_valid_token(w)]
diff --git a/pyproject.toml b/pyproject.toml
@@ -51,6 +51,7 @@ huggingface-hub = "~0.25.1"
 
 fasttext-wheel = { version = "0.9.2", optional = true }
 voikko = { version = "0.5.*", optional = true }
+estnltk = { version = "1.7.3", optional = true }
 tensorflow-cpu = { version = "~2.17.0", optional = true }
 lmdb = { version = "~1.5.1", optional = true }
 omikuji = { version = "0.5.*", optional = true }
@@ -73,6 +74,7 @@ schemathesis = "3.*.*"
 [tool.poetry.extras]
 fasttext = ["fasttext-wheel"]
 voikko = ["voikko"]
+estnltk = ["estnltk"]
 nn = ["tensorflow-cpu", "lmdb"]
 omikuji = ["omikuji"]
 yake = ["yake"]

diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py
@@ -0,0 +1,49 @@
+"""Unit tests for EstNLTK analyzer in Annif"""
+
+import pytest
+
+import annif.analyzer
+
+estnltk = pytest.importorskip("estnltk")
+
+
+def test_estnltk_tokenize_words():
+    """Default filtering: the "." and "ka" tokens are not in the output."""
+    lemmas = annif.analyzer.get_analyzer("estnltk").tokenize_words(
+        """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """
+    )
+    assert lemmas == [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+    ]
+
+
+def test_estnltk_tokenize_words_no_filter():
+    """With filter=False every lemma is returned, including "." and "ka"."""
+    lemmas = annif.analyzer.get_analyzer("estnltk").tokenize_words(
+        """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """,
+        filter=False,
+    )
+    assert lemmas == [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        ".",
+        "ka",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+        ".",
+    ]