From 0a9503521364c8ebaf318559cdd8222cec5ee10c Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:02:30 +0200
Subject: [PATCH 1/5] first implementation of EstNLTK analyzer support

---
 annif/analyzer/__init__.py     |  7 +++++
 annif/analyzer/estnltk.py      | 28 +++++++++++++++++++
 pyproject.toml                 |  2 ++
 tests/test_analyzer_estnltk.py | 49 ++++++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+)
 create mode 100644 annif/analyzer/estnltk.py
 create mode 100644 tests/test_analyzer_estnltk.py

diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py
index 27a2cd792..6efa12ab1 100644
--- a/annif/analyzer/__init__.py
+++ b/annif/analyzer/__init__.py
@@ -46,6 +46,13 @@ def get_analyzer(analyzerspec: str) -> Analyzer:
 except ImportError:
     annif.logger.debug("voikko not available, not enabling voikko analyzer")
 
+try:
+    from . import estnltk
+
+    register_analyzer(estnltk.EstNLTKAnalyzer)
+except ImportError:
+    annif.logger.debug("EstNLTK not available, not enabling estnltk analyzer")
+
 try:
     from . import spacy
 
diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
new file mode 100644
index 000000000..1836af70e
--- /dev/null
+++ b/annif/analyzer/estnltk.py
@@ -0,0 +1,28 @@
+"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization"""
+
+from __future__ import annotations
+
+import annif.util
+from annif.exception import OperationFailedException
+
+from . import analyzer
+
+
+class EstNLTKAnalyzer(analyzer.Analyzer):
+    name = "estnltk"
+
+    def __init__(self, param: str, **kwargs) -> None:
+        self.param = param
+        super().__init__(**kwargs)
+
+    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
+        import estnltk
+
+        txt = estnltk.Text(text.strip())
+        txt.tag_layer()
+        lemmas = [
+            lemma
+            for lemma in [l[0] for l in txt.lemma]
+            if (not filter or self.is_valid_token(lemma))
+        ]
+        return lemmas
diff --git a/pyproject.toml b/pyproject.toml
index de8410e67..924ae9ba1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ huggingface-hub = "~0.25.1"
 
 fasttext-wheel = { version = "0.9.2", optional = true }
 voikko = { version = "0.5.*", optional = true }
+estnltk = { version = "1.7.3", optional = true }
 tensorflow-cpu = { version = "~2.17.0", optional = true }
 lmdb = { version = "~1.5.1", optional = true }
 omikuji = { version = "0.5.*", optional = true }
@@ -73,6 +74,7 @@ schemathesis = "3.*.*"
 [tool.poetry.extras]
 fasttext = ["fasttext-wheel"]
 voikko = ["voikko"]
+estnltk = ["estnltk"]
 nn = ["tensorflow-cpu", "lmdb"]
 omikuji = ["omikuji"]
 yake = ["yake"]
diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py
new file mode 100644
index 000000000..611688089
--- /dev/null
+++ b/tests/test_analyzer_estnltk.py
@@ -0,0 +1,49 @@
+"""Unit tests for EstNLTK analyzer in Annif"""
+
+import pytest
+
+import annif.analyzer
+
+estnltk = pytest.importorskip("annif.analyzer.estnltk")
+
+
+def test_estnltk_tokenize_words():
+    analyzer = annif.analyzer.get_analyzer("estnltk")
+    words = analyzer.tokenize_words(
+        """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """
+    )
+    assert words == [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+    ]
+
+
+def test_estnltk_tokenize_words_no_filter():
+    analyzer = annif.analyzer.get_analyzer("estnltk")
+    words = analyzer.tokenize_words(
+        """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """,
+        filter=False,
+    )
+    assert words == [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        ".",
+        "ka",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+        ".",
+    ]

From 68c2a6ecc8742716f873557ca6c22f587d0f8ebd Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:05:59 +0200
Subject: [PATCH 2/5] add estnltk dependency to CI/CD tests for Python 3.10

---
 .github/workflows/cicd.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml
index 8172bcc29..e044ec954 100644
--- a/.github/workflows/cicd.yml
+++ b/.github/workflows/cicd.yml
@@ -85,7 +85,7 @@ jobs:
           fi
           # For Python 3.10:
           if [[ ${{ matrix.python-version }} == '3.10' ]]; then
-            poetry install -E "fasttext spacy";
+            poetry install -E "fasttext spacy estnltk";
             # download the small English pretrained spaCy model needed by spacy analyzer
             poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
           fi

From 74dd477126d0d44071b12e95164af04c49b4a91a Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:13:22 +0200
Subject: [PATCH 3/5] remove unused imports

---
 annif/analyzer/estnltk.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
index 1836af70e..c320d1701 100644
--- a/annif/analyzer/estnltk.py
+++ b/annif/analyzer/estnltk.py
@@ -2,9 +2,6 @@
 
 from __future__ import annotations
 
-import annif.util
-from annif.exception import OperationFailedException
-
 from . import analyzer
 
 

From a076607f6fd64c0d915cc728f814f1310e0a462b Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:13:40 +0200
Subject: [PATCH 4/5] fix test for estnltk install

---
 tests/test_analyzer_estnltk.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py
index 611688089..5449efcfc 100644
--- a/tests/test_analyzer_estnltk.py
+++ b/tests/test_analyzer_estnltk.py
@@ -4,7 +4,7 @@
 
 import annif.analyzer
 
-estnltk = pytest.importorskip("annif.analyzer.estnltk")
+estnltk = pytest.importorskip("estnltk")
 
 
 def test_estnltk_tokenize_words():

From d2a005199ff33b8e328f54eba19505baa4f7e46a Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 12 Nov 2024 21:14:41 +0200
Subject: [PATCH 5/5] refactor code to avoid flake8 warning

---
 annif/analyzer/estnltk.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
index c320d1701..9829496ed 100644
--- a/annif/analyzer/estnltk.py
+++ b/annif/analyzer/estnltk.py
@@ -17,9 +17,8 @@ def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
 
         txt = estnltk.Text(text.strip())
         txt.tag_layer()
-        lemmas = [
+        return [
             lemma
-            for lemma in [l[0] for l in txt.lemma]
+            for lemma in [lemmas[0] for lemmas in txt.lemma]
             if (not filter or self.is_valid_token(lemma))
         ]
-        return lemmas
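
Usage sketch (not part of the patch series itself): a minimal example of
exercising the new analyzer, mirroring the unit tests added in PATCH 1/5.
It assumes the optional dependency is installed, e.g. via
"poetry install -E estnltk"; the expected lemmas are taken from the
assertions in test_estnltk_tokenize_words.

    import annif.analyzer

    # The spec "estnltk" resolves to EstNLTKAnalyzer through the
    # register_analyzer() call added in annif/analyzer/__init__.py.
    analyzer = annif.analyzer.get_analyzer("estnltk")

    # tokenize_words() lemmatizes via EstNLTK; with the default filter=True,
    # tokens rejected by is_valid_token() (such as punctuation) are dropped.
    words = analyzer.tokenize_words("Aga kõik juhtus iseenesest.")
    assert words == ["aga", "kõik", "juhtuma", "iseenesest"]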