diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index ef1d2c98f..6d1c771dd 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -49,7 +49,7 @@ jobs: # Selectively install the optional dependencies for some Python versions # For Python 3.8: if [[ ${{ matrix.python-version }} == '3.8' ]]; then - poetry install -E "nn omikuji yake voikko pycld3"; + poetry install -E "nn omikuji yake voikko lingua"; fi # For Python 3.9: if [[ ${{ matrix.python-version }} == '3.9' ]]; then @@ -62,7 +62,6 @@ jobs: poetry install -E "nn omikuji yake"; fi poetry run python -m nltk.downloader punkt - - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/Dockerfile b/Dockerfile index 5d3475e4b..304a09c49 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.8-slim-bullseye LABEL maintainer="Juho Inkinen " SHELL ["/bin/bash", "-c"] -ARG optional_dependencies="fasttext voikko pycld3 fasttext nn omikuji yake spacy" +ARG optional_dependencies="fasttext voikko lingua fasttext nn omikuji yake spacy" ARG POETRY_VIRTUALENVS_CREATE=false # Install system dependencies needed at runtime: diff --git a/annif/transform/__init__.py b/annif/transform/__init__.py index 61b9a6c59..995fce172 100644 --- a/annif/transform/__init__.py +++ b/annif/transform/__init__.py @@ -48,4 +48,4 @@ def get_transform(transform_specs, project): _transforms.update({langfilter.LangFilter.name: langfilter.LangFilter}) except ImportError: annif.logger.debug( - "pycld3 not available, not enabling filter_language transform") + "Lingua not available, not enabling filter_language transform") diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py index f63f6f97a..68f6ef9ec 100644 --- a/annif/transform/langfilter.py +++ b/annif/transform/langfilter.py @@ -2,7 +2,7 @@ different from the language of the project.""" import annif -import cld3 +import lingua from . import transform logger = annif.logger @@ -16,14 +16,20 @@ def __init__(self, project, text_min_length=500, sentence_min_length=50): super().__init__(project) self.text_min_length = int(text_min_length) self.sentence_min_length = int(sentence_min_length) + self.detector = ( + lingua.LanguageDetectorBuilder + .from_all_languages() + .with_low_accuracy_mode() + .build() + ) def _detect_language(self, text): """Tries to detect the language of a text input. Outputs a BCP-47-style language code (e.g. 'en').""" - lan_info = cld3.get_language(text) - if lan_info is not None and lan_info.is_reliable: - return lan_info.language + lan_info = self.detector.detect_language_of(text) + if lan_info is not None: + return lan_info.iso_code_639_1.name.lower() else: return None diff --git a/pyproject.toml b/pyproject.toml index 52e1d4a5d..c09cf09e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ tensorflow-cpu = {version = "2.9.1", optional = true} lmdb = {version = "1.3.0", optional = true} omikuji = {version = "0.5.*", optional = true} yake = {version = "0.4.5", optional = true} -pycld3 = {version = "*", optional = true} +lingua-language-detector = {version = "1.1.3", optional = true} spacy = {version = "3.3.*", optional = true} [tool.poetry.dev-dependencies] @@ -79,7 +79,7 @@ voikko = ["voikko"] nn = ["tensorflow-cpu", "lmdb"] omikuji = ["omikuji"] yake = ["yake"] -pycld3 = ["pycld3"] +lingua = ["lingua-language-detector"] spacy = ["spacy"] [tool.poetry.scripts] diff --git a/tests/test_transform_langfilter.py b/tests/test_transform_langfilter.py index b3a2f2d90..3dade8c0b 100644 --- a/tests/test_transform_langfilter.py +++ b/tests/test_transform_langfilter.py @@ -32,7 +32,6 @@ def test_lang_filter(project): Kansalliskirjasto on kaikille avoin kulttuuriperintöorganisaatio, joka palvelee valtakunnallisesti kansalaisia, tiedeyhteisöjä ja muita yhteiskunnan toimijoita. - Abc defghij klmnopqr stuwxyz abc defghij klmnopqr stuwxyz. Turvaamme Suomessa julkaistun tai Suomea koskevan julkaistun kulttuuriperinnön saatavuuden sekä välittämme ja tuotamme tietosisältöjä tutkimukselle, opiskelulle, kansalaisille ja