From ce6f0ca6abb0b13245a91d585b62b456bf0e7e09 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Tue, 5 Nov 2024 17:25:23 +0100
Subject: [PATCH] add rules for Estonian

Add suffix-rewriting rules for the Estonian suffixes -line, -kond and
-nik and register them under the "et" language code. The rules are only
tried on tokens of at least 10 characters whose first character is not
uppercase; other tokens yield None. A test for "keelkondade" is added.

---
 Review note: compared with the original commit, the "-nik" pattern has
 one alternative corrected ("nikudde" -> "nikkude", the genitive plural
 on which the other "nikkude..." alternatives are built).

 simplemma/strategies/defaultrules/__init__.py |  2 ++
 simplemma/strategies/defaultrules/et.py       | 29 +++++++++++++++++++
 tests/strategies/defaultrules/test_rules.py   | 10 ++++++-
 3 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 simplemma/strategies/defaultrules/et.py

diff --git a/simplemma/strategies/defaultrules/__init__.py b/simplemma/strategies/defaultrules/__init__.py
index cd2344f..2107dfd 100644
--- a/simplemma/strategies/defaultrules/__init__.py
+++ b/simplemma/strategies/defaultrules/__init__.py
@@ -4,6 +4,7 @@
 
 from .de import apply_de
 from .en import apply_en
+from .et import apply_et
 from .fi import apply_fi
 from .nl import apply_nl
 from .pl import apply_pl
@@ -12,6 +13,7 @@
 DEFAULT_RULES: Dict[str, Callable[[str], Optional[str]]] = {
     "de": apply_de,
     "en": apply_en,
+    "et": apply_et,
     "fi": apply_fi,
     "nl": apply_nl,
     "pl": apply_pl,
diff --git a/simplemma/strategies/defaultrules/et.py b/simplemma/strategies/defaultrules/et.py
new file mode 100644
index 0000000..d798163
--- /dev/null
+++ b/simplemma/strategies/defaultrules/et.py
@@ -0,0 +1,29 @@
+import re
+from typing import Optional
+
+from .generic import apply_rules
+
+DEFAULT_RULES = {
+    # adjectives
+    # https://en.wiktionary.org/wiki/-line
+    re.compile(
+        r"(?:lise|list|lisse|lisesse|lises|lisest|lisele|lisel|liselt|liseks|liseni|lisena|liseta|lisega|lised|liste|lisi|listesse|lisisse|listes|lisis|listest|lisist|listele|lisile|listel|lisil|listelt|lisilt|listeks|lisiks|listeni|listena|listeta|listega)$"
+    ): "line",
+    # nouns
+    # https://en.wiktionary.org/wiki/-kond
+    re.compile(
+        r"(?:konna|konda|konnasse|konnas|konnast|konnale|konnal|konnalt|konnaks|konnani|konnana|konnata|konnaga|konnad|kondade|kondi|kondasid|kondadesse|konnisse|kondades|konnis|kondadest|konnist|kondadele|konnile|kondadel|konnil|kondadelt|konnilt|kondadeks|konniks|kondadeni|kondadena|kondadeta|kondadega)$"
+    ): "kond",
+    # https://en.wiktionary.org/wiki/-nik
+    re.compile(
+        r"(?:niku|nikku|nikusse|nikus|nikust|nikule|nikul|nikult|nikuks|nikuni|nikuna|nikuta|nikuga|nikud|nike|nikkude|nikke|nikusid|nikesse|nikkudesse|nikes|nikkudes|nikest|nikkudest|nikele|nikkudele|nikel|nikkudel|nikelt|nikkudelt|nikeks|nikkudeks|nikeni|nikkudeni|nikena|nikkudena|niketa|nikkudeta|nikega|nikkudega)$"
+    ): "nik",
+}
+
+
+def apply_et(token: str) -> Optional[str]:
+    "Apply pre-defined rules for Estonian."
+    if len(token) < 10 or token[0].isupper():
+        return None
+
+    return apply_rules(token, DEFAULT_RULES)
diff --git a/tests/strategies/defaultrules/test_rules.py b/tests/strategies/defaultrules/test_rules.py
index 37ea474..bc6c61f 100644
--- a/tests/strategies/defaultrules/test_rules.py
+++ b/tests/strategies/defaultrules/test_rules.py
@@ -4,11 +4,19 @@
 def test_DEFAULT_RULES() -> None:
     """Test rules on all available languages."""
     rules_strategy = RulesStrategy()
+
     assert rules_strategy.get_lemma("Pfifferlinge", "de") == "Pfifferling"
-    assert rules_strategy.get_lemma("Pfifferlinge", "en") is None
     assert rules_strategy.get_lemma("atonements", "de") is None
+
     assert rules_strategy.get_lemma("atonements", "en") == "atonement"
+    assert rules_strategy.get_lemma("Pfifferlinge", "en") is None
+
     assert rules_strategy.get_lemma("brieven", "nl") == "brief"
+
     assert rules_strategy.get_lemma("liikenaisessa", "fi") == "liikenainen"
+
     assert rules_strategy.get_lemma("pracowaliście", "pl") == "pracować"
+
     assert rules_strategy.get_lemma("безгра́мотностью", "ru") == "безгра́мотность"
+
+    assert rules_strategy.get_lemma("keelkondade", "et") == "keelkond"