From 436d4ed5c857d879e8741dbdad8c69d3ce10ea68 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 14 Nov 2024 17:47:09 +0100
Subject: [PATCH] add demo rules for Estonian (#157)

* add rules for Estonian
* test_rules.py: add newline
* better rules and data
* prepare merge
* fix pickler
---
 simplemma/strategies/defaultrules/__init__.py |  2 +
 simplemma/strategies/defaultrules/et.py       | 42 +++++++++++++++++++
 tests/strategies/defaultrules/test_rules.py   | 16 ++++++-
 3 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 simplemma/strategies/defaultrules/et.py

diff --git a/simplemma/strategies/defaultrules/__init__.py b/simplemma/strategies/defaultrules/__init__.py
index 21d8bcc..2e72cb3 100644
--- a/simplemma/strategies/defaultrules/__init__.py
+++ b/simplemma/strategies/defaultrules/__init__.py
@@ -4,6 +4,7 @@
 
 from .de import apply_de
 from .en import apply_en
+from .et import apply_et
 from .fi import apply_fi
 from .lv import apply_lv
 from .nl import apply_nl
@@ -13,6 +14,7 @@
 DEFAULT_RULES: Dict[str, Callable[[str], Optional[str]]] = {
     "de": apply_de,
     "en": apply_en,
+    "et": apply_et,
     "fi": apply_fi,
     "lv": apply_lv,
     "nl": apply_nl,
diff --git a/simplemma/strategies/defaultrules/et.py b/simplemma/strategies/defaultrules/et.py
new file mode 100644
index 0000000..5cbe96a
--- /dev/null
+++ b/simplemma/strategies/defaultrules/et.py
@@ -0,0 +1,42 @@
+import re
+from typing import Optional
+
+from .generic import apply_rules
+
+
+## Just a demo, the rules are really basic and coverage is not good
+
+
+DEFAULT_RULES = {
+    # adjectives
+    # https://en.wiktionary.org/wiki/-line
+    re.compile(
+        r"(?:lise|list|lisse|lisesse|lises|lisest|lisele|lisel|liselt|liseks|liseni|lisena|liseta|lisega|lised|liste|lisi|listesse|lisisse|listes|lisis|listest|lisist|listele|lisile|listel|lisil|listelt|lisilt|listeks|lisiks|listeni|listena|listeta|listega)$"
+    ): "line",
+    # https://en.wiktionary.org/wiki/-mine
+    re.compile(
+        r"(?:mise|mist|misse|misesse|mises|misest|misele|misel|miselt|miseks|miseni|misena|miseta|misega|mised|miste|misi|mistesse|misisse|mistes|misis|mistest|misist|mistele|misile|mistel|misil|mistelt|misilt|misteks|misiks|misteni|mistena|misteta|mistega)$"
+    ): "mine",
+    # nouns
+    # https://en.wiktionary.org/wiki/-dus
+    re.compile(
+        r"(?:duse|dust|dusse|dusesse|duses|dusest|dusele|dusel|duselt|duseks|duseni|dusena|duseta|dusega|dused|duste|dusi|dustesse|dusisse|dustes|dusis|dustest|dusist|dustele|dusile|dustel|dusil|dustelt|dusilt|dusteks|dusiks|dusteni|dustena|dusteta|dustega)$"
+    ): "dus",
+    # https://en.wiktionary.org/wiki/-lik
+    # https://en.wiktionary.org/wiki/-nik
+    re.compile(
+        r"(?:iku|ikku|ikusse|ikus|ikust|ikule|ikul|ikult|ikuks|ikuni|ikuna|ikuta|ikuga|ikud|ike|ikudde|ikke|ikusid|ikesse|ikkudesse|ikes|ikkudes|ikest|ikkudest|ikele|ikkudele|ikel|ikkudel|ikelt|ikkudelt|ikeks|ikkudeks|ikeni|ikkudeni|ikena|ikkudena|iketa|ikkudeta|ikega|ikkudega)$"
+    ): "ik",
+    # https://en.wiktionary.org/wiki/-kond
+    re.compile(
+        r"(?:konna|konda|konnasse|konnas|konnast|konnale|konnal|konnalt|konnaks|konnani|konnana|konnata|konnaga|konnad|kondade|kondi|kondasid|kondadesse|konnisse|kondades|konnis|kondadest|konnist|kondadele|konnile|kondadel|konnil|kondadelt|konnilt|kondadeks|konniks|kondadeni|kondadena|kondadeta|kondadega)$"
+    ): "kond",
+}
+
+
+def apply_et(token: str) -> Optional[str]:
+    "Apply pre-defined rules for Estonian."
+    if len(token) < 8 or token[0].isupper():
+        return None
+
+    return apply_rules(token, DEFAULT_RULES)
diff --git a/tests/strategies/defaultrules/test_rules.py b/tests/strategies/defaultrules/test_rules.py
index 6da090a..4cf0868 100644
--- a/tests/strategies/defaultrules/test_rules.py
+++ b/tests/strategies/defaultrules/test_rules.py
@@ -4,14 +4,28 @@
 def test_DEFAULT_RULES() -> None:
     """Test rules on all available languages."""
     rules_strategy = RulesStrategy()
+
     assert rules_strategy.get_lemma("Pfifferlinge", "de") == "Pfifferling"
-    assert rules_strategy.get_lemma("Pfifferlinge", "en") is None
     assert rules_strategy.get_lemma("atonements", "de") is None
+
     assert rules_strategy.get_lemma("atonements", "en") == "atonement"
+    assert rules_strategy.get_lemma("Pfifferlinge", "en") is None
+
     assert rules_strategy.get_lemma("brieven", "nl") == "brief"
+
     assert rules_strategy.get_lemma("liikenaisessa", "fi") == "liikenainen"
+
     assert rules_strategy.get_lemma("pracowaliście", "pl") == "pracować"
+
     assert rules_strategy.get_lemma("безгра́мотностью", "ru") == "безгра́мотность"
+
     assert rules_strategy.get_lemma("Rīga", "lv") is None
     assert rules_strategy.get_lemma("šķirkļiem", "lv") == "šķirklis"
     assert rules_strategy.get_lemma("mācībām", "lv") == "mācība"
+
+    assert rules_strategy.get_lemma("Läänemere", "et") is None
+    assert rules_strategy.get_lemma("tavalised", "et") == "tavaline"
+    assert rules_strategy.get_lemma("peamisteks", "et") == "peamine"
+    assert rules_strategy.get_lemma("tähendustena", "et") == "tähendus"
+    assert rules_strategy.get_lemma("kunstnikud", "et") == "kunstnik"
+    assert rules_strategy.get_lemma("keelkondade", "et") == "keelkond"