Skip to content

Commit

Permalink
add rules for Estonian
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Nov 5, 2024
1 parent db5f5d8 commit ce6f0ca
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 1 deletion.
2 changes: 2 additions & 0 deletions simplemma/strategies/defaultrules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from .de import apply_de
from .en import apply_en
from .et import apply_et
from .fi import apply_fi
from .nl import apply_nl
from .pl import apply_pl
Expand All @@ -12,6 +13,7 @@
DEFAULT_RULES: Dict[str, Callable[[str], Optional[str]]] = {
"de": apply_de,
"en": apply_en,
"et": apply_et,
"fi": apply_fi,
"nl": apply_nl,
"pl": apply_pl,
Expand Down
29 changes: 29 additions & 0 deletions simplemma/strategies/defaultrules/et.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import re
from typing import Optional

from .generic import apply_rules

# Estonian suffix rules: each key is a compiled pattern anchored at the end of
# the token that lists the inflected forms of one derivational suffix; the
# value is the base form of that suffix. The table is passed to `apply_rules`
# (imported from `.generic`), which defines how a match is rewritten.
DEFAULT_RULES = {
    # adjectives
    # https://en.wiktionary.org/wiki/-line
    re.compile(
        r"(?:lise|list|lisse|lisesse|lises|lisest|lisele|lisel|liselt|liseks|liseni|lisena|liseta|lisega|lised|liste|lisi|listesse|lisisse|listes|lisis|listest|lisist|listele|lisile|listel|lisil|listelt|lisilt|listeks|lisiks|listeni|listena|listeta|listega)$"
    ): "line",
    # nouns
    # https://en.wiktionary.org/wiki/-kond
    re.compile(
        r"(?:konna|konda|konnasse|konnas|konnast|konnale|konnal|konnalt|konnaks|konnani|konnana|konnata|konnaga|konnad|kondade|kondi|kondasid|kondadesse|konnisse|kondades|konnis|kondadest|konnist|kondadele|konnile|kondadel|konnil|kondadelt|konnilt|kondadeks|konniks|kondadeni|kondadena|kondadeta|kondadega)$"
    ): "kond",
    # https://en.wiktionary.org/wiki/-nik
    # NOTE: the genitive plural is "nikkude" (same stem as "nikkudesse",
    # "nikkudes", ...); it was previously misspelled as "nikudde".
    re.compile(
        r"(?:niku|nikku|nikusse|nikus|nikust|nikule|nikul|nikult|nikuks|nikuni|nikuna|nikuta|nikuga|nikud|nike|nikkude|nikke|nikusid|nikesse|nikkudesse|nikes|nikkudes|nikest|nikkudest|nikele|nikkudele|nikel|nikkudel|nikelt|nikkudelt|nikeks|nikkudeks|nikeni|nikkudeni|nikena|nikkudena|niketa|nikkudeta|nikega|nikkudega)$"
    ): "nik",
}


def apply_et(token: str) -> Optional[str]:
    """Apply pre-defined rules for Estonian.

    Tokens shorter than 10 characters, or whose first character is
    uppercase, are not processed and yield ``None``. Any other token is
    passed to ``apply_rules`` together with the Estonian ``DEFAULT_RULES``
    table, and that call's result is returned unchanged.
    """
    # The length test runs first, so ``token[0]`` is never evaluated on an
    # empty string.
    if len(token) < 10 or token[0].isupper():
        return None

    return apply_rules(token, DEFAULT_RULES)
10 changes: 9 additions & 1 deletion tests/strategies/defaultrules/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,19 @@
def test_DEFAULT_RULES() -> None:
    """Test rules on all available languages."""
    rules_strategy = RulesStrategy()

    # German: a German rule applies, an English word is left alone.
    assert rules_strategy.get_lemma("Pfifferlinge", "de") == "Pfifferling"
    assert rules_strategy.get_lemma("atonements", "de") is None

    # English: an English rule applies, a German word is left alone.
    assert rules_strategy.get_lemma("atonements", "en") == "atonement"
    assert rules_strategy.get_lemma("Pfifferlinge", "en") is None

    # Dutch
    assert rules_strategy.get_lemma("brieven", "nl") == "brief"

    # Finnish
    assert rules_strategy.get_lemma("liikenaisessa", "fi") == "liikenainen"

    # Polish
    assert rules_strategy.get_lemma("pracowaliście", "pl") == "pracować"

    # Russian
    assert rules_strategy.get_lemma("безгра́мотностью", "ru") == "безгра́мотность"

    # Estonian
    assert rules_strategy.get_lemma("keelkondade", "et") == "keelkond"

0 comments on commit ce6f0ca

Please sign in to comment.