Skip to content

Commit

Permalink
add demo rules for Estonian (#157)
Browse files Browse the repository at this point in the history
* add rules for Estonian

* test_rules.py: add newline

* better rules and data

* prepare merge

* fix pickler
  • Loading branch information
adbar authored Nov 14, 2024
1 parent 7cf261d commit 436d4ed
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 1 deletion.
2 changes: 2 additions & 0 deletions simplemma/strategies/defaultrules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from .de import apply_de
from .en import apply_en
from .et import apply_et
from .fi import apply_fi
from .lv import apply_lv
from .nl import apply_nl
Expand All @@ -13,6 +14,7 @@
DEFAULT_RULES: Dict[str, Callable[[str], Optional[str]]] = {
"de": apply_de,
"en": apply_en,
"et": apply_et,
"fi": apply_fi,
"lv": apply_lv,
"nl": apply_nl,
Expand Down
42 changes: 42 additions & 0 deletions simplemma/strategies/defaultrules/et.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import re
from typing import Optional

from .generic import apply_rules


# NOTE: demonstration only -- these rules are very basic and their coverage is limited.


# Estonian suffix rules: each compiled pattern matches one of the listed
# inflected endings at the very end of a token (note the trailing ``$``) and
# is paired with the base-form ending associated with it.  The mapping keeps
# the order of the table below and is handed to ``apply_rules`` by
# ``apply_et``.
DEFAULT_RULES = {
    re.compile(inflected_endings): base_ending
    for inflected_endings, base_ending in (
        # adjectives
        # -line: https://en.wiktionary.org/wiki/-line
        (
            r"(?:lise|list|lisse|lisesse|lises|lisest|lisele|lisel|liselt|liseks|liseni|lisena|liseta|lisega|lised|liste|lisi|listesse|lisisse|listes|lisis|listest|lisist|listele|lisile|listel|lisil|listelt|lisilt|listeks|lisiks|listeni|listena|listeta|listega)$",
            "line",
        ),
        # -mine: https://en.wiktionary.org/wiki/-mine
        (
            r"(?:mise|mist|misse|misesse|mises|misest|misele|misel|miselt|miseks|miseni|misena|miseta|misega|mised|miste|misi|mistesse|misisse|mistes|misis|mistest|misist|mistele|misile|mistel|misil|mistelt|misilt|misteks|misiks|misteni|mistena|misteta|mistega)$",
            "mine",
        ),
        # nouns
        # -dus: https://en.wiktionary.org/wiki/-dus
        (
            r"(?:duse|dust|dusse|dusesse|duses|dusest|dusele|dusel|duselt|duseks|duseni|dusena|duseta|dusega|dused|duste|dusi|dustesse|dusisse|dustes|dusis|dustest|dusist|dustele|dusile|dustel|dusil|dustelt|dusilt|dusteks|dusiks|dusteni|dustena|dusteta|dustega)$",
            "dus",
        ),
        # -lik and -nik share one pattern on their common "-ik" ending:
        # https://en.wiktionary.org/wiki/-lik
        # https://en.wiktionary.org/wiki/-nik
        (
            r"(?:iku|ikku|ikusse|ikus|ikust|ikule|ikul|ikult|ikuks|ikuni|ikuna|ikuta|ikuga|ikud|ike|ikudde|ikke|ikusid|ikesse|ikkudesse|ikes|ikkudes|ikest|ikkudest|ikele|ikkudele|ikel|ikkudel|ikelt|ikkudelt|ikeks|ikkudeks|ikeni|ikkudeni|ikena|ikkudena|iketa|ikkudeta|ikega|ikkudega)$",
            "ik",
        ),
        # -kond: https://en.wiktionary.org/wiki/-kond
        (
            r"(?:konna|konda|konnasse|konnas|konnast|konnale|konnal|konnalt|konnaks|konnani|konnana|konnata|konnaga|konnad|kondade|kondi|kondasid|kondadesse|konnisse|kondades|konnis|kondadest|konnist|kondadele|konnile|kondadel|konnil|kondadelt|konnilt|kondadeks|konniks|kondadeni|kondadena|kondadeta|kondadega)$",
            "kond",
        ),
    )
}


def apply_et(token: str) -> Optional[str]:
    """Apply pre-defined rules for Estonian.

    Args:
        token: The word form to process.

    Returns:
        ``None`` when the token is shorter than 8 characters or starts with
        an uppercase character; otherwise the result of calling
        ``apply_rules`` with the token and ``DEFAULT_RULES``.
    """
    # Skip short tokens and capitalized ones (e.g. "Läänemere") without
    # consulting the rules.  The length test comes first, so an empty token
    # never reaches ``token[0]``.
    # NOTE(review): the threshold of 8 is presumably meant to limit false
    # positives on short words -- confirm before changing it.
    if len(token) < 8 or token[0].isupper():
        return None

    return apply_rules(token, DEFAULT_RULES)
16 changes: 15 additions & 1 deletion tests/strategies/defaultrules/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,28 @@
def test_DEFAULT_RULES() -> None:
    """Test rules on all available languages."""
    rules_strategy = RulesStrategy()

    # German, plus cross-language checks: a token must not be lemmatized by
    # the rules of a language it was not requested for.
    assert rules_strategy.get_lemma("Pfifferlinge", "de") == "Pfifferling"
    assert rules_strategy.get_lemma("Pfifferlinge", "en") is None
    assert rules_strategy.get_lemma("atonements", "de") is None

    # English
    assert rules_strategy.get_lemma("atonements", "en") == "atonement"

    # Dutch
    assert rules_strategy.get_lemma("brieven", "nl") == "brief"

    # Finnish
    assert rules_strategy.get_lemma("liikenaisessa", "fi") == "liikenainen"

    # Polish
    assert rules_strategy.get_lemma("pracowaliście", "pl") == "pracować"

    # Russian
    assert rules_strategy.get_lemma("безгра́мотностью", "ru") == "безгра́мотность"

    # Latvian: capitalized tokens are left untouched.
    assert rules_strategy.get_lemma("Rīga", "lv") is None
    assert rules_strategy.get_lemma("šķirkļiem", "lv") == "šķirklis"
    assert rules_strategy.get_lemma("mācībām", "lv") == "mācība"

    # Estonian: capitalized tokens are left untouched; one case per rule.
    assert rules_strategy.get_lemma("Läänemere", "et") is None
    assert rules_strategy.get_lemma("tavalised", "et") == "tavaline"
    assert rules_strategy.get_lemma("peamisteks", "et") == "peamine"
    assert rules_strategy.get_lemma("tähendustena", "et") == "tähendus"
    assert rules_strategy.get_lemma("kunstnikud", "et") == "kunstnik"
    assert rules_strategy.get_lemma("keelkondade", "et") == "keelkond"

0 comments on commit 436d4ed

Please sign in to comment.