Skip to content

Commit

Permalink
maintenance: simplify code
Browse files: browse the repository at this point in the history
  • Loading branch information
adbar committed Aug 8, 2024
1 parent 3e375f2 commit 88b0e8c
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 31 deletions.
12 changes: 5 additions & 7 deletions simplemma/strategies/defaultrules/de.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,10 @@ def apply_de(token: str) -> Optional[str]:
# nouns
if token[0].isupper():
# noun endings/suffixes: regex search
match = NOUN_ENDINGS_DE.search(token)
if match:
if match := NOUN_ENDINGS_DE.search(token):
# apply pattern
ending = next((g for g in match.groups() if g is not None), None)
if ending:
return token[: -len(ending)]
ending = next((g for g in match.groups() if g), None)
return token[: -len(ending)] if ending else token
# lemma identified
return token
# inclusive speech
Expand All @@ -50,8 +48,8 @@ def apply_de(token: str) -> Optional[str]:

# mostly adjectives and verbs
elif token[-1] in ENDING_CHARS_DE:
if ADJ_ENDINGS_DE.match(token):
return ADJ_ENDINGS_DE.sub(r"\1\2", token).lower()
if match := ADJ_ENDINGS_DE.match(token):
return (match[1] + match[2]).lower()
if PP_DE.search(token):
return ENDING_DE.sub("", token).lower()

Expand Down
3 changes: 1 addition & 2 deletions simplemma/strategies/dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ def get_lemma(self, token: str, lang: str) -> Optional[str]:
"""
# Search the language data, reverse case to extend coverage.
dictionary = self._dictionary_factory.get_dictionary(lang)
result = dictionary.get(token)
if result:
if result := dictionary.get(token):
return result
# Try upper or lowercase.
token = token.lower() if token[0].isupper() else token.capitalize()
Expand Down
16 changes: 7 additions & 9 deletions simplemma/strategies/greedy_dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,17 @@ def get_lemma(self, token: str, lang: str) -> str:
return token

dictionary = self._dictionary_factory.get_dictionary(lang)
candidate = token
for _ in range(self._steps):
if candidate not in dictionary:
break

new_candidate = dictionary[candidate]
for _ in range(self._steps):
candidate = dictionary.get(token)

if (
len(new_candidate) > len(candidate)
or levenshtein_dist(new_candidate, candidate) > self._distance
not candidate
or len(candidate) > len(token)
or levenshtein_dist(candidate, token) > self._distance
):
break

candidate = new_candidate
token = candidate

return candidate
return token
10 changes: 4 additions & 6 deletions simplemma/strategies/hyphen_removal.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
from .lemmatization_strategy import LemmatizationStrategy

HYPHENS = {"-", "_"}
HYPHENS_FOR_REGEX = "".join(HYPHENS)
HYPHEN_REGEX = re.compile(rf"([{HYPHENS_FOR_REGEX}])")
HYPHEN_REGEX = re.compile(rf"([{''.join(HYPHENS)}])")


class HyphenRemovalStrategy(LemmatizationStrategy):
Expand Down Expand Up @@ -69,9 +68,8 @@ def get_lemma(self, token: str, lang: str) -> Optional[str]:
return lemma

# decompose
lemma = self._dictionary_lookup.get_lemma(token_parts[-1], lang)
if lemma is not None:
token_parts[-1] = lemma
return "".join(token_parts)
last_part_lemma = self._dictionary_lookup.get_lemma(token_parts[-1], lang)
if last_part_lemma is not None:
return "".join(token_parts[:-1] + [last_part_lemma])

return None
10 changes: 3 additions & 7 deletions simplemma/strategies/prefix_decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,11 @@ def get_lemma(self, token: str, lang: str) -> Optional[str]:
return None

prefix_match = self._known_prefixes[lang].match(token)
if not prefix_match:
if not prefix_match or prefix_match[1] == token:
return None
prefix = prefix_match[1]

if prefix == token:
return None
prefix = prefix_match[1]

subword = self._dictionary_lookup.get_lemma(token[len(prefix) :], lang)
if subword is None:
return None

return prefix + subword.lower()
return prefix + subword.lower() if subword else None

0 comments on commit 88b0e8c

Please sign in to comment.