Skip to content

Commit

Permalink
training: do not remove words tackled by rules
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed May 22, 2024
1 parent 39ff74d commit f605178
Showing 1 changed file with 9 additions and 5 deletions.
14 changes: 9 additions & 5 deletions training/dictionary_pickler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
Functions used to create lemmatization dictionaries out of word lists.
Input format: lemma, tab, word, newline
Output format: pickled Python dictionary compressed with lzma.
"""
import lzma
import logging
import pickle
Expand Down Expand Up @@ -73,13 +78,12 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
continue
if len(columns[0]) > 6 and len(columns[1]) == 1:
continue
# tackled by rules
# print line if the rule is wrong
if (
len(columns[1]) > 6 and langcode in DEFAULT_RULES
): # columns[1] != columns[0]
len(columns[1]) > 6 and columns[1] != columns[0] and \
langcode in DEFAULT_RULES
):
rule = DEFAULT_RULES[langcode](columns[1])
if rule == columns[0]:
continue
if rule is not None and rule != columns[1]:
print(columns[1], columns[0], rule)
# process
Expand Down

0 comments on commit f605178

Please sign in to comment.