Skip to content

Commit

Permalink
training: do not remove words tackled by rules (#127)
Browse files Browse the repository at this point in the history
* training: do not remove words tackled by rules

* use black
  • Loading branch information
adbar authored May 22, 2024
1 parent 39ff74d commit c2cf6ae
Showing 1 changed file with 11 additions and 5 deletions.
16 changes: 11 additions & 5 deletions training/dictionary_pickler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
"""
Functions used to create lemmatization dictionaries out of word lists.
Input format: lemma, tab, word, newline
Output format: pickled Python dictionary compressed with lzma.
"""

import lzma
import logging
import pickle
Expand Down Expand Up @@ -73,13 +79,13 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
continue
if len(columns[0]) > 6 and len(columns[1]) == 1:
continue
# tackled by rules
# print line if the rule is wrong
if (
len(columns[1]) > 6 and langcode in DEFAULT_RULES
): # columns[1] != columns[0]
len(columns[1]) > 6
and columns[1] != columns[0]
and langcode in DEFAULT_RULES
):
rule = DEFAULT_RULES[langcode](columns[1])
if rule == columns[0]:
continue
if rule is not None and rule != columns[1]:
print(columns[1], columns[0], rule)
# process
Expand Down

0 comments on commit c2cf6ae

Please sign in to comment.