From 7e24ae0a791244596697be15b194566d07261955 Mon Sep 17 00:00:00 2001 From: Daniel Roschka Date: Tue, 5 Nov 2024 14:38:23 +0100 Subject: [PATCH 1/2] Fix data written by dictionary_pickler.py The changes to the return types of `DictionaryFactory.get_dictionary()` in 63933fc broke the generation of dictionaries using `training/dictionary_pickler.py`. This commit fixes that again, by undoing the changes to `training/dictionary_pickler.py` made by 63933fc. --- training/dictionary_pickler.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/training/dictionary_pickler.py b/training/dictionary_pickler.py index 69f4692..15345d1 100644 --- a/training/dictionary_pickler.py +++ b/training/dictionary_pickler.py @@ -10,7 +10,7 @@ import re from operator import itemgetter from pathlib import Path -from typing import Dict, List, Optional +from typing import ByteString, Dict, List, Optional import simplemma from simplemma.strategies.defaultrules import DEFAULT_RULES @@ -49,7 +49,9 @@ def _determine_path(listpath: str, langcode: str) -> str: return str(Path(__file__).parent / filename) -def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]: +def _read_dict( + filepath: str, langcode: str, silent: bool +) -> Dict[ByteString, ByteString]: mydict: Dict[str, str] = {} myadditions: List[str] = [] i: int = 0 @@ -120,12 +122,12 @@ def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]: mydict[word] = word LOGGER.debug("%s %s", langcode, i) # sort and convert to bytestrings - return dict(sorted(mydict.items())) + return {k.encode("utf-8"): v.encode("utf-8") for k, v in sorted(mydict.items())} def _load_dict( langcode: str, listpath: str = "lists", silent: bool = True -) -> Dict[str, str]: +) -> Dict[ByteString, ByteString]: filepath = _determine_path(listpath, langcode) return _read_dict(filepath, langcode, silent) From 7f1b2dfc2a5720da458b9ed7ab6165715466233d Mon Sep 17 00:00:00 2001 From: Daniel Roschka Date: Tue, 5 Nov 2024 14:55:17 +0100 Subject: [PATCH 2/2] Fix tests for dictionary_pickler --- tests/test_dictionary_pickler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_dictionary_pickler.py b/tests/test_dictionary_pickler.py index 37136f2..2fc806f 100644 --- a/tests/test_dictionary_pickler.py +++ b/tests/test_dictionary_pickler.py @@ -26,9 +26,9 @@ def test_logic() -> None: # different order mydict = dictionary_pickler._read_dict(testfile, "es", silent=True) assert len(mydict) == 5 - assert mydict["closeones"] == "closeone" + assert mydict[b"closeones"] == b"closeone" item = sorted(mydict.keys(), reverse=True)[0] - assert item == "valid-word" + assert item == b"valid-word" # file I/O assert dictionary_pickler._determine_path("lists", "de").endswith("de.txt")