Add a dictionary factory backed by MARISA-tries #133

Merged: 2 commits, Jun 26, 2024

Changes from all commits
55 changes: 55 additions & 0 deletions README.md
@@ -259,6 +259,61 @@ LANG_CACHE_SIZE = 5 # How many language dictionaries to keep in memory at once
For more information see the
[extended documentation](https://adbar.github.io/simplemma/).

### Reducing memory usage

For situations where low memory usage and fast initialization time
are more important than lemmatization and language detection
performance, Simplemma ships an alternative `DictionaryFactory`,
which uses a trie as its underlying data structure instead of a
Python dict.

Using the `TrieDictionaryFactory` reduces memory usage on average by
a factor of 20 and initialization time by a factor of 100, but comes
at the cost of lookup performance, which can drop by 50% or more
compared to what Simplemma otherwise achieves, depending on the
specific usage.

To use the `TrieDictionaryFactory` you have to install Simplemma with
the `marisa-trie` extra dependency:

``` bash
pip install simplemma[marisa-trie]
```

Then you have to create a custom strategy using the
`TrieDictionaryFactory` and use that for `Lemmatizer` and
`LanguageDetector` instances:

``` python
>>> from simplemma import LanguageDetector, Lemmatizer
>>> from simplemma.strategies import DefaultStrategy
>>> from simplemma.strategies.dictionaries import TrieDictionaryFactory

>>> lemmatization_strategy = DefaultStrategy(dictionary_factory=TrieDictionaryFactory())

>>> lemmatizer = Lemmatizer(lemmatization_strategy=lemmatization_strategy)
>>> lemmatizer.lemmatize('doughnuts', lang='en')
'doughnut'

>>> language_detector = LanguageDetector('la', lemmatization_strategy=lemmatization_strategy)
>>> language_detector.proportion_in_target_languages("opera post physica posita (τὰ μετὰ τὰ φυσικά)")
0.5
```

While memory usage and initialization time with the
`TrieDictionaryFactory` are significantly lower than with the
`DefaultDictionaryFactory`, this only holds once the trie
dictionaries are available on disk. That's not the case the first
time the `TrieDictionaryFactory` is used, as Simplemma only ships
the dictionaries as Python dicts. The trie dictionary for a language
is therefore generated on the fly the first time that language is
used, which takes a few seconds and uses as much memory as loading
the Python dict for the language requires. For further invocations
the trie dictionaries are cached on disk.
Contributor:

Would it be helpful to explain where the cache is located on disk?

Contributor Author:

I tried not to document every little detail in the README to avoid blowing it up too much. IMO there should be separate API documentation for Simplemma to cover details like that. However, if it's desired, please let me know and I'll happily add it.


If the computer that is supposed to run Simplemma doesn't have
enough memory to generate the trie dictionaries, they can also be
generated on another computer with the same CPU architecture and
copied over to the cache directory.
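The load-or-generate flow described above can be sketched generically with the standard library alone (a simplified sketch, not Simplemma's actual implementation; the helper name `load_or_build` is hypothetical):

``` python
import pickle
import tempfile
from pathlib import Path

def load_or_build(lang: str, cache_dir: Path, build) -> dict:
    """Load a cached artifact from disk, or build and persist it on first use."""
    cache_file = cache_dir / f"{lang}.dic"
    if cache_file.exists():  # warm start: fast path, load from disk
        return pickle.loads(cache_file.read_bytes())
    artifact = build(lang)   # cold start: slow and memory-hungry, happens once
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_file.write_bytes(pickle.dumps(artifact))
    return artifact

# Usage: the first call builds and persists, the second loads from disk.
cache = Path(tempfile.mkdtemp())
calls = []
def build(lang):
    calls.append(lang)
    return {"doughnuts": "doughnut"}

d1 = load_or_build("en", cache, build)
d2 = load_or_build("en", cache, build)
assert d1 == d2 == {"doughnuts": "doughnut"}
assert calls == ["en"]  # the expensive build ran only once
```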

## Supported languages

2 changes: 2 additions & 0 deletions requirements-dev.txt
@@ -1,6 +1,8 @@
black==24.4.2
flake8==7.0.0
marisa_trie==1.2.0
mypy==1.10.0
platformdirs==4.2.2
pytest==8.2.1
pytest-cov==5.0.0
types-requests==2.32.0.20240523
1 change: 1 addition & 0 deletions setup.py
@@ -85,6 +85,7 @@ def get_version(package):
],
description="A simple multilingual lemmatizer for Python.",
install_requires=requirements,
extras_require={"marisa-trie": ["marisa-trie", "platformdirs"]},
license="MIT license",
long_description=readme, # + '\n\n' + history,
long_description_content_type="text/markdown",
6 changes: 5 additions & 1 deletion simplemma/strategies/__init__.py
@@ -2,7 +2,11 @@

from .affix_decomposition import AffixDecompositionStrategy
from .default import DefaultStrategy
from .dictionaries import DefaultDictionaryFactory, DictionaryFactory
from .dictionaries import (
DefaultDictionaryFactory,
DictionaryFactory,
TrieDictionaryFactory,
)
from .dictionary_lookup import DictionaryLookupStrategy
from .fallback.lemmatization_fallback_strategy import LemmatizationFallbackStrategy
from .fallback.raise_error import RaiseErrorFallbackStrategy
1 change: 1 addition & 0 deletions simplemma/strategies/dictionaries/__init__.py
@@ -1,3 +1,4 @@
"""Dictionary-based lemmatization strategy."""

from .dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
from .trie_directory_factory import TrieDictionaryFactory
34 changes: 26 additions & 8 deletions simplemma/strategies/dictionaries/dictionary_factory.py
@@ -14,7 +14,7 @@
from functools import lru_cache
from os import listdir, path
from pathlib import Path
from typing import ByteString, Dict, Protocol
from typing import ByteString, Dict, Mapping, Protocol

DATA_FOLDER = str(Path(__file__).parent / "data")
SUPPORTED_LANGUAGES = [
@@ -62,22 +62,41 @@
def get_dictionary(
self,
lang: str,
) -> Dict[ByteString, ByteString]:
) -> Mapping[str, str]:
"""
Get the dictionary for a specific language.

Args:
lang (str): The language code.

Returns:
Dict[str, str]: The dictionary for the specified language.
Mapping[str, str]: The dictionary for the specified language.

Raises:
ValueError: If the specified language is not supported.
"""
raise NotImplementedError


class MappingStrToByteString(Mapping[str, str]):
"""Wrapper around a ByteString dict to make it behave like a str dict."""

__slots__ = ["_dict"]

def __init__(self, dictionary: Dict[bytes, bytes]):
self._dict = dictionary

def __getitem__(self, item: str):
return self._dict[item.encode()].decode()

def __iter__(self):
for key in self._dict:
yield key.decode()

def __len__(self):
return len(self._dict)

Codecov warning on line 97 in simplemma/strategies/dictionaries/dictionary_factory.py: Added line #L97 was not covered by tests.
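The wrapper's behavior can be demonstrated in isolation (the class body is reproduced from the diff above so the example is self-contained):

``` python
from collections.abc import Mapping
from typing import Dict

class MappingStrToByteString(Mapping):
    """Present a bytes-to-bytes dict as a read-only str-to-str mapping."""

    __slots__ = ["_dict"]

    def __init__(self, dictionary: Dict[bytes, bytes]):
        self._dict = dictionary

    def __getitem__(self, item: str) -> str:
        # Encode the lookup key, decode the stored value.
        return self._dict[item.encode()].decode()

    def __iter__(self):
        for key in self._dict:
            yield key.decode()

    def __len__(self) -> int:
        return len(self._dict)

raw = {b"doughnuts": b"doughnut"}
wrapped = MappingStrToByteString(raw)
assert wrapped["doughnuts"] == "doughnut"
assert list(wrapped) == ["doughnuts"]
assert wrapped.get("missing") is None  # Mapping supplies .get() for free
```

Because only `__getitem__`, `__iter__`, and `__len__` are implemented, `collections.abc.Mapping` derives `get`, `items`, `__contains__`, and the rest automatically.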


class DefaultDictionaryFactory(DictionaryFactory):
"""
Default Dictionary Factory.
@@ -86,7 +105,7 @@
It provides functionality for loading and caching dictionaries from disk that are included in Simplemma.
"""

__slots__ = ["_data", "_load_dictionary_from_disk"]
__slots__ = ["_load_dictionary_from_disk"]

def __init__(self, cache_max_size: int = 8):
"""
@@ -96,27 +115,26 @@
cache_max_size (int): The maximum size of the cache for loaded dictionaries.
Defaults to `8`.
"""
self._data: Dict[str, Dict[ByteString, ByteString]] = {}
self._load_dictionary_from_disk = lru_cache(maxsize=cache_max_size)(
_load_dictionary_from_disk
)

def get_dictionary(
self,
lang: str,
) -> Dict[ByteString, ByteString]:
) -> Mapping[str, str]:
"""
Get the dictionary for a specific language.

Args:
lang (str): The language code.

Returns:
Dict[str, str]: The dictionary for the specified language.
Mapping[str, str]: The dictionary for the specified language.

Raises:
ValueError: If the specified language is not supported.
"""
if lang not in SUPPORTED_LANGUAGES:
raise ValueError(f"Unsupported language: {lang}")
return self._load_dictionary_from_disk(lang)
return MappingStrToByteString(self._load_dictionary_from_disk(lang))
123 changes: 123 additions & 0 deletions simplemma/strategies/dictionaries/trie_directory_factory.py
@@ -0,0 +1,123 @@
import logging
from collections.abc import MutableMapping
from functools import lru_cache
from pathlib import Path
from typing import List, Mapping, Optional

from marisa_trie import BytesTrie, HUGE_CACHE # type: ignore[import-not-found]
from platformdirs import user_cache_dir
Collaborator:

platformdirs is also an external dependency, so why doesn't it need the type ignore like marisa_trie?

Contributor Author:

While I'm not entirely sure, my guess is that mypy can't get any type information for marisa_trie because it isn't Python code but a C extension, and it doesn't provide type information.


from simplemma import __version__ as SIMPLEMMA_VERSION
from simplemma.strategies.dictionaries.dictionary_factory import (
DefaultDictionaryFactory,
DictionaryFactory,
SUPPORTED_LANGUAGES,
)

logger = logging.getLogger(__name__)


class TrieWrapDict(MutableMapping):
Collaborator:

Should we simply modify the DictionaryFactory protocol to return a Mapping instead of a Dict, rather than having this wrapper?

Why MutableMapping instead of Mapping?

Contributor Author:

One of the constraints I gave myself was to implement this functionality without requiring changes to the existing code of Simplemma. That's why I didn't modify the expected return types of the DictionaryFactory, for example. Doing so would certainly simplify things, but that's not something I wanted to decide on my own.

It's a MutableMapping right now because a dict, which is what the DefaultDictionaryFactory uses, is one as well. No further reason beyond that.

"""Wrapper around a BytesTrie to make it behave like a dict."""

def __init__(self, trie: BytesTrie):
self._trie = trie

def __getitem__(self, item):
return self._trie[item][0].decode()

def __setitem__(self, key, value):
raise NotImplementedError

def __delitem__(self, key):
raise NotImplementedError

def __iter__(self):
for key in self._trie.iterkeys():
yield key

def __len__(self):
return len(self._trie)
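The wrapper's read-only, dict-like behavior can be exercised without installing marisa-trie by standing in a minimal stub for `BytesTrie` (the stub is an assumption for illustration only; the `TrieWrapDict` body matches the diff above):

``` python
from collections.abc import MutableMapping

class FakeBytesTrie:
    """Tiny stand-in for marisa_trie.BytesTrie: str keys map to lists of bytes."""
    def __init__(self, data):
        self._data = dict(data)
    def __getitem__(self, key):
        return self._data[key]
    def iterkeys(self):
        return iter(self._data)
    def __len__(self):
        return len(self._data)

class TrieWrapDict(MutableMapping):
    """Wrapper around a BytesTrie to make it behave like a dict."""
    def __init__(self, trie):
        self._trie = trie
    def __getitem__(self, item):
        # BytesTrie returns a list of byte values per key; take the first.
        return self._trie[item][0].decode()
    def __setitem__(self, key, value):
        raise NotImplementedError
    def __delitem__(self, key):
        raise NotImplementedError
    def __iter__(self):
        yield from self._trie.iterkeys()
    def __len__(self):
        return len(self._trie)

wrapped = TrieWrapDict(FakeBytesTrie({"doughnuts": [b"doughnut"]}))
assert wrapped["doughnuts"] == "doughnut"
assert "doughnuts" in wrapped  # __contains__ derived from __getitem__
try:
    wrapped["x"] = "y"  # mutation is deliberately unsupported
except NotImplementedError:
    pass
```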


class TrieDictionaryFactory(DictionaryFactory):
"""Memory optimized DictionaryFactory backed by MARISA-tries.

This dictionary factory creates dictionaries, which are backed by a
MARISA-trie instead of a dict, to make them consume very little
memory compared to the DefaultDictionaryFactory. Trade-offs are that
lookup performance isn't as good as with dicts.
"""

__slots__: List[str] = []

def __init__(
self,
cache_max_size: int = 8,
Collaborator:

We are doing double caching: in-memory and on-disk. These params make it unclear which one they are about.

Contributor Author:

I completely agree. I just kept that parameter from DefaultDictionaryFactory so TrieDictionaryFactory can be used as a drop-in replacement for it.

use_disk_cache: bool = True,
Collaborator:

use_disk_cache is not needed: if disk_cache_dir is None, then you don't use disk caching.

Contributor Author:

Right now, if disk_cache_dir is None, a subdirectory of the user's platform-specific cache directory is used to store the cache. So use_disk_cache distinguishes between disabling the disk cache and using the default cache location.

disk_cache_dir: Optional[str] = None,
) -> None:
"""Initialize the TrieDictionaryFactory.

Args:
cache_max_size (int): The maximum number of dictionaries to
keep in memory. Defaults to `8`.
use_disk_cache (bool): Whether to cache the tries on disk to
speed up loading time. Defaults to `True`.
disk_cache_dir (Optional[str]): Path where the generated
tries should be stored. Defaults to a Simplemma-specific
subdirectory of the user's cache directory.
"""

if disk_cache_dir:
self._cache_dir = Path(disk_cache_dir)
else:
self._cache_dir = (
Path(user_cache_dir("simplemma")) / "marisa_trie" / SIMPLEMMA_VERSION
)
self._use_disk_cache = use_disk_cache
self._get_dictionary = lru_cache(maxsize=cache_max_size)(
self._get_dictionary_uncached
)

def _create_trie_from_pickled_dict(self, lang: str) -> BytesTrie:
"""Create a trie from a pickled dictionary."""
unpickled_dict = DefaultDictionaryFactory(cache_max_size=0).get_dictionary(lang)
return BytesTrie(
zip(
unpickled_dict.keys(),
[value.encode() for value in unpickled_dict.values()],
),
cache_size=HUGE_CACHE,
)

def _write_trie_to_disk(self, lang: str, trie: BytesTrie) -> None:
"""Persist the trie to disk for later usage.

The persisted trie can be loaded by subsequent runs to speed up
loading times.
"""
logger.debug("Caching trie on disk. This might take a second.")
self._cache_dir.mkdir(parents=True, exist_ok=True)

trie.save(self._cache_dir / f"{lang}.dic")

def _get_dictionary_uncached(self, lang: str) -> Mapping[str, str]:
"""Get the dictionary for the given language."""
if lang not in SUPPORTED_LANGUAGES:
raise ValueError(f"Unsupported language: {lang}")

if self._use_disk_cache and (self._cache_dir / f"{lang}.dic").exists():
trie = BytesTrie().load(str(self._cache_dir / f"{lang}.dic"))
else:
trie = self._create_trie_from_pickled_dict(lang)
if self._use_disk_cache:
self._write_trie_to_disk(lang, trie)

return TrieWrapDict(trie)

def get_dictionary(
self,
lang: str,
) -> Mapping[str, str]:
return self._get_dictionary(lang)
13 changes: 3 additions & 10 deletions simplemma/strategies/dictionary_lookup.py
@@ -3,7 +3,7 @@
It provides lemmatization using dictionary lookup.
"""

from typing import ByteString, Dict, Optional
from typing import Optional

from .dictionaries.dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
from .lemmatization_strategy import LemmatizationStrategy
@@ -26,13 +26,6 @@ def __init__(
"""
self._dictionary_factory = dictionary_factory

def _get(
self, token: str, dictionary: Dict[ByteString, ByteString]
) -> Optional[str]:
"Convenience function to handle bytestring to string conversion."
result = dictionary.get(token.encode("utf-8"))
return result.decode("utf-8") if result else None # type: ignore[union-attr]

def get_lemma(self, token: str, lang: str) -> Optional[str]:
"""
Get Lemma using Dictionary Lookup
@@ -50,9 +43,9 @@ def get_lemma(self, token: str, lang: str) -> Optional[str]:
"""
# Search the language data, reverse case to extend coverage.
dictionary = self._dictionary_factory.get_dictionary(lang)
result = self._get(token, dictionary)
result = dictionary.get(token)
if result:
return result
# Try upper or lowercase.
token = token.lower() if token[0].isupper() else token.capitalize()
return self._get(token, dictionary)
return dictionary.get(token)
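The reverse-case fallback that `get_lemma` now performs directly on the mapping can be illustrated with a plain dict (a sketch of the lookup logic only, not the full strategy):

``` python
from typing import Mapping, Optional

def lookup_with_case_fallback(token: str, dictionary: Mapping[str, str]) -> Optional[str]:
    """Try the token as-is, then with the opposite capitalization."""
    result = dictionary.get(token)
    if result:
        return result
    # Reverse the case of the first letter to extend coverage.
    token = token.lower() if token[0].isupper() else token.capitalize()
    return dictionary.get(token)

dictionary = {"doughnuts": "doughnut"}
assert lookup_with_case_fallback("Doughnuts", dictionary) == "doughnut"
assert lookup_with_case_fallback("doughnuts", dictionary) == "doughnut"
assert lookup_with_case_fallback("bagels", dictionary) is None
```

Because the factory now returns a `Mapping[str, str]`, the caller no longer needs the encode/decode helper the diff removes.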
4 changes: 2 additions & 2 deletions simplemma/strategies/greedy_dictionary_lookup.py
@@ -58,7 +58,7 @@ def get_lemma(self, token: str, lang: str) -> str:
return token

dictionary = self._dictionary_factory.get_dictionary(lang)
candidate = token.encode("utf-8")
candidate = token
for _ in range(self._steps):
if candidate not in dictionary:
break
@@ -73,4 +73,4 @@

candidate = new_candidate

return candidate.decode("utf-8")
return candidate
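The greedy loop follows dictionary lookups hop by hop until the candidate leaves the dictionary, stops changing, or the step budget is exhausted. A simplified sketch (the real strategy applies additional length and distance checks between hops, hidden in the collapsed part of the diff):

``` python
from typing import Mapping

def greedy_lemmatize(token: str, dictionary: Mapping[str, str], steps: int = 1) -> str:
    """Follow dictionary lookups for up to `steps` hops."""
    candidate = token
    for _ in range(steps):
        if candidate not in dictionary:
            break  # nothing more to follow
        new_candidate = dictionary[candidate]
        if new_candidate == candidate:
            break  # reached a fixed point
        candidate = new_candidate
    return candidate

chain = {"better": "good", "goods": "good"}
assert greedy_lemmatize("better", chain, steps=2) == "good"
assert greedy_lemmatize("unknown", chain) == "unknown"
```

With string-keyed mappings in place, the candidate stays a `str` throughout, which is exactly what lets the diff drop the `encode`/`decode` round-trip.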
8 changes: 2 additions & 6 deletions simplemma/utils.py
@@ -6,7 +6,7 @@
- [validate_lang_input][simplemma.utils.validate_lang_input]: Validates the language input and ensures it is a valid tuple.
"""

from typing import ByteString, Tuple, Union
from typing import Tuple, Union


def validate_lang_input(lang: Union[str, Tuple[str, ...]]) -> Tuple[str]:
@@ -31,9 +31,7 @@ def validate_lang_input(lang: Union[str, Tuple[str, ...]]) -> Tuple[str]:
return lang # type: ignore[return-value]


def levenshtein_dist(
first: Union[ByteString, str], second: Union[ByteString, str]
) -> int:
def levenshtein_dist(str1: str, str2: str) -> int:
"""
Calculate the Levenshtein distance between two strings.

@@ -49,8 +47,6 @@ def levenshtein_dist(str1: str, str2: str) -> int:
int: The Levenshtein distance between the two strings.

"""
str1 = first.encode("utf-8") if isinstance(first, str) else first
str2 = second.encode("utf-8") if isinstance(second, str) else second
# inspired by this noticeably faster code:
# https://gist.github.com/p-hash/9e0f9904ce7947c133308fbe48fe032b
if str1 == str2:
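The diff narrows `levenshtein_dist` to plain `str` arguments; the body of the function is collapsed above, but the distance itself is the classic two-row dynamic-programming computation, sketched here (an illustrative implementation, not necessarily the exact code in the file):

``` python
def levenshtein_dist(str1: str, str2: str) -> int:
    """Two-row dynamic-programming Levenshtein distance."""
    if str1 == str2:
        return 0
    # Keep the longer string in str1 so the row is as short as possible.
    if len(str1) < len(str2):
        str1, str2 = str2, str1
    previous = list(range(len(str2) + 1))
    for i, c1 in enumerate(str1, start=1):
        current = [i]
        for j, c2 in enumerate(str2, start=1):
            cost = 0 if c1 == c2 else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]

assert levenshtein_dist("kitten", "sitting") == 3
assert levenshtein_dist("doughnut", "doughnuts") == 1
```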