Skip to content

Commit

Permalink
add thai ICU corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
pavaris-pm committed Dec 5, 2023
1 parent a24407d commit 3ebf721
Show file tree
Hide file tree
Showing 4 changed files with 26,428 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"thai_dict",
"thai_family_names",
"thai_female_names",
"thai_icu",
"thai_male_names",
"thai_negations",
"thai_orst_words",
Expand Down Expand Up @@ -110,5 +111,6 @@ def corpus_db_path() -> str:
thai_words,
thai_wsd_dict,
)
from pythainlp.corpus.thai_icu import thai_icu
from pythainlp.corpus.volubilis import volubilis
from pythainlp.corpus.wikipedia_titles import wikipedia_titles
29 changes: 29 additions & 0 deletions pythainlp/corpus/thai_icu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Provides an optional word list from International Components for Unicode (ICU) dictionary.
"""
from typing import FrozenSet

from pythainlp.corpus.common import get_corpus

# Module-level cache for the loaded word list; stays None until thai_icu()
# loads the corpus for the first time.
_THAI_ICU = None
# File name handed to get_corpus() by thai_icu() to load the word list.
_THAI_ICU_FILENAME = "thai_icu.txt"

def thai_icu() -> FrozenSet[str]:
    """
    Return a frozenset of words from the International Components for
    Unicode (ICU) dictionary.

    The data is at pythainlp/corpus/thai_icu.txt
    The word list has been prepared from the data at:
    https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/dictionaries/thaidict.txt

    The corpus is loaded through ``get_corpus`` on the first call and the
    result is stored in the module-level ``_THAI_ICU`` variable, so later
    calls return the same object without loading it again.

    :return: :class:`frozenset` containing words in the International
             Components for Unicode (ICU) dictionary.
    :rtype: :class:`frozenset`
    """
    global _THAI_ICU
    # Test against the None sentinel instead of truthiness: an empty
    # frozenset is falsy, so ``if not _THAI_ICU`` would reload the corpus
    # on every call whenever the loaded word list happens to be empty.
    if _THAI_ICU is None:
        _THAI_ICU = get_corpus(_THAI_ICU_FILENAME)

    return _THAI_ICU
Loading

0 comments on commit 3ebf721

Please sign in to comment.