-
Notifications
You must be signed in to change notification settings - Fork 274
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a24407d
commit 3ebf721
Showing
4 changed files
with
26,428 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# -*- coding: utf-8 -*- | ||
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project | ||
# SPDX-License-Identifier: Apache-2.0 | ||
""" | ||
Provides an optional word list from International Components for Unicode (ICU) dictionary. | ||
""" | ||
from typing import FrozenSet | ||
|
||
from pythainlp.corpus.common import get_corpus | ||
|
||
_THAI_ICU = None | ||
_THAI_ICU_FILENAME = "thai_icu.txt" | ||
|
||
def thai_icu() -> FrozenSet[str]:
    """
    Return a frozenset of words from the International Components for Unicode (ICU) dictionary.

    The data is at pythainlp/corpus/thai_icu.txt
    The word list has been prepared from the ICU dictionary file at:
    https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/dictionaries/thaidict.txt

    The corpus is loaded on the first call and kept in the module-level
    ``_THAI_ICU`` variable, so later calls return the cached object.

    :return: :class:`frozenset` containing words in the International Components for Unicode (ICU) dictionary.
    :rtype: :class:`frozenset`
    """
    global _THAI_ICU
    # Test the None sentinel explicitly (not truthiness) so that an empty
    # word list is still cached instead of being reloaded on every call.
    if _THAI_ICU is None:
        _THAI_ICU = get_corpus(_THAI_ICU_FILENAME)

    return _THAI_ICU
Oops, something went wrong.