Skip to content

Commit

Permalink
Add pythainlp.util.thai_to_idn
Browse files Browse the repository at this point in the history
  • Loading branch information
wannaphong committed Dec 3, 2023
1 parent f877567 commit d680c3b
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 1 deletion.
5 changes: 5 additions & 0 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,11 @@ Modules

The `thai_to_eng` function is a text conversion tool for translating Thai text into its English transliterated form. This is beneficial for rendering Thai words and phrases in an English context.

.. autofunction:: thai_to_idn
    :noindex:

The `thai_to_idn` function converts Thai text into its Internationalized Domain Name (IDN) encoding, i.e. the ASCII-compatible `xn--` form used for Thai domain names.

.. autofunction:: thai_word_tone_detector
    :noindex:

Expand Down
3 changes: 2 additions & 1 deletion pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"thai_strptime",
"thai_strftime",
"thai_to_eng",
"thai_to_idn",
"thai_word_tone_detector",
"thaiword_to_date",
"thaiword_to_num",
Expand Down Expand Up @@ -128,7 +129,7 @@
syllable_open_close_detector,
)
from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa
from pythainlp.util.encoding import tis620_to_utf8
from pythainlp.util.encoding import thai_to_idn, tis620_to_utf8
from pythainlp.util import spell_words
from pythainlp.util.abbreviation import abbreviation_to_full_text
from pythainlp.util.pronounce import rhyme
19 changes: 19 additions & 0 deletions pythainlp/util/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,22 @@ def tis620_to_utf8(text: str)->str:
# output: 'กระทรวงอุตสาหกรรม'
"""
return text.encode("cp1252", "ignore").decode("tis-620")


def thai_to_idn(text: str) -> str:
    """
    Encode a Thai domain name as an Internationalized Domain Name (IDN).

    The conversion is delegated to Python's built-in ``idna`` codec:
    each dot-separated label that contains non-ASCII characters is
    replaced by its ASCII-compatible ``xn--`` form, while labels that
    are already ASCII are returned unchanged.

    :param str text: domain name, possibly containing Thai characters
    :return: the domain name in IDNA (ASCII-compatible) encoding
    :rtype: str
    :raises UnicodeError: if the ``idna`` codec rejects the input
        (for example, an empty or over-long label)

    :Example:
    ::

        from pythainlp.util import thai_to_idn

        thai_to_idn("คนละครึ่ง.com")
        # output: 'xn--42caj4e6bk1f5b1j.com'
    """
    # The idna codec emits ASCII-only bytes, so decode them as ASCII.
    return text.encode("idna").decode("ascii")
4 changes: 4 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
thaiword_to_time,
time_to_thaiword,
thai_to_eng,
thai_to_idn,
thaiword_to_num,
thai_keyboard_dist,
text_to_num,
Expand Down Expand Up @@ -780,6 +781,9 @@ def test_syllable_open_close_detector(self):
self.assertEqual(syllable_open_close_detector("มาก"), "close")
self.assertEqual(syllable_open_close_detector("คะ"), "open")

def test_thai_to_idn(self):
    # Check that a Thai domain name converts to the expected IDNA string.
    domain = "คนละครึ่ง.com"
    expected = "xn--42caj4e6bk1f5b1j.com"
    self.assertEqual(thai_to_idn(domain), expected)

def test_thai_word_tone_detector(self):
self.assertIsNotNone(thai_word_tone_detector("คนดี"))
self.assertEqual(
Expand Down

0 comments on commit d680c3b

Please sign in to comment.