diff --git a/docs/api/util.rst b/docs/api/util.rst index 41b635d93..bb7efbfd3 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -218,6 +218,11 @@ Modules The `thai_to_eng` function is a text conversion tool for translating Thai text into its English transliterated form. This is beneficial for rendering Thai words and phrases in an English context. +.. autofunction:: to_idna + :noindex: + + The `to_idna` function encodes Thai text with IDNA, the encoding used for Internationalized Domain Names (IDN), e.g. Thai domain names. + .. autofunction:: thai_word_tone_detector :noindex: diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index d25adb8ca..520d17f70 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -51,6 +51,7 @@ "thai_strptime", "thai_strftime", "thai_to_eng", + "to_idna", "thai_word_tone_detector", "thaiword_to_date", "thaiword_to_num", @@ -117,7 +118,7 @@ syllable_open_close_detector, ) from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa -from pythainlp.util.encoding import tis620_to_utf8 +from pythainlp.util.encoding import to_idna, tis620_to_utf8 from pythainlp.util import spell_words from pythainlp.util.abbreviation import abbreviation_to_full_text from pythainlp.util.pronounce import rhyme diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py index 5aa92723c..84883ec34 100644 --- a/pythainlp/util/encoding.py +++ b/pythainlp/util/encoding.py @@ -5,8 +5,8 @@ def tis620_to_utf8(text: str)->str: """ Convert TIS-620 to UTF-8 - :param str text: Text that uses TIS-620 encoding - :return: Text that uses UTF-8 encoding + :param str text: TIS-620 encoded text + :return: UTF-8 encoded text :rtype: str :Example: @@ -18,3 +18,22 @@ def tis620_to_utf8(text: str)->str: # output: 'กระทรวงอุตสาหกรรม' """ return text.encode("cp1252", "ignore").decode("tis-620") + + +def to_idna(text: str) -> str: + """ + Encode text with IDNA, as used in Internationalized Domain Name (IDN). 
+ + :param str text: Thai text + :return: IDNA-encoded text + :rtype: str + + :Example: + :: + + from pythainlp.util import to_idna + + to_idna("คนละครึ่ง.com") + # output: 'xn--42caj4e6bk1f5b1j.com' + """ + return text.encode("idna").decode("utf-8") diff --git a/tests/test_util.py b/tests/test_util.py index e45319c99..ee02a278c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -45,6 +45,7 @@ thaiword_to_time, time_to_thaiword, thai_to_eng, + to_idna, thaiword_to_num, thai_keyboard_dist, text_to_num, @@ -780,6 +781,9 @@ def test_syllable_open_close_detector(self): self.assertEqual(syllable_open_close_detector("มาก"), "close") self.assertEqual(syllable_open_close_detector("คะ"), "open") + def test_to_idna(self): + self.assertEqual(to_idna("คนละครึ่ง.com"), "xn--42caj4e6bk1f5b1j.com") + def test_thai_word_tone_detector(self): self.assertIsNotNone(thai_word_tone_detector("คนดี")) self.assertEqual(