From d680c3b0484cf3b0df6de2b4aee0a0ece93a8322 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 3 Dec 2023 19:01:23 +0700 Subject: [PATCH] Add pythainlp.util.thai_to_idn --- docs/api/util.rst | 5 +++++ pythainlp/util/__init__.py | 3 ++- pythainlp/util/encoding.py | 19 +++++++++++++++++++ tests/test_util.py | 4 ++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/docs/api/util.rst b/docs/api/util.rst index 41b635d93..401169feb 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -218,6 +218,11 @@ Modules The `thai_to_eng` function is a text conversion tool for translating Thai text into its English transliterated form. This is beneficial for rendering Thai words and phrases in an English context. +.. autofunction:: thai_to_idn + :noindex: + + The `thai_to_idn` function converts a Thai domain name into its Internationalized Domain Name (IDN) form, i.e. the IDNA-encoded ASCII text (labels starting with `xn--`). + .. autofunction:: thai_word_tone_detector :noindex: diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 55302507b..5ac977ade 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -62,6 +62,7 @@ "thai_strptime", "thai_strftime", "thai_to_eng", + "thai_to_idn", "thai_word_tone_detector", "thaiword_to_date", "thaiword_to_num", @@ -128,7 +129,7 @@ syllable_open_close_detector, ) from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa -from pythainlp.util.encoding import tis620_to_utf8 +from pythainlp.util.encoding import thai_to_idn, tis620_to_utf8 from pythainlp.util import spell_words from pythainlp.util.abbreviation import abbreviation_to_full_text from pythainlp.util.pronounce import rhyme diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py index 91f18f411..3853dbaac 100644 --- a/pythainlp/util/encoding.py +++ b/pythainlp/util/encoding.py @@ -29,3 +29,22 @@ def tis620_to_utf8(text: str)->str: # output: 'กระทรวงอุตสาหกรรม' """ return text.encode("cp1252",
"ignore").decode("tis-620") + + +def thai_to_idn(text: str)->str: + """ + Convert a Thai domain name to its Internationalized Domain Name (IDN) form. + + :param str text: Thai text + :return: Text that uses IDNA encoding + :rtype: str + + :Example: + :: + + from pythainlp.util import thai_to_idn + + thai_to_idn("คนละครึ่ง.com") + # output: 'xn--42caj4e6bk1f5b1j.com' + """ + return text.encode("idna").decode("utf-8") diff --git a/tests/test_util.py b/tests/test_util.py index e45319c99..95cf0c87f 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -45,6 +45,7 @@ thaiword_to_time, time_to_thaiword, thai_to_eng, + thai_to_idn, thaiword_to_num, thai_keyboard_dist, text_to_num, @@ -780,6 +781,9 @@ def test_syllable_open_close_detector(self): self.assertEqual(syllable_open_close_detector("มาก"), "close") self.assertEqual(syllable_open_close_detector("คะ"), "open") + def test_thai_to_idn(self): + self.assertEqual(thai_to_idn("คนละครึ่ง.com"), "xn--42caj4e6bk1f5b1j.com") + def test_thai_word_tone_detector(self): self.assertIsNotNone(thai_word_tone_detector("คนดี")) self.assertEqual(