Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Seperate+Rename #863

Closed
wants to merge 29 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
e39b622
Merge pull request #3 from PyThaiNLP/dev
konbraphat51 Nov 9, 2023
6ea4181
documentation
konbraphat51 Nov 9, 2023
be29c00
Add: implemation
konbraphat51 Nov 9, 2023
3e94234
Add: test code
konbraphat51 Nov 9, 2023
ca6cd94
Add: remove_repeat_consonants()
konbraphat51 Nov 9, 2023
181664c
Merge branch 'dev' of https://github.com/konbraphat51/pythainlp into dev
konbraphat51 Nov 9, 2023
702be9a
Fix: push miss
konbraphat51 Nov 9, 2023
130b1ec
Fix: divide the exceeding length code
konbraphat51 Nov 9, 2023
ef8ac0f
Refac: remove last white space
konbraphat51 Nov 9, 2023
2df4d37
Fix: restrict only to consonants
konbraphat51 Nov 9, 2023
16c3154
Refac: Remove unused import
konbraphat51 Nov 9, 2023
cc62a95
Refac: Use enumerate
konbraphat51 Nov 9, 2023
d74af32
Fix: add the function in init
konbraphat51 Nov 9, 2023
5bfa50d
Refac: use black
konbraphat51 Nov 9, 2023
28b6006
Refac: repeatedly used black
konbraphat51 Nov 9, 2023
c6b564d
Refac: resolve nested if
konbraphat51 Nov 10, 2023
8d09323
Fix test case
konbraphat51 Nov 10, 2023
946f59c
Refac: seperate function
konbraphat51 Nov 10, 2023
a5153e0
Refac: reduce line length
konbraphat51 Nov 10, 2023
43dfd25
Refac: seperate 2 functions
konbraphat51 Nov 10, 2023
d9ae534
Refac: use black
konbraphat51 Nov 10, 2023
844c21d
Refac: seperate match finding method
konbraphat51 Nov 10, 2023
1e1631f
Improve: save consonants repeaters for improve speed
konbraphat51 Nov 10, 2023
ceb9d76
Refac: make repeater checking function
konbraphat51 Nov 10, 2023
6509e0d
Refac: seperate function
konbraphat51 Nov 11, 2023
9c1a34c
Improve: Rename method
konbraphat51 Nov 11, 2023
24c3050
Refac: make names more clear
konbraphat51 Nov 11, 2023
13cf54a
Refac: reflect method name change
konbraphat51 Nov 11, 2023
a94fccb
Fix: argument inconsistence
konbraphat51 Nov 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
remove_zw,
reorder_vowels,
)
from pythainlp.util.remove_trailing_repeat_consonants import remove_trailing_repeat_consonants
from pythainlp.util.numtoword import bahttext, num_to_thaiword
from pythainlp.util.strftime import thai_strftime
from pythainlp.util.thai import (
Expand Down
4 changes: 1 addition & 3 deletions pythainlp/util/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,7 @@
]

# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
_NOREPEAT_CHARS = (
f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
)
_NOREPEAT_CHARS = f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
_NOREPEAT_PAIRS = list(
zip([f"({ch}[ ]*)+{ch}" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS)
)
Expand Down
252 changes: 252 additions & 0 deletions pythainlp/util/remove_trailing_repeat_consonants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Removal of trailing repeated consonants
"""
from typing import List, Optional, Tuple

from pythainlp.corpus import thai_words
from pythainlp.util.trie import Trie
from pythainlp import thai_consonants as consonants

# Cache used by remove_trailing_repeat_consonants().
# For each Thai consonant it holds every dictionary word that ends with
# that consonant repeated two or more times.
# It is (re)filled by _update_consonant_repeaters(); when the dictionary
# in use changes, this cache must be rebuilt too.
# key: consonant
# value: list of words that have that consonant repeated at the end
last_consonants_repeaters = {}


def remove_trailing_repeat_consonants(
    text: str,
    dictionary: Optional[Trie] = None,
    has_dictionary_updated: bool = True,
) -> str:
    """
    Remove repeated consonants at the end of each segment of the text.

    The text is split on newlines and on single spaces. For every
    segment that ends with the same Thai consonant two or more times,
    the trailing run is shortened so that the end of the segment matches
    a word in the given dictionary.
    If there are several matches, the longest word is used.
    If there is no match, the repeated consonants are reduced to one.
    Since this function uses a dictionary, the result may differ
    depending on the dictionary used.
    It is recommended to apply normalize() first for a better result.

    :param str text: input text
    :param Trie dictionary: dictionary used to check the last word.
        If None, pythainlp.corpus.thai_words() will be used
    :param bool has_dictionary_updated: set this to True if the
        dictionary differs from the one used by the previous call; the
        cache of candidate words is then rebuilt from the dictionary.
        Set this to False to reuse the existing cache and save time.
        If the cache is still empty it is built regardless of this flag.
    :return: text without repeating Thai consonants
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_trailing_repeat_consonants
        from pythainlp.util import dict_trie

        # use default dictionary (pythainlp.corpus.thai_words())
        remove_trailing_repeat_consonants('เริ่ดดดดดดดด')
        # output: เริ่ด

        remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม')
        # output: อืมมม
        # "อืมมม" is in the default dictionary

        # use custom dictionary
        custom_dictionary = dict_trie(["อืมมมมม"])
        remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary)
        # output: อืมมมมม

        # long text
        remove_trailing_repeat_consonants(
            'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '
            'ฉันจะให้เกรดดีกับคุณณณ\\nนี่เป็นความลับบบบบ'
        )
        # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ
        #         นี่เป็นความลับ
    """
    # Rebuild the cache when the caller asks for it, and also when it has
    # never been built: otherwise has_dictionary_updated=False on the very
    # first call would look candidates up in an empty cache.
    if has_dictionary_updated or not last_consonants_repeaters:
        if dictionary is None:
            # fall back to the default dictionary
            dictionary = thai_words()
        _update_consonant_repeaters(dictionary)

    # Handle each space-separated segment of each line, then put the
    # spaces and newlines back exactly where they were.
    modified_lines = []
    for line in text.split("\n"):
        segments = [
            _remove_repeat_trailing_consonants_from_segment(segment)
            for segment in line.split(" ")
        ]
        modified_lines.append(" ".join(segments))

    return "\n".join(modified_lines)


def _remove_repeat_trailing_consonants_from_segment(segment: str) -> str:
    """
    Remove repeating consonants at the end of one segment.

    Only the end of the given segment is processed. The trailing run of
    a repeated Thai consonant is shortened to the repetition count of
    the longest cached dictionary word that matches, or to a single
    consonant if nothing matches.
    See remove_trailing_repeat_consonants() for the full description.

    :param str segment: segment of text (no spaces or newlines expected)
    :return: segment without repeating Thai consonants at the end
    :rtype: str
    """
    # Nothing to do unless the segment is at least two characters long,
    # ends with a Thai consonant, and that consonant is repeated.
    if (
        len(segment) < 2
        or segment[-1] not in consonants
        or segment[-1] != segment[-2]
    ):
        return segment

    # the duplicated character
    dup = segment[-1]

    # Words that end with 2 or more of this character.
    # .get() keeps an unpopulated cache from raising KeyError; with no
    # candidates the run below is simply reduced to one character.
    repeaters = last_consonants_repeaters.get(dup, [])

    # the segment with the whole trailing run removed
    segment_head = _remove_all_last_consonants(segment, dup)

    # find the longest word that matches the end of the segment
    longest_word, repetition = _find_longest_consonant_repeaters_match(
        segment_head, repeaters
    )

    if not longest_word:
        # No match: either the correct spelling has a single character,
        # or the word is not in the dictionary. Keep one character.
        repetition = 1

    return segment_head + (dup * repetition)


def _remove_all_last_consonants(text: str, dup: str) -> str:
    """
    Strip the run of one repeated character from the end of the text.

    Every trailing occurrence of ``dup`` is removed and the text that
    precedes the run is returned. If the text does not end with ``dup``
    it is returned unchanged.

    :param str text: input text
    :param str dup: the single repeating character to be removed
    :return: text without the repeating character at the end
    :rtype: str
    """
    # The comparison is defined per character, so anything that is not
    # exactly one character can never match; return the text untouched
    # rather than letting rstrip() treat it as a set of characters.
    if len(dup) != 1:
        return text

    # single linear pass instead of re-slicing the string per character
    return text.rstrip(dup)


def _update_consonant_repeaters(dictionary: Trie) -> None:
    """
    Rebuild the module-level cache of words ending in a repeated consonant.

    Every consonant gets a fresh empty list, then each word of the
    dictionary that ends with the same consonant two or more times is
    appended to the list of its final consonant.

    :param Trie dictionary: dictionary whose words are scanned
    :rtype: None
    """
    # reset: one empty bucket per consonant
    for key in consonants:
        last_consonants_repeaters[key] = []

    # file each qualifying word under its last character
    for entry in dictionary:
        if not _is_last_consonant_repeater(entry):
            continue
        last_consonants_repeaters[entry[-1]].append(entry)


def _is_last_consonant_repeater(word: str) -> bool:
    """
    Tell whether the word ends with a repeated consonant.

    The word qualifies when it has at least two characters, its last
    two characters are equal, and that character is a Thai consonant.

    :param str word: word to be checked
    :return: True if the word has repeating consonants at the end.
    :rtype: bool
    """
    # too short to contain a repetition
    if len(word) < 2:
        return False

    final = word[-1]
    # the last two characters must be the same one
    if final != word[-2]:
        return False

    return final in consonants


def _find_longest_consonant_repeaters_match(
    segment_head: str, repeaters: List[str]
) -> Tuple[str, int]:
    """
    Find the longest candidate word that matches the segment.

    Each candidate is stripped of its trailing repeated character; the
    candidate matches when what remains is found at the end of
    ``segment_head``. Among the matching candidates the longest one wins
    (the first one seen wins a tie).

    :param str segment_head: segment of text with its trailing
        repeated consonants already removed
    :param List[str] repeaters: list of words
        that have repeating consonants at the end
    :return: tuple of "the matched word" and
        "how many times its last character is repeated".
        If none, ("", 0) will be returned.
    :rtype: Tuple[str, int]
    """
    best_word = ""  # longest matching candidate so far
    best_count = 0  # length of that candidate's trailing run

    for candidate in repeaters:
        # the candidate without its trailing repeated character
        stem = _remove_all_last_consonants(candidate, candidate[-1])

        if stem:
            matched = segment_head.endswith(stem)
        else:
            # a candidate made only of the repeated character matches
            # only when nothing precedes the run in the segment
            matched = not segment_head
        if not matched:
            continue

        # keep it only if it is strictly longer than the current best
        if len(candidate) > len(best_word):
            best_word = candidate
            best_count = len(candidate) - len(stem)

    return best_word, best_count
41 changes: 35 additions & 6 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
ipa_to_rtgs,
remove_tone_ipa,
tis620_to_utf8,
remove_trailing_repeat_consonants
)
from pythainlp.util.spell_words import spell_word

Expand Down Expand Up @@ -832,7 +833,8 @@ def test_convert_years(self):
self.assertEqual(convert_years("242", src="re", target="ad"), "2023")
self.assertEqual(convert_years("242", src="re", target="ah"), "1444")
with self.assertRaises(NotImplementedError):
self.assertIsNotNone(convert_years("2023", src="cat", target="dog"))
self.assertIsNotNone(convert_years(
"2023", src="cat", target="dog"))

def test_nectec_to_ipa(self):
self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), 'kl uua j ˥˩')
Expand All @@ -846,17 +848,44 @@ def test_remove_tone_ipa(self):
self.assertEqual(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"), "laː.sa.maj")

def test_tis620_to_utf8(self):
self.assertEqual(tis620_to_utf8("¡ÃзÃǧÍصÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม")
self.assertEqual(tis620_to_utf8(
"¡ÃзÃǧÍصÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม")

def test_spell_word(self):
self.assertEqual(spell_word("เสือ"),['สอ', 'เอือ', 'เสือ'])
self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน'])
self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])
self.assertEqual(spell_word("เสือ"), ['สอ', 'เอือ', 'เสือ'])
self.assertEqual(spell_word("เสื้อ"), ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
self.assertEqual(spell_word("คน"), ['คอ', 'นอ', 'คน'])
self.assertEqual(spell_word("คนดี"), [
'คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])

def test_rhyme(self):
self.assertIsInstance(rhyme("แมว"), list)
self.assertTrue(len(rhyme("แมว")) > 2)

def test_remove_repeat_consonants(self):
    """Check remove_trailing_repeat_consonants() on words and long text."""
    # NOTE: the expectations that rely on the default dictionary can
    # break when pythainlp.corpus.thai_words() is updated.
    trimmed = remove_trailing_repeat_consonants('เริ่ดดดดดดดด')
    self.assertEqual(trimmed, 'เริ่ด')

    trimmed = remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม')
    self.assertEqual(trimmed, 'อืมมม')

    # a custom dictionary changes how many repetitions are kept
    custom_dictionary = dict_trie(["อืมมมมม"])
    trimmed = remove_trailing_repeat_consonants(
        'อืมมมมมมมมมมมมมมม', custom_dictionary
    )
    self.assertEqual(trimmed, 'อืมมมมม')

    # several segments separated by spaces and a newline
    long_text = (
        'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '
        'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ'
    )
    trimmed = remove_trailing_repeat_consonants(long_text)
    self.assertEqual(
        trimmed,
        'อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ'
    )

# def test_abbreviation_to_full_text(self):
# self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))
Loading