Skip to content

Commit

Permalink
update pep8
Browse files: browse the repository at this point in the history
  • Loading branch information
pavaris-pm committed Dec 10, 2023
1 parent 536f493 commit 76b49c3
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 35 deletions.
4 changes: 1 addition & 3 deletions pythainlp/augment/lm/phayathaibert.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,11 @@ def generate(self,
gen_txt = re.sub("<mask>", "", final_text)
return gen_txt


def augment(self,
text: str,
num_augs: int = 3,
sample: bool = False
)->List[str]:
) -> List[str]:
"""
Text Augment from phayathaibert
Expand Down Expand Up @@ -86,4 +85,3 @@ def augment(self,
raise ValueError(
f"augmentation of more than {num_augs} is exceeded the default limit"
)

2 changes: 1 addition & 1 deletion pythainlp/phayathaibert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
NamedEntityTagger,
PartOfSpeechTagger,
segment,
)
)
57 changes: 33 additions & 24 deletions pythainlp/phayathaibert/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def __init__(self):
"<unk> <rep> <wrep> <url> </s>".split()
self.SPACE_SPECIAL_TOKEN = "<_>"


def replace_url(self, text: str) -> str:
"""
Replace url in `text` with TK_URL (https://stackoverflow.com/a/6041965)
Expand Down Expand Up @@ -191,15 +190,18 @@ def __init__(self) -> None:
pipeline,)
self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name)
self.model = pipeline("fill-mask", tokenizer = self.tokenizer, model = self.model_for_masked_lm)
self.model = pipeline("fill-mask",
tokenizer = self.tokenizer,
model = self.model_for_masked_lm,
)
self.processor = ThaiTextProcessor()

def generate(self,
sample_text: str,
word_rank: int,
sample_text: str,
word_rank: int,
max_length: int = 3,
sample: bool = False,
)->str:
) -> str:
sample_txt = sample_text
final_text = ""
for j in range(max_length):
Expand All @@ -214,13 +216,12 @@ def generate(self,

gen_txt = re.sub("<mask>", "", final_text)
return gen_txt


def augment(self,
text: str,
num_augs: int = 3,
text: str,
num_augs: int = 3,
sample: bool = False,
)->List[str]:
) -> List[str]:
"""
Text Augment from phayathaibert
Expand Down Expand Up @@ -248,9 +249,12 @@ def augment(self,
'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
"""
augment_list = []
if num_augs <= 5:
if num_augs <= 5:
for rank in range(num_augs):
gen_text = self.generate(text, rank, sample = sample)
gen_text = self.generate(text,
rank,
sample = sample,
)
processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text))
augment_list.append(processed_text)

Expand Down Expand Up @@ -296,20 +300,22 @@ def get_tag(self,
outputs = pipeline(sentence)
word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]]
return word_tags



class NamedEntityTagger:
def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None:
def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None:
from transformers import (AutoTokenizer,
AutoModelForTokenClassification,
)
self.tokenizer = AutoTokenizer.from_pretrained(model)
self.model = AutoModelForTokenClassification.from_pretrained(model)
def get_ner(self,
text: str,
tag: bool = False,
pos: bool = False,
strategy: str = "simple",
)->Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:

def get_ner(self,
text: str,
tag: bool = False,
pos: bool = False,
strategy: str = "simple",
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
"""
This function tags named entities in text in IOB format.
Expand All @@ -333,11 +339,13 @@ def get_ner(self,
('จาก', 'LOCATION'),
('ประเทศไทย', 'LOCATION')]
>>> ner.tag("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย", tag=True)
'ทดสอบ<PERSON>นายปวริศ เรืองจุติโพธิ์พาน</PERSON><LOCATION>จาก</LOCATION><LOCATION>ประเทศไทย</LOCATION>'
'ทดสอบ<PERSON>นายปวริศ เรืองจุติโพธิ์พาน</PERSON>\
<LOCATION>จาก</LOCATION><LOCATION>ประเทศไทย</LOCATION>'
"""
from transformers import TokenClassificationPipeline
if pos:
warnings.warn("This model doesn't support output postag and It doesn't output the postag.")
warnings.warn("This model doesn't support output \
postag and It doesn't output the postag.")
sample_output = []
tag_text_list = []
current_pos = 0
Expand All @@ -363,10 +371,11 @@ def get_ner(self,
else:
return sample_output


def segment(sentence: str) -> List[str]:
"""
Subword tokenize of phayathaibert, sentencepiece from wangchanberta model with Vocabulary Expansion.
Subword tokenize of phayathaibert, \
sentencepiece from wangchanberta model with Vocabulary Expansion.
:param str text: text to be tokenized
:return: list of subwords
Expand All @@ -375,4 +384,4 @@ def segment(sentence: str) -> List[str]:
if not sentence or not isinstance(sentence, str):
return []

return _tokenizer.tokenize(sentence)
return _tokenizer.tokenize(sentence)
7 changes: 4 additions & 3 deletions pythainlp/tag/named_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,10 @@ def load_engine(self, engine: str, corpus: str) -> None:
)
)

def tag(
self, text, pos = False, tag = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
def tag(self,
text, pos = False,
tag = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
"""
This function tags named entities in text in IOB format.
Expand Down
8 changes: 4 additions & 4 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,10 @@ def pos_tag_transformers(
)
)


pipeline = TokenClassificationPipeline(
model = model, tokenizer = tokenizer, aggregation_strategy = "simple"
)
pipeline = TokenClassificationPipeline(model = model,
tokenizer = tokenizer,
aggregation_strategy = "simple",
)

outputs = pipeline(sentence)
word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]]
Expand Down

0 comments on commit 76b49c3

Please sign in to comment.