diff --git a/model/1.5.2/thai-ner-1-5-newmm-orchid.ipynb b/model/1.5.2/thai-ner-1-5-newmm-orchid.ipynb new file mode 100644 index 0000000..01a0de4 --- /dev/null +++ b/model/1.5.2/thai-ner-1-5-newmm-orchid.ipynb @@ -0,0 +1,810 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "file_name=\"train\"\n", + "import codecs\n", + "from tqdm import tqdm\n", + "from pythainlp.tokenize import word_tokenize\n", + "from pythainlp.tag import pos_tag\n", + "from nltk.tokenize import RegexpTokenizer\n", + "import glob\n", + "import nltk\n", + "import re\n", + "# thai cut\n", + "thaicut=\"newmm\"\n", + "from sklearn.metrics import make_scorer\n", + "from sklearn.model_selection import cross_validate,train_test_split\n", + "import pycrfsuite\n", + "from pythainlp.corpus.common import thai_stopwords" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "stopwords = list(thai_stopwords())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#จัดการประโยคซ้ำ\n", + "data_not=[]\n", + "def Unique(p):\n", + " text=re.sub(\"<[^>]*>\",\"\",p)\n", + " text=re.sub(\"\\[(.*?)\\]\",\"\",text)\n", + " text=re.sub(\"\\[\\/(.*?)\\]\",\"\",text)\n", + " if text not in data_not:\n", + " data_not.append(text)\n", + " return True\n", + " else:\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# เตรียมตัวตัด tag ด้วย re\n", + "pattern = r'\\[(.*?)\\](.*?)\\[\\/(.*?)\\]'\n", + "tokenizer = RegexpTokenizer(pattern) # ใช้ nltk.tokenize.RegexpTokenizer เพื่อตัด [TIME]8.00[/TIME] ให้เป็น ('TIME','ไง','TIME')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# จัดการกับ tag ที่ไม่ได้ tag\n", + "def toolner_to_tag(text):\n", + " text=text.strip().replace(\"FACILITY\",\"LOCATION\").replace(\"[AGO]\",\"\").replace(\"[/AGO]\",\"\").replace(\"[T]\",\"\").replace(\"[/T]\",\"\")\n", + " text=re.sub(\"<[^>]*>\",\"\",text)\n", + " text=re.sub(\"(\\[\\/(.*?)\\])\",\"\\\\1***\",text)#.replace('(\\[(.*?)\\])','***\\\\1')# text.replace('>','>***') # ตัดการกับพวกไม่มี tag word\n", + " text=re.sub(\"(\\[\\w+\\])\",\"***\\\\1\",text)\n", + " text2=[]\n", + " for i in text.split('***'):\n", + " if \"[\" in i:\n", + " text2.append(i)\n", + " else:\n", + " text2.append(\"[word]\"+i+\"[/word]\")\n", + " text=\"\".join(text2)#re.sub(\"[word][/word]\",\"\",\"\".join(text2))\n", + " return text.replace(\"[word][/word]\",\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# แปลง text ให้เป็น conll2002\n", + "def text2conll2002(text,pos=True):\n", + " \"\"\"\n", + " ใช้แปลงข้อความให้กลายเป็น conll2002\n", + " \"\"\"\n", + " text=toolner_to_tag(text)\n", + " text=text.replace(\"''\",'\"')\n", + " text=text.replace(\"’\",'\"').replace(\"‘\",'\"')#.replace('\"',\"\")\n", + " tag=tokenizer.tokenize(text)\n", + " j=0\n", + " conll2002=\"\"\n", + " for tagopen,text,tagclose in tag:\n", + " word_cut=word_tokenize(text,engine=thaicut) # ใช้ตัวตัดคำ newmm\n", + " i=0\n", + " txt5=\"\"\n", + " while i= 3584 and cVal <= 3711):\n", + " return True\n", + " return False\n", + "def isThaiWord(word):\n", + " t=True\n", + " for i in word:\n", + " l=isThai(i)\n", + " if l!=True and i!='.':\n", + " t=False\n", + " break\n", + " return t\n", + "\n", + "def is_stopword(word):\n", + " return word in stopwords\n", + "def is_s(word):\n", + " if word == \" \" or word ==\"\\t\" or word==\"\":\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "def lennum(word,num):\n", + " if len(word)==num:\n", + " return True\n", + " return False\n", + "def doc2features(doc, i):\n", + " word = doc[i][0]\n", + " postag = doc[i][1]\n", + " # Features from current word\n", + " features={\n", + " 'word.word': word,\n", + " 'word.stopword': is_stopword(word),\n", + " 'word.isthai':isThaiWord(word),\n", + " 'word.isspace':word.isspace(),\n", + " 'postag':postag,\n", + " 'word.isdigit()': word.isdigit()\n", + " }\n", + " if word.isdigit() and len(word)==5:\n", + " features['word.islen5']=True\n", + " if i > 0:\n", + " prevword = doc[i-1][0]\n", + " postag1 = doc[i-1][1]\n", + " features['word.prevword'] = prevword\n", + " features['word.previsspace']=prevword.isspace()\n", + " features['word.previsthai']=isThaiWord(prevword)\n", + " features['word.prevstopword']=is_stopword(prevword)\n", + " features['word.prepostag'] = postag1\n", + " features['word.prevwordisdigit'] = prevword.isdigit()\n", + " else:\n", + " features['BOS'] = True # Special \"Beginning of Sequence\" tag\n", + " # Features from next word\n", + " if i < len(doc)-1:\n", + " nextword = doc[i+1][0]\n", + " postag1 = doc[i+1][1]\n", + " features['word.nextword'] = nextword\n", + " features['word.nextisspace']=nextword.isspace()\n", + " features['word.nextpostag'] = postag1\n", + " features['word.nextisthai']=isThaiWord(nextword)\n", + " features['word.nextstopword']=is_stopword(nextword)\n", + " features['word.nextwordisdigit'] = nextword.isdigit()\n", + " else:\n", + " features['EOS'] = True # Special \"End of Sequence\" tag\n", + " return features\n", + "\n", + "def extract_features(doc):\n", + " return [doc2features(doc, i) for i in range(len(doc))]\n", + "\n", + "def get_labels(doc):\n", + " return [tag for (token,postag,tag) in doc]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100% 5089/5089 [00:08<00:00, 581.88it/s] \n", + "100% 5089/5089 [00:00<00:00, 113477.69it/s]\n" + ] + } + ], + "source": [ + "X_data = [extract_features(doc) for doc in tqdm(train_data)]\n", + "y_data = [get_labels(doc) for doc in tqdm(train_data)]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.03 s, sys: 35.9 ms, total: 3.07 s\n", + "Wall time: 3.07 s\n" + ] + } + ], + "source": [ + "%%time\n", + "trainer = pycrfsuite.Trainer(verbose=False)\n", + "i=0\n", + "for xseq, yseq in zip(X_data, y_data):\n", + " try:\n", + " trainer.append(xseq, yseq)\n", + " i+=1\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "trainer.set_params({\n", + " 'c1': 0.1, # coefficient for L1 penalty\n", + " 'c2': 0.1, # coefficient for L2 penalty\n", + " 'max_iterations': 500, # stop earlier\n", + "\n", + " # include transitions that are possible, but not observed\n", + " 'feature.possible_transitions': True\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['feature.minfreq',\n", + " 'feature.possible_states',\n", + " 'feature.possible_transitions',\n", + " 'c1',\n", + " 'c2',\n", + " 'max_iterations',\n", + " 'num_memories',\n", + " 'epsilon',\n", + " 'period',\n", + " 'delta',\n", + " 'linesearch',\n", + " 'max_linesearch']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.params()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 11min 51s, sys: 290 ms, total: 11min 51s\n", + "Wall time: 11min 51s\n" + ] + } + ], + "source": [ + "%%time\n", + "trainer.train('thai-ner-1-5-newmm-orchid.crfsuite')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100% 1274/1274 [00:02<00:00, 597.40it/s]\n", + "100% 1274/1274 [00:00<00:00, 161749.10it/s]\n" + ] + } + ], + "source": [ + "X_test = [extract_features(doc) for doc in tqdm(test_data)]\n", + "y_test = [get_labels(doc) for doc in tqdm(test_data)]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import LabelBinarizer\n", + "from itertools import chain\n", + "from sklearn.metrics import classification_report\n", + "def bio_classification_report(y_true, y_pred):\n", + " \"\"\"\n", + " Classification report for a list of BIO-encoded sequences.\n", + " It computes token-level metrics and discards \"O\" labels.\n", + " \n", + " Note that it requires scikit-learn 0.15+ (or a version from github master)\n", + " to calculate averages properly!\n", + " \"\"\"\n", + " lb = LabelBinarizer()\n", + " y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))\n", + " y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))\n", + " \n", + " tagset = set(lb.classes_) - {'O'}\n", + " tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])\n", + " class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}\n", + " \n", + " return classification_report(\n", + " y_true_combined,\n", + " y_pred_combined,\n", + " labels = [class_indices[cls] for cls in tagset],\n", + " target_names = tagset,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tagger = pycrfsuite.Tagger()\n", + "tagger.open('thai-ner-1-5-newmm-orchid.crfsuite')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "labels = list(tagger.info().labels.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "labels.remove('O')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import f1_score" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 746 ms, sys: 4.02 ms, total: 750 ms\n", + "Wall time: 749 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "y_pred = [tagger.tag(xseq) for xseq in X_test]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "labels = list(tagger.info().labels.keys())\n", + "labels.remove('O')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " B-DATE 0.93 0.83 0.87 350\n", + " I-DATE 0.94 0.93 0.93 665\n", + " B-LAW 0.88 0.53 0.66 87\n", + " I-LAW 0.88 0.64 0.74 253\n", + " B-LEN 1.00 0.75 0.86 12\n", + " I-LEN 1.00 0.69 0.82 26\n", + " B-LOCATION 0.81 0.69 0.75 620\n", + " I-LOCATION 0.75 0.68 0.71 533\n", + " B-MONEY 0.99 0.88 0.93 131\n", + " I-MONEY 0.98 0.93 0.95 321\n", + "B-ORGANIZATION 0.91 0.69 0.79 1334\n", + "I-ORGANIZATION 0.79 0.72 0.75 1198\n", + " B-PERCENT 0.88 0.88 0.88 17\n", + " I-PERCENT 0.88 0.95 0.91 22\n", + " B-PERSON 0.95 0.77 0.85 607\n", + " I-PERSON 0.93 0.88 0.91 2181\n", + " B-PHONE 1.00 0.50 0.67 2\n", + " I-PHONE 1.00 1.00 1.00 8\n", + " B-TIME 0.93 0.64 0.76 87\n", + " I-TIME 0.94 0.76 0.84 158\n", + " B-URL 0.91 0.83 0.87 12\n", + " I-URL 0.93 0.96 0.94 94\n", + "\n", + " micro avg 0.89 0.79 0.84 8718\n", + " macro avg 0.92 0.78 0.84 8718\n", + " weighted avg 0.89 0.79 0.83 8718\n", + " samples avg 0.16 0.16 0.16 8718\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/opt/conda/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ], + "source": [ + "print(bio_classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "!mv thai-ner-1-5-newmm-orchid.crfsuite thainer_crf_1_5_2.model" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/model/1.5.2/thainer_crf_1_5_2.model b/model/1.5.2/thainer_crf_1_5_2.model new file mode 100644 index 0000000..c7a846e Binary files /dev/null and b/model/1.5.2/thainer_crf_1_5_2.model differ