forked from jneto04/ner-pt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ner_flair.py
113 lines (89 loc) · 3.47 KB
/
ner_flair.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.embeddings import BertEmbeddings
from flair.embeddings import CharacterEmbeddings, CharLMEmbeddings
from torch.optim.adam import Adam
from typing import List
from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.optim import SGDW
import os
import torch
import gensim
print(" ")
# Maps each column index of the data files to the name it is loaded under.
# The 'label' column is the one used as the tag type below.
columns = {0: 'token', 1: 'pos', 2: 'sublabel', 3: 'label'}
data_folder = "data/"
tag_type = 'label'

corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="train_selective.txt",
    test_file="test_selective.txt",
    dev_file="dev_selective.txt",
)
splits = (("Train", corpus.train), ("Test", corpus.test), ("Dev", corpus.dev))

# Sanity output: size of every split, then the first sentence of each split
# rendered with its 'label' tags.
print(" ")
for split_name, sentences in splits:
    print("%s len: " % split_name, len(sentences))
print(" ")
for split_name, sentences in splits:
    print("%s: " % split_name, sentences[0].to_tagged_string(tag_type))

# Tag dictionary built from the corpus for the chosen tag type.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(" ")
print("Tags: ")
print(tag_dictionary.idx2item)
print(" ")
from flair.embeddings import FlairEmbeddings

# NILC word embedding (Word2Vec skip-gram, 300 dimensions).
# The word2vec-format file is read with gensim and saved again as
# 'nilc.gensim', which is the file handed to WordEmbeddings.
# NOTE(review): this conversion is repeated on every run.
gensim.models.KeyedVectors.load_word2vec_format('skip_s300.txt').save('nilc.gensim')
nilc_embedding = WordEmbeddings('nilc.gensim')

# Flair embeddings, loaded from the forward and the backward model file
# (in that order).
flair_embedding_forward, flair_embedding_backward = (
    FlairEmbeddings(model_file)
    for model_file in ('flairBBP_forward-pt.pt', 'flairBBP_backward-pt.pt')
)

# All three embeddings are combined into one stacked embedding.
embedding_types: List[TokenEmbeddings] = [
    nilc_embedding,
    flair_embedding_forward,
    flair_embedding_backward,
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Sequence tagger over the stacked embeddings, with the CRF layer enabled.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# The trainer is given SGDW (from flair.optim) as its optimizer.
trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=SGDW)

# Training settings, gathered in one place; the first argument of train()
# is the output directory for the run.
training_options = dict(
    learning_rate=0.1,
    mini_batch_size=32,
    embeddings_in_memory=False,
    max_epochs=150,
    checkpoint=True,
)
trainer.train('resources/taggers/example-ner', **training_options)
from flair.visual.training_curves import Plotter

# Both input files live under the directory that was passed to trainer.train().
loss_log = 'resources/taggers/example-ner/loss.tsv'
weights_log = 'resources/taggers/example-ner/weights.txt'

# Plot the training curves first, then the weights.
plotter = Plotter()
plotter.plot_training_curves(loss_log)
plotter.plot_weights(weights_log)
path_eval = 'resources/taggers/example-ner/test.tsv'
######
# Each non-blank line of the test output has four space-separated fields:
# 'token tag1 tag2 score'.
# The CoNLL-2002 script needs only 'token tag1 tag2', so the score is dropped.
# Blank lines are copied through as a single "\n".
######
# Both files are handled by one `with`, so the output is flushed and closed
# even when a malformed line aborts the conversion. The input is opened first
# so that a missing test.tsv does not truncate an existing output file.
with open(path_eval, "r", encoding="utf8") as predictions, \
        open("conlleval_test.tsv", "w", encoding="utf8") as new_file:
    for line_number, line in enumerate(predictions, start=1):
        stripped = line.strip()
        if not stripped:
            # Blank or whitespace-only line: keep it as an empty separator line
            # instead of failing on the missing fields.
            new_file.write("\n")
            continue
        fields = stripped.split(" ")
        if len(fields) < 3:
            # Report where the input is malformed rather than raising a bare
            # IndexError from the field lookup.
            raise ValueError(
                "%s:%d: expected at least 3 space-separated fields, got %r"
                % (path_eval, line_number, stripped)
            )
        token, tag_1, tag_2 = fields[:3]
        new_file.write(token + " " + tag_1 + " " + tag_2 + "\n")
import subprocess

path_final = "conlleval_test.tsv"
print(" ")
print("--- CoNLL-02 METRICS EVALUATION ---")
# Flush our own output before the child process writes to the same stdout,
# so the banner cannot end up after the scorer's report when stdout is piped.
print(" ", flush=True)

# Run the CoNLL-2002 scoring script with the converted file on its stdin.
# An argument list is used instead of a shell command string, and the exit
# status is reported instead of being discarded. Failures are only printed,
# keeping this step best-effort as before.
try:
    with open(path_final, "rb") as conll_input:
        completed = subprocess.run(["perl", "conlleval_02.pl"], stdin=conll_input)
except FileNotFoundError as err:
    # Raised when either the input file or the perl executable is missing.
    print("Could not run conlleval_02.pl: %s" % err)
else:
    if completed.returncode != 0:
        print("conlleval_02.pl exited with status %d" % completed.returncode)