eval.py
import argparse

import numpy as np
from nlgeval import NLGEval, compute_metrics, compute_individual_metrics
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge

nlgeval = NLGEval()  # instantiated up front; not referenced again below
parser = argparse.ArgumentParser()
# outdated parameters
parser.add_argument("--ground_truth", default='./data/output/gd.out', type=str, required=False)
parser.add_argument("--predict", default='./data/output/gen.out', type=str, required=False)
# argparse's type=bool is a pitfall (bool("False") is True), so parse the
# string form of the flag explicitly.
parser.add_argument("--trans", default=True, type=lambda s: str(s).lower() in ("true", "1"), required=False)
args = parser.parse_args()
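# Example invocation (default paths shown above; adjust to your layout):
#   python eval.py --predict ./data/output/gen.out --ground_truth ./data/output/gd.out --trans True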
# Convert format: each line of the model output/gold files is either plain
# text or "id<TAB>text"; keep only the text column and write one sentence
# per line to the --predict and --ground_truth files.
if args.trans:
    path = './model_gitter_rq2/'
    with open('%s/test_1.output' % path) as hypo, open('%s/test_1.gold' % path) as ref, \
            open(args.predict, "w") as f, open(args.ground_truth, "w") as f1:
        print('preprocess')
        hypo_lines = hypo.readlines()
        ref_lines = ref.readlines()
        for idx, hypo_line in enumerate(hypo_lines):
            cols = hypo_line.strip().split('\t')
            # Avoid shadowing the open file handle `hypo`.
            hypo_text = cols[0] if len(cols) == 1 else cols[1]
            cols = ref_lines[idx].strip().split('\t')
            ref_text = cols[0] if len(cols) == 1 else cols[1]
            f.write(hypo_text + '\n')
            f1.write(ref_text + '\n')
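# For example (made-up line), an input line "12\tfix the null check" is
# written out as just "fix the null check".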
smooth = SmoothingFunction()
metrics_dict = compute_metrics(hypothesis=args.predict,
                               references=[args.ground_truth],
                               no_skipthoughts=True, no_glove=True)
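# nlg-eval prints its scores as it computes them; with skip-thoughts and
# GloVe disabled, metrics_dict should hold Bleu_1..Bleu_4, METEOR, ROUGE_L
# and CIDEr (exact keys depend on the installed nlg-eval version).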
def compute_metrics1(predictions, labels):
    decoded_preds = predictions
    # Replace -100 in the labels as we can't decode them.
    decoded_labels = labels
    # Character level:
    # decoded_preds = [" ".join((pred.replace(" ", ""))) for pred in decoded_preds]
    # decoded_labels = [" ".join((label.replace(" ", ""))) for label in decoded_labels]
    # Word level (after tokenization):
    # decoded_preds = [" ".join(jieba.cut(pred.replace(" ", ""))) for pred in decoded_preds]
    # decoded_labels = [" ".join(jieba.cut(label.replace(" ", ""))) for label in decoded_labels]
    rouge = Rouge()
    # Whitespace-token counts per reference; averaged below as gen_len.
    labels_lens = [len(label.split()) for label in labels]
    total = 0
    rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
    for decoded_label, decoded_pred in zip(decoded_labels, decoded_preds):
        total += 1
        scores = rouge.get_scores(hyps=decoded_pred, refs=decoded_label)
        rouge_1 += scores[0]['rouge-1']['f']
        rouge_2 += scores[0]['rouge-2']['f']
        rouge_l += scores[0]['rouge-l']['f']
        bleu += sentence_bleu(
            weights=(1, 0, 0, 0),
            references=[decoded_label.split(' ')],
            hypothesis=decoded_pred.split(' '),
            smoothing_function=SmoothingFunction().method1
        )
    bleu /= len(decoded_labels)
    rouge_1 /= total
    rouge_2 /= total
    rouge_l /= total
    result = {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l}
    print(result)
    # Sanity check: per-sample averaging should match Rouge's built-in avg=True.
    result2 = rouge.get_scores(decoded_preds, decoded_labels, avg=True)
    print(result2)
    print(bleu)
    # result = {'rouge-1': result['rouge-1']['f'], 'rouge-2': result['rouge-2']['f'], 'rouge-l': result['rouge-l']['f']}
    result = {key: value * 100 for key, value in result.items()}
    result["gen_len"] = np.mean(labels_lens)
    result["bleu"] = bleu * 100
    return result
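# Minimal usage sketch (made-up strings; assumes whitespace-tokenized text):
#   compute_metrics1(["the cat sat"], ["the cat sat on the mat"])
# returns a dict with 'rouge-1', 'rouge-2', 'rouge-l' (scaled to 0-100),
# 'gen_len' and 'bleu'.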
def get_bleu(reference, hypothesis):
    # Smoothed BLEU-1 (unigram-only weights) over whitespace tokens.
    BLEUscore = sentence_bleu([reference.split(' ')], hypothesis.split(' '),
                              weights=(1, 0, 0, 0), smoothing_function=smooth.method1)
    return BLEUscore
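# Sketch with made-up strings: two of three unigrams match, so
#   get_bleu("the cat sat", "the cat ran")  # ~= 2/3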
def get_f1(bleu, rouge):
    # Harmonic mean of a BLEU score and a ROUGE score.
    f1 = (2 * bleu * rouge) / (bleu + rouge)
    return f1
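# Worked example: get_f1(0.25, 0.50) = (2 * 0.25 * 0.50) / (0.25 + 0.50) = 0.25 / 0.75 ≈ 0.333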
rouge = Rouge()
rouge1_score = 0
rouge2_score = 0
rougeL_score = 0
bleu1_score = 0
f1_score = 0
hyps = []
refs = []
with open(args.predict) as hypo, open(args.ground_truth) as ref:
    hypo_lines = hypo.readlines()
    ref_lines = ref.readlines()
    for idx, hypo_line in enumerate(hypo_lines):
        # Avoid shadowing the open file handles `hypo` and `ref`.
        hypo_text = hypo_line
        ref_text = ref_lines[idx]
        hyps.append(hypo_text)
        refs.append(ref_text)
        # metric = compute_individual_metrics(hypo_line, ref_lines[idx], no_skipthoughts=True, no_glove=True)
        scores = rouge.get_scores([hypo_text], [ref_text])
        rouge1_score += scores[0]['rouge-1']['f']
        rouge2_score += scores[0]['rouge-2']['f']
        rougeL_score += scores[0]['rouge-l']['f']
        bleu1_score += get_bleu(ref_text, hypo_text)
# Average the accumulated scores over the idx + 1 evaluated pairs.
print(rouge1_score / (idx + 1))
print(rouge2_score / (idx + 1))
print(rougeL_score / (idx + 1))
print(bleu1_score / (idx + 1))
# print(get_f1((bleu1_score / (idx + 1)), (rouge1_score / (idx + 1))))
print(compute_metrics1(hyps, refs))