-
Notifications
You must be signed in to change notification settings - Fork 453
/
postprocess.py
68 lines (50 loc) · 1.37 KB
/
postprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
import re
# Path given as the first command-line argument. The script reads its input
# lines from this file and writes results next to it as
# "<out_file>.extracted.txt". Raises IndexError when no argument is supplied.
out_file = sys.argv[1]
# Regular-expression patterns tried, in this order, by strip_prefix().
# Only the first pattern that matches a line is applied to it.
prefix = [
    '(learned[0-9]+ )+',
    'we can conclude that',
    'we have that',
    'in conclusion,',
]


def strip_prefix(line):
    """Drop everything up to and including a known lead-in phrase.

    The patterns in ``prefix`` are tried in order. For the first one found
    anywhere in ``line``, the text following its *last* occurrence is
    returned with surrounding whitespace stripped. When no pattern matches,
    ``line`` is returned exactly as given (not stripped).
    """
    matched = next(
        (pattern for pattern in prefix if re.search(pattern, line) is not None),
        None,
    )
    if matched is None:
        return line
    # re.split() may interleave captured groups, but the final element is
    # always the remainder after the last match.
    return re.split(matched, line)[-1].strip()
def convert_ansis_sentence(sentence):
    """Pull the answer out of a "the type of this document is ..." sentence.

    Returns the text that follows the first occurrence of the phrase
    ``the type of this document is`` (up to the end of that line), with
    surrounding whitespace stripped. Returns ``None`` when the phrase does
    not occur in ``sentence``. The result may be an empty string if nothing
    follows the phrase.
    """
    found = re.search(r"the type of this document is(.*)", sentence)
    if found is None:
        return None
    return found.group(1).strip()
# Load every line of the input file, trimming surrounding whitespace and
# removing a single trailing period when present.
all_lines = []
with open(out_file, "r", encoding="utf8") as fr:
    for raw in fr:
        text = raw.strip()
        all_lines.append(text[:-1] if text.endswith(".") else text)

# Extract one answer per input line. Lines with no recognizable answer are
# recorded as the literal "failed" so the output stays aligned with the input.
hypothesis = []
cnt = len(all_lines)
fail_cnt = 0
for line_no, text in enumerate(all_lines, start=1):
    answer = convert_ansis_sentence(strip_prefix(text))
    if answer is None:
        answer = "failed"
        fail_cnt += 1
        # Report the 1-based line number and the line as loaded above.
        print("Failed:id:{}, line:{}".format(line_no, text))
    hypothesis.append(answer)

# Write the extracted answers, one per line, next to the input file.
with open(f"{out_file}.extracted.txt", "w", encoding="utf8") as fw:
    fw.writelines(f"{eg}\n" for eg in hypothesis)

print(f"failed = {fail_cnt}, total = {cnt}")