-
Notifications
You must be signed in to change notification settings - Fork 0
/
general.py
93 lines (71 loc) · 2.34 KB
/
general.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from nltk import word_tokenize
# import sys
# from importlib import reload
# reload(sys)
# sys.setdefaultencoding('utf8')
def preprocess(s):
    """Tokenize *s* and return it as one normalized, space-separated string.

    The text is split with NLTK's ``word_tokenize``; the resulting tokens are
    joined with single spaces, lower-cased, and stripped of leading and
    trailing whitespace.
    """
    tokens = word_tokenize(s)
    return ' '.join(tokens).lower().strip()
def sentence_to_ngram(sentence, n):
    """Return the list of word n-grams of order *n* found in *sentence*.

    The sentence is first normalized with ``preprocess`` and then split into
    tokens. Each n-gram is returned as a single string whose tokens are
    separated by one space, in left-to-right order.

    An empty list is returned when *n* is smaller than 1 or when the
    sentence has fewer than *n* tokens (including an empty sentence).
    """
    if n < 1:
        # An n-gram order below 1 has no meaningful result; without this
        # guard the slice below would yield a run of empty strings.
        return []
    # split() with no argument drops empty fields, so an empty sentence
    # gives no tokens instead of a single '' token that would otherwise be
    # counted as a unigram.
    tokens = preprocess(sentence).split()
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
def sentencelist_to_ngram(sentence_list, n, ngram_filter_set=None):
    """Count the n-grams of order *n* over every sentence in *sentence_list*.

    A progress percentage is printed to stdout after each sentence, ending
    with a carriage return so the line is overwritten in place.

    Returns a tuple ``(counts, N, V)``:
      * ``counts`` - dict mapping n-gram string to its occurrence count. When
        *ngram_filter_set* is truthy, only n-grams present in it are kept.
      * ``N`` - total number of n-gram occurrences, computed before filtering.
      * ``V`` - number of distinct n-grams, computed before filtering.
    """
    counts = {}
    for index, sentence in enumerate(sentence_list):
        for gram in sentence_to_ngram(sentence, n):
            if gram in counts:
                counts[gram] += 1
            else:
                counts[gram] = 1
        percent = "{0:.2f}".format((100 * index) / len(sentence_list))
        print('progress: ', percent, '%', end='\r')

    # Totals describe the whole corpus, so take them before any filtering.
    total_occurrences = sum(counts.values())
    distinct_ngrams = len(counts)

    if ngram_filter_set:
        counts = filter_ngrams(counts, ngram_filter_set)
    return counts, total_occurrences, distinct_ngrams
def filter_ngrams(ngrams, filter_set):
    """Return the entries of *ngrams* whose key appears in *filter_set*.

    The result is a new dict; keys are inserted in the iteration order of
    *filter_set*, and entries of *filter_set* missing from *ngrams* are
    skipped.
    """
    return {key: ngrams[key] for key in filter_set if key in ngrams}
def ngrams(config, ngram_filter, order):
    """Count n-grams of the given *order* in the corpus file named by *config*.

    The corpus path is read from ``config['PATH']['CORPUS']``. Every line of
    that file is stripped of surrounding whitespace and treated as one
    sentence. The sentences, *order* and *ngram_filter* are handed to
    ``sentencelist_to_ngram`` and its ``(counts, N, V)`` result is returned.
    """
    corpus_path = config['PATH']['CORPUS']
    # NOTE(review): the file is opened with the platform default encoding.
    with open(corpus_path) as corpus:
        sentences = [line.strip() for line in corpus]
    return sentencelist_to_ngram(sentences, order, ngram_filter)
def extract_constituent(compounds):
    """Split every string in *compounds* on single spaces and flatten.

    Returns one list holding the pieces of all compounds, in input order.
    """
    return [part for compound in compounds for part in compound.split(' ')]
def extract_from_sub(sub):
    """Collect every string associated with *sub* into one list.

    The list holds, in order: ``sub.compound``, the items of
    ``sub.modifier_alts``, ``sub.head_alts`` and ``sub.combined_alts``, and
    finally the space-separated constituents of all of those entries.
    """
    collected = [sub.compound]
    for alternatives in (sub.modifier_alts, sub.head_alts, sub.combined_alts):
        collected.extend(alternatives)
    # The constituents are computed from the entries gathered so far and
    # then appended after them.
    collected.extend(extract_constituent(collected))
    return collected
def create_filter(compound_to_alts):
    """Build the set of all strings reachable from *compound_to_alts*.

    Each value of the mapping is passed to ``extract_from_sub`` and every
    string it yields is added to the returned set. The mapping's keys are
    not used.
    """
    wanted = set()
    for sub in compound_to_alts.values():
        wanted.update(extract_from_sub(sub))
    return wanted
def sort_by_value(dictionary):
    """Return the ``(key, value)`` pairs of *dictionary*, largest value first.

    Pairs with equal values keep the relative order they have in the dict.
    """
    pairs = list(dictionary.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
def print_tuples(list_of_tuple):
    """Print the first two items of each entry in *list_of_tuple*, one per line.

    The two items are printed with a tab between them; ``print`` also puts
    its default single-space separator on each side of that tab.
    """
    for entry in list_of_tuple:
        first, second = entry[0], entry[1]
        print(first, '\t', second)
def write_tuples(list_of_tuple, path_to_file):
    """Write the entries of *list_of_tuple* to *path_to_file* as tab-separated lines.

    Each entry contributes one line: its first item, a tab, then its second
    item formatted as a number with three decimal places. The file is
    opened in ``'w'`` mode, so any existing content is replaced.
    """
    # The with-statement flushes and closes the file on exit, so no explicit
    # flush()/close() calls are needed.
    with open(path_to_file, 'w') as f:
        for entry in list_of_tuple:
            f.write("{0}\t{1:.3f}\n".format(entry[0], entry[1]))