forked from ci-s/GBDP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pre_processing.jl
128 lines (105 loc) · 3.41 KB
/
pre_processing.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
include("types.jl")
#import JLD2, Knet
#Pkg.add("JLD")
#Pkg.add("Knet")
using JLD2,Knet
using Random  # shuffle — moved out of Base in Julia 1.0
# Pre-trained character language-model weights (JLD format) used to build the
# vocabulary — presumably produced by a separate training run; TODO confirm.
const language_model = "english_chmodel.jld"
# Default corpus: Universal Dependencies English dev set in CoNLL-U format.
const data_file = "en-ud-dev.conllu"
"""
    load_conllu(file, v::Vocab)

Parse a CoNLL-U formatted `file` and return a vector of `Sentence`s built
against vocabulary `v`. For each token line the word, xpostag, feats, postag
id, head position and deprel id are pushed onto the current sentence; a blank
line terminates a sentence. Unknown postags/deprels map to id 0 and unparsable
heads map to -1 (each warned about once).
"""
function load_conllu(file, v::Vocab)
    corpus = Any[]
    s = Sentence(v)
    for line in eachline(file)
        if line == ""
            # Blank line: sentence boundary.
            push!(corpus, s)
            s = Sentence(v)
        elseif (m = match(r"^\d+\t(.+?)\t.+?\t(.+?)\t.+?\t.+?\t(.+?)\t(.+?)(:.+)?\t", line)) !== nothing # modify that to use different columns
            # CoNLL-U columns: id word lemma upos xpos feats head deprel
            match_str = split(String(m.match), "\t")
            push!(s.xpostag, String(match_str[5]))
            push!(s.feats, String(match_str[6]))
            word = m.captures[1]
            push!(s.word, word)
            postag = get(v.postags, m.captures[2], 0)
            if postag == 0
                @warn "Unknown postags" maxlog=1
            end
            push!(s.postag, postag)
            head = tryparse(Position, m.captures[3])
            head = (head === nothing) ? -1 : head
            if head == -1
                @warn "Unknown heads" maxlog=1
            end
            push!(s.head, head)
            deprel = get(v.deprels, m.captures[4], 0)
            if deprel == 0
                @warn "Unknown deprels" maxlog=1
            end
            push!(s.deprel, deprel)
        end
    end
    # Flush the last sentence when the file has no trailing blank line;
    # the original version silently dropped it.
    if !isempty(s.word)
        push!(corpus, s)
    end
    return corpus
end
# To create vocabulary from pre-trained lstm model, modify that to use different cols
"""
    create_vocab(d)

Build a `Vocab` from the dictionary `d` loaded from a pre-trained LSTM model
file. The xpostag table starts empty; the postag and deprel tables fall back
to the universal tag sets `UPOSTAG`/`UDEPREL` when `d` does not supply them.
"""
function create_vocab(d)
    postags = get(d, "postags", UPOSTAG)
    deprels = get(d, "deprels", UDEPREL)
    return Vocab(d["char_vocab"], Dict{String, Int}(), d["word_vocab"],
                 d["sosword"], d["eosword"], d["unkword"],
                 d["sowchar"], d["eowchar"], d["unkchar"],
                 postags, deprels)
end
"""
    extend_vocab!(vocab::Vocab, train_corpus)

Assign integer ids to every xpostag and feats string seen in `train_corpus`,
rewriting each sentence's `xpostag`/`feats` fields in place with those ids,
and return an `ExtendedVocab` wrapping `vocab` together with the two new id
tables. Id 1 is reserved for `"<unk>"` in both tables.
"""
function extend_vocab!(vocab::Vocab, train_corpus)
    # Typed dicts avoid Any-boxed values; "<unk>" always gets id 1.
    xpos = Dict{String,Int}("<unk>" => 1)
    feats = Dict{String,Int}("<unk>" => 1)
    for sent in train_corpus
        # Intern each xpostag, allocating the next free id on first sight.
        sent.xpostag = [get!(xpos, xp, 1+length(xpos)) for xp in sent.xpostag]
        # `proc` splits the "|"-separated feature string and interns each part.
        sent.feats = [proc(feat, feats) for feat in sent.feats]
    end
    return ExtendedVocab(xpos, feats, vocab)
end
"""
    extend_corpus!(ev::ExtendedVocab, val_corpus)

Map the xpostag and feats strings of every sentence in `val_corpus` to the
ids recorded in `ev`, substituting the `"<unk>"` id for anything unseen
during training. Mutates the sentences in place and returns `val_corpus`.
"""
function extend_corpus!(ev::ExtendedVocab, val_corpus)
    for sent in val_corpus
        sent.xpostag = map(xp -> get(ev.xpostags, xp, ev.xpostags["<unk>"]),
                           sent.xpostag)
        sent.feats = map(f -> proc(f, ev.feats, false), sent.feats)
    end
    return val_corpus
end
"""
    proc(feat, feats, mutate=true)

Split the "|"-separated morphological feature string `feat` and map each
piece to an integer id in the table `feats`. When `mutate` is true, unseen
pieces are added to `feats` with the next free id; otherwise they map to the
id of `"<unk>"` (which must already be present in `feats`). The CoNLL-U
empty-feats placeholder `"_"` yields an empty vector.
"""
function proc(feat, feats, mutate=true)
    # valtype(feats)[] keeps both return paths consistently typed.
    feat == "_" && return valtype(feats)[]
    lookup(s) = mutate ? get!(feats, s, length(feats)+1) : get(feats, s, feats["<unk>"])
    # Typed comprehension instead of push! onto an untyped (Vector{Any}) [].
    return [lookup(s) for s in split(feat, "|")]
end
"""
    minibatch(corpus, batchsize; maxlen=typemax(Int), minlen=1, shuf=false)

Sort `corpus` by sentence length, keep only sentences with length in
`[minlen, maxlen]`, and split the remainder into consecutive batches of at
most `batchsize` similar-length sentences. With `shuf=true` the order of the
batches (not of sentences within a batch) is randomized. Returns the vector
of batches; throws an `ErrorException` when no sentence satisfies a bound.
"""
function minibatch(corpus, batchsize; maxlen=typemax(Int), minlen=1, shuf=false)
    data = Any[]
    sorted = sort(corpus, by=length)
    # findfirst/findlast with a predicate return `nothing` (not 0) when no
    # element matches — the original `i1==0` checks could never fire.
    i1 = findfirst(x->(length(x) >= minlen), sorted)
    if i1 === nothing; error("No sentences >= $minlen"); end
    i2 = findlast(x->(length(x) <= maxlen), sorted)
    if i2 === nothing; error("No sentences <= $maxlen"); end
    for i in i1:batchsize:i2
        j = min(i2, i+batchsize-1)
        push!(data, sorted[i:j])
    end
    if shuf
        data = shuffle(data)
    end
    return data
end
# Things we added