-
Notifications
You must be signed in to change notification settings - Fork 0
/
FastText_cbow.py
55 lines (44 loc) · 1.58 KB
/
FastText_cbow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import sys
import nltk
import multiprocessing
from gensim.models import FastText
#nltk.download('punkt')
# Collect every .txt file under ./raw (searched recursively).
path = os.path.join(os.getcwd(), 'raw')
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        # endswith() instead of substring test: 'in' would also match
        # names like 'notes.txt.bak' that are not actual .txt files.
        if file.endswith('.txt'):
            files.append(os.path.join(r, file))
# Tokenize every corpus file line-by-line into `sentences` and count
# per-word occurrences in `diccionario` (word -> frequency).
sentences = []        # one token list per input line
diccionario = dict()  # vocabulary: word -> occurrence count
for f in files:
    # 'with' guarantees the handle is closed even if tokenization raises;
    # the original open()/readlines() leaked one handle per file.
    # NOTE(review): no explicit encoding — assumes platform default is
    # correct for the corpus; consider encoding="utf-8" — TODO confirm.
    with open(f, "r") as archivo:
        # Iterating the file object yields the same lines as readlines()
        # without materializing the whole file in memory first.
        for x in archivo:
            tokens = nltk.word_tokenize(x)
            sentences.append(tokens)
            for word in tokens:
                if word not in diccionario:
                    diccionario[word] = 1
                else:
                    diccionario[word] += 1
print ("numero de oraciones presentes en el corpus " + str(len(sentences)))
print ("numero de palabras unicas " + str(len(diccionario)))
# Train one FastText CBOW model per embedding dimensionality and save
# each in word2vec text format under ./model.
num_features = [20, 50, 100] #Dimensionality of the resulting word vectors
min_word_count = 1 #Minimum word count threshold
num_workers = multiprocessing.cpu_count() #Number of threads to run in parallel
context_size = 5 #Context window length
seed = 1 #Seed for the RNG, to make the result reproducible

# Create the output directory up front; save_word2vec_format does not
# create it and the original crashed when ./model was absent.
os.makedirs('model', exist_ok=True)

for p in num_features:
    # NOTE(review): 'size' is the gensim<4 keyword argument; gensim 4+
    # renamed it to 'vector_size' — confirm the installed gensim version.
    fasttext_model = FastText(
        sentences=sentences,
        size=p,
        window=context_size,
        min_count=min_word_count,
        workers=num_workers,
        seed=seed,  # original defined seed but never passed it, so runs were not reproducible
        sg=0 #cbow
    )
    fasttext_model.wv.save_word2vec_format('model/fasttext_cbow_model_chemprot_' + str(p) + '.txt', binary=False)
    # Release the model before training the next (larger) one.
    del fasttext_model