word2vec_test.py
# coding=utf-8
"""
Tests Word2vec training and incremental updating.
"""
import gensim
import utils.data_path as dp
import jieba
numbers = 5
# Files to read and the target model files to write
file_names = [dp.BlogContentSegPath + 'file1_' + str(i) + '.txt' for i in range(numbers)]
target_model_names = [dp.CSDNMODELPath + "csdn_model_1_" + str(i) + ".m" for i in range(numbers)]
"""
#首先对某个文件数据直接训练
file = open(file_names[0],encoding="utf8")
line = file.readline()
count = 0
result = []
while line:
if count % 10000 == 0:
print("已经对"+str(count)+"进行了分词")
line = line.strip().split("\001")
result.append(list(jieba.cut(line[2])))
line = file.readline()
count += 1
file.close()
model = gensim.models.Word2Vec(result, min_count=1)
model.save(target_model_names[0])
"""
# Load the earlier model and update its vocabulary with new data
model = gensim.models.Word2Vec.load(dp.CSDNMODELPath + "csdn_model_0.m")
file = open(file_names[0], encoding="utf8")
line = file.readline()
count = 0
result = []
while line:
    if count % 10000 == 0:
        print("Segmented " + str(count) + " lines")
    fields = line.strip().split("\001")
    result.append(list(jieba.cut(fields[2])))
    line = file.readline()
    count += 1
file.close()
model.build_vocab(result, update=True)  # update the vocabulary; update=True keeps the existing vocab instead of resetting it
model.train(result, total_examples=len(result), epochs=model.epochs)  # continue training; recent gensim requires explicit total_examples and epochs
model.save(target_model_names[1])  # note: saved as csdn_model_1_1, while the data above came from file1_0
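
# Illustrative aside (not in the original script): loading the saved model back
# and querying it. A minimal sketch assuming the gensim 4.x API, where word
# vectors live under `model.wv`; the query word "python" is only a hypothetical
# example and must exist in the training vocabulary. Kept inactive in a string
# block like the pass above:
"""
loaded = gensim.models.Word2Vec.load(target_model_names[1])
for word, similarity in loaded.wv.most_similar("python", topn=5):
    print(word, similarity)
"""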