-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_embedding.py
29 lines (24 loc) · 1.03 KB
/
word_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import concurrent.futures
# Checkpoint name handed to SentenceTransformer. The resulting model object is
# created once at module level and read as a global by generate_embedding().
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)
def generate_embedding(word):
    """Encode *word* with the module-level ``model`` and pair it with its vector.

    Args:
        word: Text passed unchanged to ``model.encode``.

    Returns:
        A ``(word, vector)`` tuple, where ``vector`` is the encoder output
        (requested as a tensor) converted with ``.tolist()``.
    """
    vector = model.encode(word, convert_to_tensor=True).tolist()
    return word, vector
# Script body: embed every line of the word list and save the mapping as JSON.

word_file = "word_processed.txt"
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default. NOTE(review): assumes the word list was written as UTF-8 -- confirm
# against whatever produces this file.
with open(word_file, "r", encoding="utf-8") as f:
    words = f.read().splitlines()

# Words are encoded in batches on the calling thread rather than as one
# thread-pool task per word: a single encode() call over a list lets the
# library batch the work, and the one shared model is no longer driven from
# several threads at once. Results are also stored in file order instead of
# task-completion order.
batch_size = 64
embeddings_dict = {}
with tqdm(total=len(words), desc="Generating embeddings") as progress:
    for start in range(0, len(words), batch_size):
        batch = words[start:start + batch_size]
        try:
            vectors = model.encode(
                batch,
                batch_size=batch_size,
                show_progress_bar=False,  # the outer tqdm bar already reports progress
                convert_to_tensor=True,
            ).tolist()
        except Exception:
            # Stay best-effort: retry this batch one word at a time so a
            # failing entry is reported and skipped without losing the
            # other words that shared its batch.
            for word in batch:
                try:
                    _, embedding = generate_embedding(word)
                except Exception as exc:
                    print(f"Error generating embedding for {word}: {exc}")
                else:
                    embeddings_dict[word] = embedding
        else:
            # Duplicate words simply overwrite their earlier entry.
            embeddings_dict.update(zip(batch, vectors))
        progress.update(len(batch))

output_file = "embeddings_dict.json"
with open(output_file, "w") as f:
    json.dump(embeddings_dict, f)