-
Notifications
You must be signed in to change notification settings - Fork 0
/
loadDataInMongo.py
39 lines (30 loc) · 1009 Bytes
/
loadDataInMongo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import argparse
from pymongo import MongoClient, IndexModel
import os
from tqdm import tqdm
import json
def load_json(inputFile):
    """Read the file at *inputFile* and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's default encoding, so dumps containing non-ASCII text load
    the same way on every system.

    Args:
        inputFile: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(inputFile, encoding="utf-8") as fp:
        return json.load(fp)
if __name__ == '__main__':
    # Command-line entry point: import a JSON dump into the local MongoDB
    # 'wiktionary' database, replacing any collection of the same name.
    parser = argparse.ArgumentParser()
    parser.add_argument("inputPath", type=str, help="path to a wiktionary dump")
    args = parser.parse_args()
    inputPath = args.inputPath

    # Number of documents sent per insert_many call; batching avoids one
    # network round-trip per document.
    batchSize = 1000

    # Parse the dump BEFORE touching the database, so an unreadable or
    # malformed file does not leave the previous collection dropped and empty.
    data = load_json(inputPath)

    # The collection is named after the file name up to its first dot.
    collectionName = os.path.basename(inputPath).split('.')[0]

    # The context manager closes the client connection when the import ends,
    # whether it succeeds or raises.
    with MongoClient() as client:
        collection = client['wiktionary'][collectionName]

        # Dropping the collection also removes its indexes, so no separate
        # drop_indexes() call is needed before recreating them.
        collection.drop()
        collection.create_indexes([
            IndexModel([('language', 1), ('synset', 1)])
        ])

        # The top level of the dump maps a language to a mapping of
        # synset -> document fields. Counting from 1 makes the progress
        # label run from 1/N to N/N.
        for i, (lang, v) in enumerate(data.items(), start=1):
            batch = []
            for synset, doc in tqdm(v.items(), "Importing {} ({}/{})".format(lang, i, len(data))):
                # Keys in doc are merged last, so a 'language' or 'synset'
                # key inside doc overrides the values set here.
                batch.append({
                    'language': lang,
                    'synset': synset,
                    **doc
                })
                if len(batch) >= batchSize:
                    collection.insert_many(batch)
                    batch = []
            # insert_many rejects an empty list, so only flush a non-empty
            # remainder (this also covers languages with no entries).
            if batch:
                collection.insert_many(batch)