index.py
#!/usr/bin/python3
import re
import nltk
import sys
import getopt
import math
import os
import linecache
import _pickle as pickle


def usage():
    print("usage: " + sys.argv[0] + " -i directory-of-documents -d dictionary-file -p postings-file")

def get_sorted_file_names(in_dir):
    """
    Returns the list of file names of the documents to be indexed, sorted by their document IDs.
    In this case, the file name acts as the document ID.
    """
    # Convert file names to int to sort in natural numerical order
    files = [int(f) for f in os.listdir(in_dir) if os.path.isfile(os.path.join(in_dir, f))]
    files.sort()
    return files
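
# For illustration: os.listdir returns name strings, so sorting them directly
# would give lexicographic order ('1', '10', '2', ...); converting to int first
# yields the numeric order 1, 2, ..., 10 that the postings lists rely on.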

def create_postings_lists(in_dir):
    """
    Returns the postings lists created from the documents in `in_dir`.
    Applies sentence- and word-level tokenisation, stemming and case folding.
    """
    # { word_type : [ docID, ... ] }
    postings_lists = {}
    ALL_DOC_IDS = '## all doc IDs ##'  # Special token whose postings list contains every doc ID
    postings_lists[ALL_DOC_IDS] = []
    stemmer = nltk.stem.porter.PorterStemmer()
    files = get_sorted_file_names(in_dir)  # Process in sorted order so each postings list stays sorted
    for docID in files:
        file_path = os.path.join(in_dir, str(docID))
        postings_lists[ALL_DOC_IDS].append(docID)
        # Read the document line by line
        line_num = 1
        line = linecache.getline(file_path, line_num)
        while line != '':
            for sent_token in nltk.sent_tokenize(line):
                for word_token in nltk.word_tokenize(sent_token):
                    # Apply stemming and case folding after tokenisation
                    stemmed_word_token = stemmer.stem(word_token).lower()
                    if stemmed_word_token not in postings_lists:
                        postings_lists[stemmed_word_token] = []
                    # Append the doc ID unless it is already the last entry
                    # (docs are processed in sorted order, so checking the tail suffices)
                    postings = postings_lists[stemmed_word_token]
                    if len(postings) == 0 or postings[-1] != docID:
                        postings.append(docID)
            line_num += 1
            line = linecache.getline(file_path, line_num)
    return postings_lists
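
# For illustration (stems and doc IDs are made up), the structure returned by
# create_postings_lists might look like:
#   { '## all doc IDs ##': [1, 2, 3], 'run': [1, 3], 'index': [2], ... }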

def write_postings_list(postings_list, f):
    """
    Writes the stringified postings list to the file and returns its size in bytes.
    e.g. [1, 2, 5, 21] gets stringified to "1,2,5,21 ", and 9 is returned.
    """
    postings_list_str = ','.join([str(docID) for docID in postings_list]) + ' '
    f.write(postings_list_str)
    return len(postings_list_str)

def write_index_to_disk(postings_lists, out_dict, out_postings):
    """
    Writes the postings lists and the in-memory dictionary to the output files.
    """
    # {
    #     word_type : (
    #         num_docs,      # Number of documents containing this word
    #         offset_bytes,  # Position offset from the start of the postings file
    #         size_bytes,    # Size of the postings list written for this word
    #         skip_len       # Interval between entries linked by skip pointers
    #     )
    # }
    dictionary = {}
    # Write postings lists to the output file, and build the dictionary
    f = open(out_postings, 'w')
    offset = 0  # Bytes written so far (postings are ASCII, so string length equals byte size)
    for word in postings_lists:
        num_docs = len(postings_lists[word])
        skip_len = int(math.sqrt(num_docs))  # sqrt(n) heuristic for evenly-spaced skip pointers
        size_bytes = write_postings_list(postings_lists[word], f)
        dictionary[word] = (num_docs, offset, size_bytes, skip_len)
        offset += size_bytes
    f.close()
    # Write the dictionary to the output file
    f = open(out_dict, 'wb')
    pickle.dump(dictionary, f)
    f.close()
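
# A minimal sketch, not part of the original script, of how a search program
# might read a single postings list back using the dictionary written above.
# The function name `read_postings_list` and its parameters are illustrative.
def read_postings_list(word, dictionary, postings_file_path):
    """
    Returns the postings list stored for `word`, or [] if the word is not indexed.
    Assumes the (num_docs, offset, size_bytes, skip_len) layout used by
    write_index_to_disk, and that the postings file content is ASCII.
    """
    if word not in dictionary:
        return []
    num_docs, offset, size_bytes, skip_len = dictionary[word]
    with open(postings_file_path, 'rb') as f:
        f.seek(offset)  # Jump straight to this word's postings list
        postings_str = f.read(size_bytes).decode('ascii')
    # Parse "1,2,5,21 " back into [1, 2, 5, 21]
    return [int(docID) for docID in postings_str.strip().split(',')]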

def build_index(in_dir, out_dict, out_postings):
    """
    Builds the index from documents stored in the input directory,
    then outputs the dictionary file and postings file.
    """
    print('indexing...')
    postings_lists = create_postings_lists(in_dir)
    write_index_to_disk(postings_lists, out_dict, out_postings)
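
# Example invocation (the directory and file names are illustrative):
#   python3 index.py -i ./documents -d dictionary.txt -p postings.txt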

input_directory = output_file_dictionary = output_file_postings = None

try:
    opts, args = getopt.getopt(sys.argv[1:], 'i:d:p:')
except getopt.GetoptError:
    usage()
    sys.exit(2)

for o, a in opts:
    if o == '-i':  # input directory
        input_directory = a
    elif o == '-d':  # dictionary file
        output_file_dictionary = a
    elif o == '-p':  # postings file
        output_file_postings = a
    else:
        assert False, "unhandled option"

if input_directory is None or output_file_postings is None or output_file_dictionary is None:
    usage()
    sys.exit(2)

build_index(input_directory, output_file_dictionary, output_file_postings)