-
Notifications
You must be signed in to change notification settings - Fork 0
/
knowledge.py
322 lines (262 loc) · 10.8 KB
/
knowledge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
from bs4 import BeautifulSoup
import requests
import re
from classes import Movie
from classes import Theatre
from showtime import Time
# Default first-page URL used by get_titles(): the Google movie listing for
# "near=bangalore". NOTE(review): date=0 presumably selects today's listings
# and sort=1 a by-movie ordering - confirm against the listing service.
start = "http://www.google.com/movies?near=bangalore&date=0&sort=1"
'''
get_titles()
iterate through pages of google 'near bangalore' movie listings.
add all movie titles on page to list of titles.
attempt to continue to next page as long as there
have been new movies added on this page
returns set of title names for movies playing Today
'''
def get_titles(startUrl=start):
    '''
    Collect the titles of every movie found on the paginated listing pages.

    Starting at startUrl, each page is fetched and the text of the <h2>
    inside every div with class "movie" is added to a set. The next page is
    requested by appending "&start=<number of titles collected so far>".
    Paging stops as soon as a page contributes no new title.

    returns: set of title strings, or None (after printing an error) when a
    request raises requests.exceptions.ConnectionError
    '''
    titles = set()
    page_url = startUrl
    while True:
        seen_before = len(titles)
        try:
            response = requests.get(page_url)
        except requests.exceptions.ConnectionError:
            print("Err: Failed to connect to the internet")
            return
        soup = BeautifulSoup(response.text, from_encoding="utf-8")
        for movie_div in soup.find_all("div", attrs={'class': 'movie'}):
            heading = movie_div.find('h2')
            if heading:
                titles.add(heading.text)
        page_url = startUrl + "&start=" + str(len(titles))
        # a page that added nothing new means we have run out of listings
        if len(titles) == seen_before:
            break
    return titles
# Sample data: the set of titles parsed on June 30th, kept as an example of
# what get_titles() produces for one date.
titles0630 = {
    u'Pandaga Chesko', u'Krishna Leela', u'Yagavarayinum Naa Kaakka',
    u'Inside Out 3d', u'Jurassic World', u'Vajrakaya',
    u'Indru Netru Naalai', u'Dil Dhadakne Do', u'Hamari Adhuri Kahani',
    u'Tiger', u'Abcd 2', u'Tanu Weds Manu Returns',
    u'Miss Tanakpur Haazir Ho', u'Me', u'Kaaka Muttai',
    u'Where Is Vidya Balan', u'Spy', u'Jadoogadu', u'Romeo Juliet',
    u'Ganapa', u'Insidious Chapter 3', u'Labour Of Love',
    u'Lodukku Pandi', u'Entourage', u'Goolihatti',
    u'Mr And Mrs Ramachari', u'Ranna', u'Aata Paata', u'Premam',
    u'Bela Sheshe', u'Thinkal Muthal Velli Vare',
}
'''
parse_theatres()
all 81 known theatres in bangalore are saved in bms_theatres.txt.
file is edited for uniformity (Theatre name: Address, Region, whatever)
open file and parse titles into Theatre objects
-removes the word bangalore/bengaluru from title
(file has been edited: banerghatta to bannerghatta, rajajinagar to
rajaji nagar)
deletes current set of theatres before starting
This is the only time that Theatre instances are initialised.
All other accesses are to modify the movies attribute
of a particular theatre
Must be called before anything else references Theatres
example lines from bms_theatres:
Innovative Multiplex: Marathahalli
INOX Lido: Off MG Road, Ulsoor
INOX: Forum Value Mall, Whitefield
INOX: Jayanagar - Garuda Swagath Mall
'''
def parse_theatres():
    '''
    Rebuild Theatre.theatres from the 'bms_theatres' listing file.

    Each usable line looks like "Company name: part, part - part".
    The leading run of words (up to the colon) is the company; the rest is
    split on " - " and ", " into a list of address parts, from which the
    city names "bengaluru" and "bangalore" are dropped (case-insensitive).
    A Theatre is constructed for every such line as
    Theatre(full_line, address_parts, company).

    Lines of 15 characters or fewer are skipped, as are lines that do not
    start with a word character. Reading stops at the first blank line or
    at end of file.

    returns: None. Theatre.theatres is reset to [] before parsing.
    '''
    Theatre.theatres = []
    company_re = re.compile(r"^(\w+(\s?))+")
    # address parts are separated by " - " or ", "
    separator_re = re.compile(r"\s\-\s|,\s")
    city_names = ("bengaluru", "bangalore")
    with open('bms_theatres', mode='r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # a blank line ends the listing
                break
            if len(line) <= 15:
                continue
            company_match = company_re.match(line)
            if company_match is None:
                # no leading company name; nothing sensible to parse
                continue
            ind = company_match.end()
            company = line[:ind]
            # skip the ": " that follows the company name
            address = separator_re.split(line[ind + 2:])
            # previously compared against ("bengaluru" or "bangalore"), which
            # is just "bengaluru", so "bangalore" was never removed
            address = [a for a in address if a.lower() not in city_names]
            # presumably the constructor registers the instance in
            # Theatre.theatres - the instance is not stored here
            Theatre(line, address, company)
'''
helper function for get_theatres
'''
def clean_inp(inp):
    '''
    Normalise a scraped theatre name before it is matched.

    The name is lower-cased. If it equals one of the theatres known to be
    absent from the book my show listing, the result is
    (False, error message). Otherwise the result is (True, cleaned name),
    where names containing "rockline" get " carnival" appended.

    returns: tuple of (Boolean ok, String cleaned name | error message)
    '''
    lowered = inp.lower()
    # manual special cases: theatres that are not in the bms listing
    unlisted = (
        u'Sri Radhakrishna',
        u'Sri Vinayaka Theatre Marathahalli',
        u'Olympia Theatre',
        u'Sri Renuka Prassana',
        u'Movieland',
    )
    if any(lowered == name.lower() for name in unlisted):
        return False, "Err: Not in the book my show theatre listing"
    # rockline cinema means rockline mall
    if "rockline" in lowered:
        return True, lowered + " carnival"
    return True, lowered
'''
sometimes address names have gaps, cannot directly do
a string comparison. promoting uniformity in bms_theatres file,
hence must be lenient with scraping names
ex: rajajinagar == rajaji nagar
banerghatta == bannerghatta
function checks if String addr is in String check_in, returns boolean
'''
def special_in(addr, check_in):
    '''
    Lenient containment test: is addr, or a known spelling variant of it,
    contained in check_in?

    Variants tried when addr itself is not found:
      - around the first "<any character>nagar": one with a space inserted
        after that character, one with that character removed
        (rajajinagar <-> rajaji nagar)
      - at the first "banerghatta"/"bannerghatta": everything before it
        followed by each of the two spellings

    returns: True if addr or any variant is in check_in, else False
    '''
    if addr in check_in:
        return True
    variants = []
    nagar = re.search(r"\Snagar|\snagar", addr)
    if nagar is not None:
        cut = nagar.start()
        variants.append(addr[:cut + 1] + ' ' + addr[cut + 1:])
        variants.append(addr[:cut] + addr[cut + 1:])
    road = re.search(r"banerghatta|bannerghatta", addr)
    if road is not None:
        prefix = addr[:road.start()]
        variants.append(prefix + "banerghatta")
        variants.append(prefix + "bannerghatta")
    return any(variant in check_in for variant in variants)
'''
clean(t)
helper function for search_theatres()
cleaning out unnecessary words from theatre names set
input: list of String theatre names
returns: list of String theatre names
'''
def clean(t):
    '''
    Strip filler words from theatre names.

    Every name is lower-cased, then "cinema"/"cinemas" (only when followed
    by whitespace or the end of the string), "theatre"/"theatres",
    "digital 2k" and "digital 4k" are removed, and the result is stripped
    of surrounding whitespace.

    input: list of String theatre names
    returns: new list of cleaned String theatre names, in the same order
    '''
    # raw strings: "\s" and "\Z" are regex escapes, not string escapes
    superfluous = [r"cinema(s?)(\s|\Z)", r"theatre(s?)", "digital 2k", "digital 4k"]
    cleaned = []
    for name in t:
        name = name.lower()
        for pattern in superfluous:
            name = re.sub(pattern, "", name)
        cleaned.append(name.strip())
    return cleaned
'''
search_theatres()
takes in String name, a scraped google theatre name, and matches it to a
single known theatre. must match to a single known theatre, otherwise it fails
(not the same as searching customer input theatre, because
there is no option for anyone to narrow down the search, but similar)
returns tuple of (Boolean found_instance, Theatre instance | error message)
'''
def search_theatres(inp):
    '''
    Match a scraped theatre name to exactly one known theatre.

    Steps:
      1. clean_inp() normalises the name; its failure tuple is returned
         unchanged.
      2. Every known theatre's cleaned company name is scored by how many
         of its words are found in the input (using special_in).
      3. If no word matches at all - or no theatres are loaded - the result
         is (False, "Err: Nothing matched").
      4. If several theatres share the best score, the one whose address
         parts occur most often in the input wins.

    Known limitation: if the address comparison is also tied, the first of
    the tied theatres is chosen.

    returns: tuple of (Boolean found, Theatre instance | error message)
    '''
    check, inp = clean_inp(inp)
    if not check:
        return check, inp
    theatre_names = clean([t.company.lower() for t in Theatre.theatres])

    def matched_words(name):
        # number of words of a company name that appear in the input
        return sum(1 for word in name.split() if special_in(word, inp))

    scores = [(matched_words(name), i) for i, name in enumerate(theatre_names)]
    # guard the empty case: max() of an empty list would raise ValueError
    best = max(score for score, _ in scores) if scores else 0
    if best == 0:
        return False, "Err: Nothing matched"

    # indices of all theatres that are equally (maximally) matched
    candidates = [i for score, i in scores if score == best]
    if len(candidates) == 1:
        return True, Theatre.theatres[candidates[0]]

    # narrow down further: count the address keywords found in the input
    address_scores = [
        sum(special_in(part.lower(), inp) for part in Theatre.theatres[i].address)
        for i in candidates
    ]
    # index() returns the first maximum, so ties fall to the first candidate
    winner = candidates[address_scores.index(max(address_scores))]
    return True, Theatre.theatres[winner]
'''
get_theatres():
iterate through google theatre listings
add theatres to dictionary, mapped to the movies they are showing
edits movies in dictionary
must have internet connection, otherwise returns False
invariants for namesToMovies and namesToTheatres:
we want the Movie and Theatres objects to have properly
capitalised names, but the key mappings should
all be lowercase
Movies keep track of String[] theaternames and Theaters keep track of
{String moviename:timings}. Hence getTheatres() returns two dictionaries
with the final dictionaries of Movies and Theatres. All edits should be
made directly to the dictionary, to make sure everything points to a single
instance of a movie/theatre. also returns a list of the original
google-scraped theatre names for testing
'''
def get_theatres():
    '''
    Build the knowledge base of movies and theatres from the Google listing.

    parse_theatres() is called first to (re)load the known theatres. The
    listing pages are then fetched one after another
    ("&start=<number of theatre names seen so far>") until a page contains
    no div with class "theater". For each theatre on a page:
      - its name is recorded in the list of scraped names,
      - search_theatres() maps it to a known Theatre (unmatched names are
        skipped),
      - for each of its movies a Movie is created on first sight, the
        theatre is put() on the Movie, and the movie with its parsed
        showtimes is put() on the Theatre.
    Movie names longer than 80 characters are treated as parse errors and
    skipped. Finally every known theatre that did not appear in the listing
    is added as well, so that a theatre is always recognised.

    All dictionary keys are lower-cased names.

    returns: (namesToMovies, namesToTheatres, theatreList) on success, or
    False (after printing an error) when a request raises
    requests.exceptions.ConnectionError
    '''
    parse_theatres()
    namesToMovies = {}
    namesToTheatres = {}
    startUrl = "http://www.google.com/movies?near=bangalore&date=0"
    url = startUrl
    theatreList = []
    while True:
        try:
            r = requests.get(url)
        except requests.exceptions.ConnectionError:
            print("Err: Failed to connect to the internet")
            return False
        # NOTE(review): r.text is already decoded text; Beautiful Soup
        # reportedly ignores from_encoding for Unicode input - confirm and
        # consider dropping the argument.
        soup = BeautifulSoup(r.text, 'lxml', from_encoding="utf-8")
        theatres = soup.find_all("div", attrs={'class': 'theater'})
        if not theatres:
            # an empty page marks the end of the listing
            break
        names_before = len(theatreList)
        for theatre_div in theatres:
            name = theatre_div.find('h2', class_='name')
            try:
                name = name.text
            except AttributeError:
                print("Err: theatre name doesn't exist")
                continue
            theatreList.append(name)
            # match theatres found to a single known theatre
            check, theatre = search_theatres(name)
            if not check:
                continue
            known_name = theatre.bms_name.lower()
            namesToTheatres[known_name] = theatre
            # for each movie in the theatre, create it if it doesn't exist
            for mov in theatre_div.find_all("div", class_='movie'):
                movieName = mov.find(class_='name')
                try:
                    movieName = movieName.text
                except AttributeError:
                    print("Err: movie name doesn't exist")
                    continue
                if len(movieName) > 80:
                    # todo, common known bug, parsed incorrectly somehow
                    print('Err: Text too long for movie name')
                    continue
                times = _parse_showtimes(mov)
                movie_key = movieName.lower()
                if movie_key not in namesToMovies:
                    namesToMovies[movie_key] = Movie(movieName)
                # add theatre to movie listing
                namesToMovies[movie_key].put(known_name)
                # add movie to theatre listing, keeping the timings there
                namesToTheatres[known_name].put(movie_key, times)
        if len(theatreList) == names_before:
            # the offset would not advance, so the same page would be
            # fetched again forever; stop instead
            break
        url = startUrl + "&start=" + str(len(theatreList))
    # add all theatres into dictionary, even if it doesn't have any movies
    # for today; that way, we can always recognise when a theatre is mentioned
    for known in Theatre.theatres:
        namesToTheatres.setdefault(known.bms_name.lower(), known)
    print("Knowledge base loaded")
    return namesToMovies, namesToTheatres, theatreList


def _parse_showtimes(mov):
    '''
    helper function for get_theatres
    Return a list of Time objects, one for each element of mov carrying the
    attribute style="color:" whose text Time() accepts.
    '''
    times = []
    for node in mov.find_all(attrs={'style': 'color:'}):
        text = node.text.replace(' ', '').strip()
        try:
            times.append(Time(text))
        except Exception:
            # presumably Time raises on text it cannot parse; skip it
            continue
    return times