-
Notifications
You must be signed in to change notification settings - Fork 0
/
zagat_data.py
99 lines (70 loc) · 2.46 KB
/
zagat_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# coding=utf-8
"""
Simple PoC for scraping Zagat ratings given a restaurant name search string
"""
import json
import re
import requests
from lxml import etree
def find_zagat(name, key):
    """
    Search Zagat for a restaurant, falling back to progressively simpler names.

    Candidates are tried in this order, stopping at the first search that
    yields a URL:

    1. the full ``name``;
    2. the part of ``name`` before the first hyphen or em dash;
    3. that truncated name with everything from "Restaurant" or
       "Ristorante" onwards removed.

    A fallback candidate is only searched when it is non-empty and differs
    from the candidate it was derived from, so no request is repeated and no
    empty query is sent.

    :param name: restaurant name to look up.
    :param key: Zagat search key, passed through to ``search_zagat``.
    :return: ``(title, url)`` as returned by the last ``search_zagat`` call
        made; ``url`` is falsy when no candidate matched.
    """
    title, url = search_zagat(name, key)
    if url:
        return title, url

    # Many restaurant names have a dash for neighborhood in them, e.g.
    # "Name - Neighborhood". \u2014 is the em dash; the escape keeps the
    # pattern independent of the source file encoding. Strip so the query
    # does not carry the whitespace that preceded the dash.
    truncated_name = re.split(u'-|\u2014', name)[0].strip()
    if truncated_name and truncated_name != name:
        print(u"trying truncated: {}".format(truncated_name))
        title, url = search_zagat(truncated_name, key)
        if url:
            return title, url

    # Drop a generic "Restaurant ..." / "Ristorante ..." tail. A name that
    # starts with one of those words collapses to "" and is skipped.
    restaurant_stripped_name = re.sub(
        u'Restaurant.*|Ristorante.*', u'', truncated_name
    ).strip()
    if restaurant_stripped_name and restaurant_stripped_name != truncated_name:
        print(u"trying stripping restaurant: {}".format(restaurant_stripped_name))
        title, url = search_zagat(restaurant_stripped_name, key)

    return title, url
def search_zagat(name, key, timeout=10):
    """
    Uses the Zagat search endpoint to find restaurants by name. This is less
    comprehensive than a true search on Zagat.

    :param name: search string sent as the ``query`` parameter.
    :param key: search key sent as the ``key`` parameter. Do a search on
        zagat.com web and find your 32 character key from the search's GET
        request.
    :param timeout: seconds to wait for the HTTP response before
        ``requests`` raises, instead of blocking indefinitely.
    :return: ``(title, url)`` of the first search result, or ``(None, None)``
        when the response contains no usable result.
    """
    params = {
        'key': key,
        'query': name,
    }
    # NOTE(review): the endpoint mode and city=1020 are fixed in the URL;
    # what city 1020 denotes is not shown in this file.
    search_response = requests.get(
        u"https://www.zagat.com/proxy/v1.4?m=search-suggest&city=1020",
        params=params,
        timeout=timeout,
    )
    result = search_response.json()

    # Assume it's the first search result we want, but grab the title so the
    # caller can compare it later and be "sure".
    try:
        first_result = result["data"]["results"][0]
        title = first_result["title"]
        url = first_result["url"]
    except (KeyError, IndexError, TypeError):
        # KeyError/IndexError: missing field or empty result list.
        # TypeError: a level of the payload is null (None) or not a container.
        title = None
        url = None
    return title, url
def _first_text(matches, default):
    """
    Return the text of the first XPath match, or ``default`` when there is
    no match, the match has no text, or reading it fails.
    """
    try:
        text = matches[0].text
    except IndexError:
        # The XPath matched nothing on this page.
        return default
    except Exception as exc:
        # Best-effort scrape: report the problem and fall back.
        print(exc)
        return default
    # An element without text yields None; keep the sentinel consistent.
    return default if text is None else text


def get_zagat_data(url, timeout=10):
    """
    Scrape Zagat restaurant page for ratings / dollars.

    Both values are read from fixed, position-based XPath expressions, so
    they fall back to ``"NA"`` whenever the page layout does not match.

    :param url: address of the restaurant page to fetch.
    :param timeout: seconds to wait for the HTTP response before
        ``requests`` raises, instead of blocking indefinitely.
    :return: dict with keys ``'food_rating'`` and ``'dollars'``; each value
        is the matched element's text, or ``"NA"`` when unavailable.
    """
    place_response = requests.get(url, timeout=timeout)
    tree = etree.HTML(place_response.content)

    rating_match = tree.xpath('//*[@id="content"]/div[1]/div/div[1]/div/div/div[1]/div[1]')
    dollars_match = tree.xpath('//*[@id="container"]/div[2]/span/span[3]')

    return {
        'food_rating': _first_text(rating_match, "NA"),
        'dollars': _first_text(dollars_match, "NA")
    }