-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape.py
executable file
·76 lines (68 loc) · 2.28 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python
import csv
import os
import subprocess
from bs4 import BeautifulSoup
def main():
    """Download the timetable pages, then export one CSV per schedule/direction.

    The exports run in the order weekday-north, weekday-south,
    weekend-north, weekend-south.
    """
    fetch_schedule_data()
    for schedule in ('weekday', 'weekend'):
        for direction in ('north', 'south'):
            parse_schedule_data(schedule, direction)
def fetch_schedule_data():
    """Download the Caltrain weekday and weekend timetable pages with curl.

    The pages are written to ``data/weekday.htm`` and ``data/weekend.htm``,
    relative to the current working directory. The ``data`` directory is
    created if it does not exist yet.

    Requires a ``curl`` executable on ``PATH``. curl's exit status is not
    inspected, so a failed download does not raise here.

    Raises:
        OSError: if ``data`` cannot be created or ``curl`` cannot be started.
    """
    data_dir = 'data'
    downloads = (
        ('weekday.htm',
         'http://www.caltrain.com/schedules/weekdaytimetable.html'),
        ('weekend.htm',
         'http://www.caltrain.com/schedules/weekend-timetable.html'),
    )

    # Create the directory in-process instead of shelling out to `mkdir -p`.
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    for filename, url in downloads:
        # Give curl the full output path rather than chdir-ing into the
        # directory: changing the process-wide working directory would leave
        # the process stranded in `data/` if anything raised before the
        # directory was restored.
        subprocess.call(['curl', '-o', os.path.join(data_dir, filename), url])
def parse_schedule_data(schedule, direction):
    """Convert one timetable table of a downloaded page into a CSV file.

    Reads ``data/<schedule>.htm``, selects the table whose CSS class is
    ``<D>B_TT`` (``<D>`` being the upper-cased first letter of *direction*)
    and writes ``data/<schedule>_<direction>.csv``. The first CSV row is the
    header (an empty cell followed by the collected train ids); every
    following row is a stop name followed by its parsed times, truncated to
    the header's length.

    Args:
        schedule: Base name of the HTML file under ``data/``.
        direction: Direction name; only its first letter is used to pick the
            table, the full value is used in the output file name.

    Raises:
        ValueError: if the page contains no table with the expected class.
    """
    with open('data/%s.htm' % schedule) as f:
        soup = BeautifulSoup(f, 'html.parser')

    selector = "table.%sB_TT" % direction[0].upper()
    tbl = soup.select_one(selector)
    if tbl is None:
        # Fail with context instead of an AttributeError on None further down.
        raise ValueError(
            "no element matching %r in data/%s.htm" % (selector, schedule))

    table_rows = tbl.select('tr')

    # First pass: collect the train ids. The leading '' is the header cell
    # for the stop-name column.
    header = ['']
    for tr in table_rows:
        cells = tr.select('th')
        # Only rows with more than nine <th> cells contribute to the header
        # (presumably the train-number rows -- confirm against the page).
        if len(cells) > 9:
            for th in cells:
                train_id = th.text.replace("*", "")
                # Keep labels shorter than four characters only.
                if len(train_id) < 4:
                    header.append(train_id)

    # Second pass: one output row per stop. This needs the complete header
    # because each row is truncated to the header's length.
    rows = []
    for tr in table_rows:
        cells = tr.select('th')
        if len(cells) < 2:
            continue
        zone = cells[0].text.strip()
        if len(zone) != 1:  # busses don't have a zone
            continue
        row = [_parse_stop(cells[1].text)]
        row.extend(_parse_time(td.text) for td in tr.select('td'))
        rows.append(row[:len(header)])

    with open('data/%s_%s.csv' % (schedule, direction), mode='w') as out_file:
        csv_writer = csv.writer(out_file, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(header)
        csv_writer.writerows(rows)
def _parse_stop(text):
text = text.replace('Departs ', '').replace('Arrives ', '')
text = text.replace("So. San", "So San")
text = text.replace("SJ D", "San Jose D")
return text.replace(u'\xa0', u' ')
def _parse_time(text):
if ':' not in text:
return ''
h, m = text.strip().split(':')
if 'p' in m and int(h) < 12:
h = str(int(h) + 12)
elif 'a' in m and int(h) == 12:
h = str(int(h) + 12)
elif 'a' in m and int(h) < 3:
h = str(int(h) + 24)
return '%s:%s:00' % (h, m[0:2])
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()