-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_integrate_data.sql
executable file
·117 lines (104 loc) · 3.07 KB
/
01_integrate_data.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
--%% Create tables -------------------------------------------------------------
create table if not exists station (
snapshot_id integer,
station_id integer,
snapshot_date text,
name text,
capacity integer,
latitude real,
longitude real,
proxy_id integer,
proxy_latitude real,
proxy_longitude real,
proxy_distance real,
primary key (snapshot_id)
);
create table if not exists trip (
trip_id integer,
start_datetime text,
stop_datetime text,
start_station_id integer,
start_station_name text,
stop_station_id integer,
stop_station_name text,
primary key (trip_id),
foreign key (start_station_id, start_station_name)
references station (station_id, name),
foreign key (stop_station_id, stop_station_name)
references station (station_id, name)
);
--%% Populate tables -----------------------------------------------------------
.mode csv
.import '| tail -n +2 data/divvy_stations_2013-2017.csv' station
.import '| tail -n +2 data/divvy_trips_2013-2017.csv' trip
select * from station limit 10;
select * from trip limit 10;
--%% Add columns for station snapshot ids --------------------------------------
alter table trip
add column start_station_snapshot_id integer references station (snapshot_id);
alter table trip
add column stop_station_snapshot_id integer references station (snapshot_id);
--%% Join trip and station on station id, name, closest snapshot date ----------
update trip set
start_station_snapshot_id = coalesce(
(
select s_.snapshot_id
from station as s_ left join trip as t_
on s_.station_id = t_.start_station_id
where t_.trip_id = trip.trip_id
and s_.station_id = trip.start_station_id
and s_.name = trip.start_station_name
order by abs(
strftime('%s', t_.start_datetime) - strftime('%s', s_.snapshot_date)
)
limit 1
),
(
select s_.snapshot_id
from station as s_ left join trip as t_
on s_.station_id = t_.start_station_id
where t_.trip_id = trip.trip_id
and s_.station_id = trip.start_station_id
order by abs(
strftime('%s', t_.start_datetime) - strftime('%s', s_.snapshot_date)
)
limit 1
)
),
stop_station_snapshot_id = coalesce(
(
select s_.snapshot_id
from station as s_ left join trip as t_
on s_.station_id = t_.stop_station_id
where t_.trip_id = trip.trip_id
and s_.station_id = trip.stop_station_id
and s_.name = trip.stop_station_name
order by abs(
strftime('%s', t_.stop_datetime) - strftime('%s', s_.snapshot_date)
)
limit 1
),
(
select s_.snapshot_id
from station as s_ left join trip as t_
on s_.station_id = t_.stop_station_id
where t_.trip_id = trip.trip_id
and s_.station_id = trip.stop_station_id
order by abs(
strftime('%s', t_.stop_datetime) - strftime('%s', s_.snapshot_date)
)
limit 1
)
);
--%% Output trips --------------------------------------------------------------
.headers on
.mode list
.separator ,
.once 'data/divvy_trips_2013-2017.csv'
select
trip_id,
start_datetime,
stop_datetime,
start_station_snapshot_id,
stop_station_snapshot_id
from trip;