-
Notifications
You must be signed in to change notification settings - Fork 4
/
annotate.py
119 lines (94 loc) · 4.03 KB
/
annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import sys
import argparse
import csv
import re
import fiona
import shapely.geometry
from shapely.geometry import Point
class Shape:
    """
    A datastructure to hold a shape (neighborhood, beat, etc)

    feature is indexed with the 'geometry' and 'properties' keys
    (annotate_csv passes the items yielded by a fiona collection).
    property_name selects which entry of feature['properties'] becomes
    the shape's name.

    Attributes set on the instance:
    polygon  -- the geometry built by shapely.geometry.shape
    name     -- feature['properties'][property_name]
    centroid -- the centroid of polygon, computed once up front
    """

    def __init__(self, feature, property_name):
        geometry = shapely.geometry.shape(feature['geometry'])
        self.polygon = geometry
        self.name = feature['properties'][property_name]
        self.centroid = geometry.centroid
def sort_shapes(shapes, point):
    """
    Orders the shapes nearest-first, measured from each shape's centroid
    to the given point.

    The list is sorted in place and the very same list object is
    returned, so the caller's list is reordered as a side effect.
    """
    def centroid_distance(candidate):
        return candidate.centroid.distance(point)

    shapes.sort(key=centroid_distance)
    return shapes
def find_neighborhood(shapes, lat, lng):
    """
    Looks up the name of the shape whose polygon contains (lat, lng).

    Returns the name of the first containing shape, or None when no
    shape contains the point. The shapes list is reordered in place by
    sort_shapes as a side effect.
    """
    # Note the argument order: the point is built as (lng, lat)
    location = Point(lng, lat)
    # Try the shapes with the closest centroids first, since the
    # containing shape is expected to be near the front of that ordering
    candidates = sort_shapes(shapes, location)
    return next(
        (candidate.name
         for candidate in candidates
         if candidate.polygon.contains(location)),
        None
    )
# Captures the first two signed decimal numbers in a string. Each number
# must contain a decimal point and must not be immediately followed by
# another sign, digit or dot.
lat_lng_rg = re.compile('.*?([+-]?\\d*\\.\\d+)(?![-+0-9\\.]).*?([+-]?\\d*\\.\\d+)(?![-+0-9\\.])')


def parse_lat_lng(lat_lng_string):
    """
    Extracts a (lat, lng) pair of floats from a Location column value.

    Accepts values such as "(29.98645605, -90.06910049)" as well as
    values carrying a degree symbol, "(29.98645605°,-90.06910049°)".
    Returns (None, None) when two decimal numbers cannot be found.
    """
    match = lat_lng_rg.search(lat_lng_string)
    if match is None:
        return (None, None)
    lat_text, lng_text = match.groups()
    return (float(lat_text), float(lng_text))
# Maps each supported shape dataset to the shapefile property that holds
# the value written to the output column.
_DATASET_PROPERTY_NAMES = {
    'neighborhoods': 'gnocdc_lab',
    'city_council_districts': 'NAME',
    'voting_precincts': 'PRECINCTID',
}


def _to_float(text):
    """Returns text converted to a float, or None if it is not a number."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def _row_lat_lng(row, options):
    """
    Reads (lat, lng) floats out of a csv row, or (None, None) if the row
    has no usable coordinates. The combined location column is used when
    options.loc_column is set; otherwise the separate lat/lng columns.
    """
    if options.loc_column is not None:
        return parse_lat_lng(row[options.loc_column])
    lat = _to_float(row[options.lat_column])
    lng = _to_float(row[options.lng_column])
    if lat is None or lng is None:
        return (None, None)
    return (lat, lng)


def annotate_csv(in_file, out_file, options):
    """
    Goes row by row through the in_file and writes each row to the
    out_file with one extra column naming the shape that contains the
    row's coordinates.

    in_file and out_file are open file objects handed to csv.reader and
    csv.writer. options supplies shape_dataset, output_column, and
    either loc_column or both lat_column and lng_column (0-indexed
    column positions).

    The extra column holds the shape name, 'N/A' when the row has no
    parseable coordinates, or None (written by csv as an empty field)
    when no shape contains the point. A progress line is printed for
    every data row. An empty in_file produces no output.

    Raises ValueError if options.shape_dataset is not a supported
    dataset.
    """
    dataset = options.shape_dataset
    # Fail early and clearly instead of hitting an unbound property_name
    # after the shapefile has already been opened.
    if dataset not in _DATASET_PROPERTY_NAMES:
        raise ValueError(
            "Unknown shape dataset %r; expected one of: %s"
            % (dataset, ', '.join(sorted(_DATASET_PROPERTY_NAMES)))
        )
    property_name = _DATASET_PROPERTY_NAMES[dataset]

    shape_file = "data/%s/%s.shp" % (dataset, dataset)
    # The collection is only needed while the shapes are being built, so
    # close it as soon as they are loaded.
    with fiona.open(shape_file) as fc:
        shapes = [Shape(feature, property_name) for feature in fc]

    reader = csv.reader(in_file)
    writer = csv.writer(out_file)

    # Write headers first, add the new output column.
    # next(reader, None) works on both Python 2 and 3 and tolerates an
    # empty input file.
    headers = next(reader, None)
    if headers is None:
        return
    headers.append(options.output_column)
    writer.writerow(headers)

    for row in reader:
        lat, lng = _row_lat_lng(row, options)
        # Compare against None explicitly: 0.0 is a valid coordinate.
        if lat is not None and lng is not None:
            neighborhood = find_neighborhood(shapes, lat, lng)
        else:
            neighborhood = 'N/A'
        row.append(neighborhood)
        writer.writerow(row)
        print("#%s lat: %s lng: %s -> %s" % (reader.line_num, lat, lng, neighborhood))
def main(options):
    """
    Validates the column options, then annotates options.input_file
    into options.output_file.

    Exits via sys.exit with a usage message unless either loc_column or
    both lat_column and lng_column are set. Columns are 0-indexed, so
    they are compared against None rather than tested for truthiness:
    column 0 is a valid choice.
    """
    has_loc = options.loc_column is not None
    has_lat_lng = (options.lat_column is not None
                   and options.lng_column is not None)
    if not (has_loc or has_lat_lng):
        sys.exit('You must set either --loc-column or both --lat-column and --lng-column')

    # NOTE(review): on Python 3 the csv docs recommend opening csv files
    # with newline=''; left unchanged here to keep Python 2 compatibility.
    with open(options.input_file, 'r') as in_file:
        with open(options.output_file, 'w') as out_file:
            annotate_csv(in_file, out_file, options)
if __name__ == '__main__':
    # Command-line entry point: four positional arguments plus the
    # optional column selectors, all handed to main() as one namespace.
    parser = argparse.ArgumentParser(description='Annotate csv file with shape tags')
    parser.add_argument('input_file', metavar='input_file', help='the input csv file path')
    parser.add_argument('output_file', metavar='output_file', help='the output csv file path')
    parser.add_argument('output_column', metavar='output_column', help='the name of the column you wish to add')
    # Reject unsupported datasets at parse time with a usage error.
    # Keep this list in sync with the dataset names handled in
    # annotate_csv.
    parser.add_argument(
        'shape_dataset',
        metavar='shape_dataset',
        choices=['neighborhoods', 'city_council_districts', 'voting_precincts'],
        help='the dataset of shapes to use'
    )
    parser.add_argument('--lat-column', type=int, help="the 0-indexed column position for latitude (if in it's own column)")
    parser.add_argument('--lng-column', type=int, help="the 0-indexed column position for longitude (if in it's own column)")
    parser.add_argument('--loc-column', type=int, help="the 0-indexed column position for location (if lat and lng are in one column)")
    main(parser.parse_args())