-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.py
executable file
·148 lines (125 loc) · 5.7 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
import csv
import json
import os
import re
import sys
from optparse import OptionParser
class Item:
    """Thin wrapper around the dict that holds one record's fields."""

    def __init__(self, data):
        # Kept by reference (no copy): changes made through this object are
        # visible on the dict that was passed in.
        self.data = data

    def to_dict(self):
        """Return the wrapped dict itself (the same object, not a copy)."""
        return self.data
class Ingredient(Item):
    """One ingredient line of a recipe."""

    # Construction is inherited unchanged from Item: Ingredient(data).

    def split_amount(self):
        """Add 'value' and 'unit' fields derived from the 'amount' field.

        'amount' is removed and re-inserted so that amount, value and unit end
        up next to each other in the dict (and therefore in the CSV columns).
        Raises KeyError when the record has no 'amount' key.
        """
        raw_amount = self.data.pop('amount')
        value, unit = split_ingredient_amount(raw_amount)
        self.data.update(amount=raw_amount, value=value, unit=unit)

    def normalize_value(self):
        """Replace 'value' with its normalized form; do nothing if it is absent or None."""
        current = self.data.get('value')
        if current is not None:
            self.data['value'] = normalize_value(current)
class Preparation(Item):
    """A single step of a recipe's preparation; adds nothing beyond Item."""
class Recipe(Item):
    """A recipe record plus its ingredients and preparation steps.

    `data` must provide a 'url' containing '/rezepte/<digits>/', an
    'ingredients' iterable of mappings and a 'preparation' iterable of texts.
    Optional 'durations' (mapping) and 'tags' (iterable of strings) are
    flattened to a single ' | '-separated string.
    """

    def __init__(self, data):
        # The numeric recipe id is taken from the URL path.
        recipe_id = re.search(r'/rezepte/(\d+)/', data['url']).group(1)
        super().__init__({'id': recipe_id, **data})

        # Ingredients and preparation steps get their own model objects, each
        # tagged with the recipe id and a 1-based position.
        raw_ingredients = self.data.pop('ingredients')
        self.ingredients = [
            Ingredient({'recipe_id': recipe_id, 'index': position, **fields})
            for position, fields in enumerate(raw_ingredients, start=1)
        ]
        raw_steps = self.data.pop('preparation')
        self.preparation = [
            Preparation({'recipe_id': recipe_id, 'index': position, 'text': text})
            for position, text in enumerate(raw_steps, start=1)
        ]

        # Multi-valued fields are squashed into one CSV cell each.
        durations = self.data.get('durations')
        if durations is not None:
            self.data['durations'] = ' | '.join(': '.join(pair) for pair in durations.items())
        tags = self.data.get('tags')
        if tags is not None:
            self.data['tags'] = ' | '.join(tags)
class CSVWriter:
    """A generic CSV writer for dicts, emitting headers based on the first item seen.

    The output file is opened (and truncated) on construction. Call close()
    when done, or use the instance as a context manager, so that buffered rows
    are flushed and the file handle is released.
    """

    def __init__(self, filename):
        # newline='' lets the csv module control line endings itself.
        self.file = open(filename, 'w', newline='')
        self.writer = None  # created lazily once the first row fixes the columns

    def write(self, data):
        """Write one dict as a CSV row; the first call also writes the header.

        The column set is the key list of the first dict written. Later dicts
        with unknown keys make csv.DictWriter raise ValueError; missing keys
        are written as empty cells.
        """
        if self.writer is None:
            # Snapshot the keys: a live dict view would change the columns if
            # the first row's dict were modified after being written.
            self.writer = csv.DictWriter(self.file, list(data.keys()))
            self.writer.writeheader()
        self.writer.writerow(data)

    def close(self):
        """Flush and close the underlying file; safe to call more than once."""
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
class RecipeWriter:
    """Writes a recipe to three different CSV files.

    For a prefix P the files are P.csv (recipes), P.ingredients.csv and
    P.preparation.csv. All three are opened on construction; call close() or
    use the instance as a context manager to flush and release them.
    """

    def __init__(self, output_prefix):
        self.recipes = CSVWriter(output_prefix + '.csv')
        self.ingredients = CSVWriter(output_prefix + '.ingredients.csv')
        self.preparation = CSVWriter(output_prefix + '.preparation.csv')

    def write(self, recipe):
        """Write the recipe row, then one row per ingredient and per preparation step."""
        self.recipes.write(recipe.to_dict())
        for ingredient in recipe.ingredients:
            self.ingredients.write(ingredient.to_dict())
        for preparation in recipe.preparation:
            self.preparation.write(preparation.to_dict())

    def close(self):
        """Close all three output files; safe to call more than once."""
        for writer in (self.recipes, self.ingredients, self.preparation):
            # Close through the file attribute that CSVWriter exposes.
            writer.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
def process_file(input, output, split_ingredients=False, normalize_values=False):
    """Convert the JSON lines input file to the various CSV files.

    Args:
        input: Path of the JSON Lines file, one recipe object per line.
        output: Filename prefix handed to RecipeWriter for the CSV outputs.
        split_ingredients: When true, split each ingredient amount into value and unit.
        normalize_values: When true, normalize each ingredient value.

    Blank lines in the input are skipped. The output files are closed before
    returning, including when a line fails to parse.
    """
    with open(input, 'r') as infile:
        recipe_writer = RecipeWriter(output)
        try:
            # Iterate the file lazily instead of loading every line up front.
            for line in infile:
                if not line.strip():
                    continue  # tolerate empty or whitespace-only lines
                recipe = Recipe(json.loads(line))
                for ingredient in recipe.ingredients:
                    if split_ingredients:
                        ingredient.split_amount()
                    if normalize_values:
                        ingredient.normalize_value()
                recipe_writer.write(recipe)
        finally:
            # Close the underlying files directly so buffered rows reach disk
            # even if processing stops early.
            for writer in (recipe_writer.recipes, recipe_writer.ingredients, recipe_writer.preparation):
                writer.file.close()
def main():
    """Command-line entry point: parse the options and convert the given input file.

    Exactly one positional argument (the input file) is required; otherwise
    the usage line is printed and the process exits with status 1.
    """
    parser = OptionParser(
        usage='usage: %prog [options] inputfile.jsonl',
        description='Convert the JSON Lines file returned from the recipe spider '
                    'into separate CSV files, with additional post-processing.'
    )
    parser.add_option(
        '-o', '--output', dest='output', metavar='PREFIX',
        help='Prefix CSV filenames with PREFIX (default derived from input file)',
    )
    parser.add_option(
        '-s', '--split-ingredient-amount', dest='split', action='store_true',
        help='Split ingredient amounts into value and unit',
    )
    parser.add_option(
        '-n', '--normalize-ingredient-value', dest='normalize', action='store_true',
        help='Normalize ingredient value (only makes sense with --split-ingredient-amount)',
    )

    options, args = parser.parse_args()
    if len(args) != 1:
        parser.print_usage()
        sys.exit(1)

    source_path = args[0]
    # Without --output, the prefix is the input path minus its extension.
    prefix = options.output or os.path.splitext(source_path)[0]
    process_file(
        source_path,
        prefix,
        split_ingredients=options.split,
        normalize_values=options.normalize,
    )
# Unicode vulgar-fraction characters and the numeric value each one stands for.
FRACTIONS = {
    '½': 1/2,
    '⅓': 1/3,
    '⅕': 1/5,
    '⅙': 1/6,
    '⅛': 1/8,
    '⅔': 2/3,
    '⅖': 2/5,
    '⅚': 5/6,
    '⅜': 3/8,
    '¾': 3/4,
    '⅗': 3/5,
    '⅝': 5/8,
    '⅞': 7/8,
    '⅘': 4/5,
    '¼': 1/4,
    '⅐': 1/7,
    '⅑': 1/9,
    '⅒': 1/10,
}

# Regex character class matching any single fraction character above.
_FRACTION_CLASS = '[' + ''.join(FRACTIONS) + ']'

# Group 1: the value (optional digits plus a fraction character, or a run of
# digits, commas and dots). Group 2: the optional unit text after it.
INGREDIENT_AMOUNT_RE = re.compile(
    r'\s*([0-9]*\s*' + _FRACTION_CLASS + r'|[0-9,.]+)\s*(\w.*)?$'
)

# Group 1: the (possibly empty) whole-number part. Group 2: the fraction character.
VALUE_WITH_FRACTION_RE = re.compile(r'^([0-9]*)\s*(' + _FRACTION_CLASS + r')$')
def split_ingredient_amount(s):
    """Return the value and unit for an ingredient amount like '500g' or '½ Stiel/e' etc.

    Args:
        s: The raw amount text, or None when the ingredient has no amount.

    Returns:
        A (value, unit) tuple. When the text starts with a number, value is
        that number as a string and unit is the remaining text (None if there
        is none). When it does not, value is None and unit is the input
        unchanged. A None input gives (None, None).
    """
    if s is None:
        # Mirror the no-match fallback below instead of raising TypeError.
        return None, None
    m = INGREDIENT_AMOUNT_RE.match(s)
    if m:
        return m.group(1), m.group(2)
    return None, s
def normalize_value(s):
    """Normalize a numerical ingredient amount value.

    Args:
        s: The value text (e.g. '500', '1,5', '1 ½'), or None.

    Returns:
        None for a None input; an int for plain digits; a float for a decimal
        number or for a value ending in a fraction character (whole part plus
        fraction); otherwise the stripped input string unchanged.
    """
    if s is None:
        return None
    s = s.strip()
    # Use point as decimal separator - can break with thousand separators (!)
    s = re.sub(r'^([0-9]+),([0-9]+)$', r'\1.\2', s)
    m = VALUE_WITH_FRACTION_RE.match(s)
    if m:
        whole = int(m.group(1)) if m.group(1) != '' else 0
        return whole + FRACTIONS[m.group(2)]
    if re.fullmatch(r'[0-9]+', s):
        return int(s)
    # The whole string must be a decimal number: a prefix-only match would let
    # inputs such as '1.000,5' or '1.5.2' reach float() and raise ValueError.
    if re.fullmatch(r'[0-9]+\.[0-9]+', s):
        return float(s)
    return s
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()