From a85f7f048a522d683ea0cd56a5c576e0b5aeee97 Mon Sep 17 00:00:00 2001
From: David Larlet
Date: Fri, 13 Jan 2017 09:24:06 -0500
Subject: [PATCH] =?UTF-8?q?=F0=9F=92=9E=20Stock2Reduce=20script?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* This script reduces a stock to the entries matching given conditions.
---
 CHANGELOG.md    |  7 ++++-
 README.md       | 29 +++++++++++++++++++++
 flux2stock.py   |  1 +
 stock2reduce.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 stock2reduce.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c2b18f..78e70e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## 2.4.0 — 2017-01-13 — 💞 Stock2Reduce script
+
+* This script reduces a stock to the entries matching given conditions.
+
+
 ## 2.3.2 — 2017-01-12 — 💪 Better parsing of zip files
 
 * The prior expectation was that the ziped file contains a csv file of the same name.
@@ -10,7 +15,7 @@
 * Some people explicitly want to be inserted/removed from the SIRENE file.
 
 
-## 2.3.0 — 2016-12-29 — 👪 Flux2stock script
+## 2.3.0 — 2016-12-29 — 👪 Flux2Stock script
 
 * This script converts a stock + flux into a new updated stock.
 
diff --git a/README.md b/README.md
index 03ba42b..8cab98b 100644
--- a/README.md
+++ b/README.md
@@ -740,6 +740,35 @@ The generation of a new stock takes aproximatively 15 minutes on a recent
 computer. The RAM consumption should stay low.
 
 
+### Stock2Reduce
+
+The aim of this script (available at the root of the repository) is to
+create a new, filtered stock file based on the name of a column and its
+value.
+
+You can use it this way:
+
+```shell
+$ python stock2reduce.py stock.zip stock-paca.csv RPET=93
+```
+
+Here `stock.zip` is the initial stock, `stock-paca.csv` is the name of
+the newly generated stock, `RPET` is the name of the column and `93` is
+the value of that column that you want to reduce on.
+
+The generation of a new stock takes approximately 7 minutes on a
+recent computer. The RAM consumption should stay low.
+
+Note that you can combine several filters at once:
+
+```shell
+$ python stock2reduce.py stock.zip stock-arles.csv DEPET=13 COMET=004
+```
+
+Here we match both the department code and the town code to build a
+valid and unique INSEE code and retrieve only the stock related to Arles.
+
+
 ## Contributing
 
 We’re really happy to accept contributions from the community, that’s the main reason why we open-sourced it! There are many ways to contribute, even if you’re not a technical person.
diff --git a/flux2stock.py b/flux2stock.py
index 82b0ac8..68b3b3d 100644
--- a/flux2stock.py
+++ b/flux2stock.py
@@ -79,6 +79,7 @@ def write_stock(stock_out, filtered_stock, modifications):
         print('Usages:')
         print(BASE_USAGE + 'stock-t+1.csv flux-t+1.zip')
         print(BASE_USAGE + 'stock-t+2.csv flux-t+1.zip flux-t+2.zip')
+        sys.exit()
     stock_in = sys.argv[1]
     stock_out = sys.argv[2]
     fluxs_zip = sys.argv[3:]
diff --git a/stock2reduce.py b/stock2reduce.py
new file mode 100644
index 0000000..b01fe67
--- /dev/null
+++ b/stock2reduce.py
@@ -0,0 +1,68 @@
+"""
+This script reduces a stock to the entries matching column name/value pairs.
+
+Before hacking, please benchmark the current script with the real stock.
+You should keep the duration and the RAM consumption as low as possible.
+"""
+
+import csv
+import io
+import sys
+from zipfile import ZipFile
+
+
+def _parse_zip_csv_file(filename):
+    """Yield each row from a zipped CSV file coming from INSEE."""
+    with ZipFile(filename) as zip_file:
+        for zip_info in zip_file.infolist():
+            if not zip_info.filename.endswith('.csv'):
+                continue
+            with zip_file.open(zip_info.filename) as csv_file:
+                csvio = io.TextIOWrapper(csv_file, encoding='cp1252')
+                reader = csv.DictReader(csvio, delimiter=';')
+                for i, row in enumerate(reader):
+                    # Not proud of passing fieldnames on each iteration,
+                    # but it beats a global variable.
+                    yield i, row, reader.fieldnames
+
+
+def filter_stock(stock_in, conditions):
+    """Yield only the entries matching all given (column, value) conditions."""
+    for i, row, fieldnames in _parse_zip_csv_file(stock_in):
+        if all(row[column] == value for column, value in conditions):
+            yield i, row, fieldnames
+
+
+def write_stock(stock_out, filtered_stock):
+    """
+    Generate the new stock file with the filtered entries.
+
+    We mimic the initial stock's encoding, quoting and delimiter.
+    """
+    with open(stock_out, 'w', encoding='cp1252') as csv_file:
+        _, first_row, fieldnames = next(filtered_stock)
+        # `extrasaction` is set to `ignore` so that rows carrying extra
+        # keys can still be passed to the `writerow` method.
+        writer = csv.DictWriter(
+            csv_file, fieldnames=fieldnames, delimiter=';',
+            quoting=csv.QUOTE_ALL, extrasaction='ignore')
+        writer.writeheader()
+        # The first row was already consumed to retrieve the fieldnames.
+        writer.writerow(first_row)
+        # Then write the rest of the filtered stock.
+        for i, row, _ in filtered_stock:
+            writer.writerow(row)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 4:
+        BASE_USAGE = 'python stock2reduce.py stock.zip '
+        print('Usages:')
+        print(BASE_USAGE + 'stock-paca.csv RPET=93')
+        print(BASE_USAGE + 'stock-arles.csv DEPET=13 COMET=004')
+        sys.exit()
+    stock_in = sys.argv[1]
+    stock_out = sys.argv[2]
+    conditions = [condition.split('=') for condition in sys.argv[3:]]
+    filtered_stock = filter_stock(stock_in, conditions)
+    write_stock(stock_out, filtered_stock)