""" ecospold2matrix - Class for recasting ecospold2 dataset in matrix form.
The module provides function to parse ecospold2 data, notably ecoinvent 3, as
Leontief A-matrix and extensions, or alternatively as supply and use tables for
the unallocated version of ecoinvent.
:PythonVersion: 3
:Dependencies: pandas 0.14.1 or more recent, scipy, numpy, lxml and xml
License: BDS
Authors:
Guillaume Majeau-Bettez
Konstantin Stadler
Evert Bouman
Radek Lonka
Credits:
This module re-uses/adapts code from brightway2data, more specifically the
Ecospold2DataExtractor class in import_ecospold2.py, changeset:
271:7e67a75ed791; Wed Sep 10; published under BSD-license:
Copyright (c) 2014, Chris Mutel and ETH Zürich
Neither the name of ETH Zürich nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE
COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
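Usage sketch (illustrative only; the directory layout and project name
below are hypothetical):
    import ecospold2matrix as e2m
    parser = e2m.Ecospold2Matrix('/path/to/ecoinvent3_system', 'demo')
    parser.ecospold_to_Leontief()   # or parser.ecospold_to_sut()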
"""
import IPython
import os
import glob
import subprocess
from lxml import objectify
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import scipy.sparse
import scipy.io
import logging
import pickle
import csv
import shelve
import hashlib
import sqlite3
import re
import xlrd
import xlwt
import copy
import shutil
# pylint: disable-msg=C0103
class Ecospold2Matrix(object):
"""
Defines a parser object that holds all project parameters and processes the
ecospold-formatted data into matrices of choice.
The two main functions of this class are ecospold_to_Leontief() and
ecospold_to_sut()
"""
# Some hardcoded stuff
__PRE = '{http://www.EcoInvent.org/EcoSpold02}'
__ELEXCHANGE = 'ElementaryExchanges.xml'
__INTERMEXCHANGE = 'IntermediateExchanges.xml'
__ACTIVITYINDEX = 'ActivityIndex.xml'
__DB_CHARACTERISATION = 'characterisation.db'
rtolmin = 1e-16 # 16 significant digits being roughly the limit of float64
def __init__(self, sys_dir, project_name, out_dir='.', lci_dir=None,
positive_waste=False, prefer_pickles=False, nan2null=False,
save_interm=True, PRO_order=['ISIC', 'activityName'],
STR_order=['comp', 'name', 'subcomp'],
verbose=True, version_name='ecoinvent31'):
""" Defining an ecospold2matrix object, with key parameters that
determine how the data will be processes.
Args:
-----
* sys_dir: directory containing the system description, i.e., ecospold
dataset and master XML files
* project_name: Name used to log progress and save results
* out_dir: Directory where to save result matrices and logs
* lci_dir: Directory where official cumulative LCI ecospold files are
* positive_waste: Whether or not to change sign convention and make
waste flows positive
[default false]
* prefer_pickles: If sys_dir contains pre-processed data in form of
pickle-files, whether or not to use those
[Default: False, don't use]
* nan2null: Whether or not to replace Not-a-Number by 0.0
[Default: False, don't replace anything]
* save_interm: Whether or not to save intermediate results as pickle
files for potential re-use
[Default: True, do it]
* PRO_order: List of meta-data used for sorting processes in the
different matrices.
[Default: first sort by order of ISIC code, then, within
each code, by order of activity name]
* STR_order: List of meta-data used for sorting stressors (elementary
flows) in the different matrices.
[Default: first sort by order of compartment,
subcompartment and then by name]
Main functions and workflow:
---------------------------
self.ecospold_to_Leontief(): Turn ecospold files into Leontief matrix
representation
* Parse ecospold files, get products, activities, flows, emissions
* If need be, correct inconsistencies in system description
* After corrections, create "final" labels for matrices
* Generate symmetric, normalized system description (A-matrix,
extension F-matrix)
* Save to file (many different formats)
* Optionally, read cumulative lifecycle inventories (slow) and
compare to calculated LCI for sanity testing
self.ecospold_to_sut(): Turn unallocated ecospold into Supply and Use
Tables
* Parse ecospold files, get products, activities, flows, emissions
* Organize in supply and use
* optionally, aggregate sources to generate a fully untraceable SUT
* Save to file
"""
# INTERMEDIATE DATA/RESULTS, TO BE GENERATED BY OBJECT METHODS
self.products = None # products, with IDs and descriptions
self.activities = None # activities, w IDs and description
self.inflows = None # intermediate-exchange input flows
self.outflows = None # intermediate-exchange output flows
self.elementary_flows = None # elementary flows
self.q = None # total supply of each product
self.PRO_old = None
self.STR_old = None
self.IMP_old = None
# FINAL VARIABLES: SYMMETRIC SYSTEM, NORMALIZED AND UNNORMALIZED
self.PRO = None # Process labels, rows/cols of A-matrix
self.STR = None # Factors labels, rows extensions
self.IMP = None # impact categories
self.A = None # Normalized Leontief coefficient matrix
self.F = None # Normalized factors of production, i.e.,
# elementary exchange coefficients
self.Z = None # Intermediate unnormalized process flows
self.G_pro = None # Unnormalized Process factor requirements
self.C = None # characterisation matrix
# Final variables, unallocated and unnormalized inventory
self.U = None # Table of use of products by activities
self.V = None # Table of supply of product by activities
# (amounts for which use is recorded)
self.G_act = None # Table of factor use by activities
self.V_prodVol = None # Table of supply production volumes
# (potentially to rescale U, V and G)
# QUALITY CHECKS VARIABLES, TO BE GENERATED BY OBJECT METHODS.
self.E = None # cumulative LCI matrix (str x pro)
self.unsourced_flows = None # product flows without clear source
self.missing_activities = None # cases of incomplete datasets, i.e.,
# no producer for a product
# PROJECT NAME AND DIRECTORIES, FROM ARGUMENTS
self.sys_dir = os.path.abspath(sys_dir)
self.project_name = project_name
self.out_dir = os.path.abspath(out_dir)
if lci_dir:
self.lci_dir = os.path.abspath(lci_dir)
else:
self.lci_dir = lci_dir
self.version_name = version_name
self.char_method = None # characterisation method set by
# read_characterisation function
self.data_version = None
# PROJECT-WIDE OPTIONS
self.positive_waste = positive_waste
self.prefer_pickles = prefer_pickles
self.nan2null = nan2null
self.save_interm = save_interm
self.PRO_order = PRO_order
self.STR_order = STR_order
# CREATE DIRECTORIES IF NOT IN EXISTENCE
if out_dir and not os.path.exists(self.out_dir):
os.makedirs(self.out_dir)
self.log_dir = os.path.join(self.out_dir, self.project_name + '_log')
# Start each run with a fresh log directory (portable equivalent of
# the previous 'rm -Rf' shell call)
shutil.rmtree(self.log_dir, ignore_errors=True)
os.makedirs(self.log_dir)
# MORE HARDCODED PARAMETERS
# Subcompartment matching
self.obs2char_subcomp = pd.DataFrame(
columns=["comp", "obs_sc", "char_sc"],
data=[["soil", "agricultural", "agricultural"],
["soil", "forestry", "forestry"],
["air", "high population density", "high population density"],
["soil", "industrial", "industrial"],
["air", "low population density", "low population density"],
["water", "ocean", "ocean"],
["water", "river", "river"],
["water", "river, long-term", "river"],
["air", "lower stratosphere + upper troposphere",
"low population density"],
["air", "low population density, long-term",
"low population density"]
])
# Default subcompartment when no subcomp match and no "unspecified"
# defined
self.fallback_sc = pd.DataFrame(
columns=["comp", "fallbacksubcomp"],
data=[[ 'water','river'],
[ 'soil', 'industrial'],
[ 'air', 'low population density']
])
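# Illustrative consequence of the fallback table above: an emission
# recorded in comp 'water' whose subcompartment matches nothing in
# obs2char_subcomp, with no 'unspecified' subcompartment defined by the
# characterisation method, would be characterised as 'water'/'river'.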
self._header_harmonizing_dict = {
'subcompartment':'subcomp',
'Subcompartment':'subcomp',
'Compartment':'comp',
'Compartments':'comp',
'Substance name (ReCiPe)':'charName',
'Substance name (SimaPro)':'simaproName',
'ecoinvent_name':'inventoryName',
'recipe_name':'charName',
'simapro_name':'simaproName',
'CAS number': 'cas',
'casNumber': 'cas',
'Unit':'unit' }
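# The mapping above is presumably applied to harmonize column headers
# when characterisation tables are read in, e.g. (illustrative only):
#     df = df.rename(columns=self._header_harmonizing_dict)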
# Read in parameter tables for CAS conflicts and known synonyms
def read_pandas_csv(path):
# Parse a pipe-separated parameter table; convert NaN cells to None
tmp = pd.read_csv(path, sep='|', comment='#')
return tmp.where(pd.notnull(tmp), None)
# NB: these paths are resolved relative to the current working directory
self._cas_conflicts = read_pandas_csv('parameters/cas_conflicts.csv')
self._synonyms = read_pandas_csv('parameters/synonyms.csv')
self._custom_factors = read_pandas_csv('parameters/custom_factors.csv')
# POTENTIAL OTHER ISSUES
## Names that don't fit with their cas numbers
#['2-butenal, (2e)-', '123-73-9', '2-butenal',
# 'cas of (more common) E configuration; cas of mix is'
# ' rather 4170-30-3'],
#['3-(1-methylbutyl)phenyl methylcarbamate', '2282-34-0',
# 'bufencarb', 'resolve name-cas collision in ReCiPe: CAS'
# ' points to specific chemical, not bufencarb (008065-36-9),'
# ' which is a mixture of this substance and phenol,'
# ' 3-(1-ethylpropyl)-, 1-(n-methylcarbamate)'],
#['chlordane (technical)', '12789-03-6', None,
# 'pure chlordane has cas 000057-74-9, and is also defined'
# ' for cis and trans. This one here seems to be more of a'
# ' mixture or low grade, no formula in scifinder'],
# DEFINE LOG TOOL
self.log = logging.getLogger(self.project_name)
self.log.setLevel(logging.INFO)
self.log.handlers = [] # reset handlers
if verbose:
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
fh = logging.FileHandler(os.path.join(self.log_dir,
project_name + '.log'))
fh.setLevel(logging.INFO)
aformat = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
formatter = logging.Formatter(aformat)
fh.setFormatter(formatter)
self.log.addHandler(fh)
if verbose:
ch.setFormatter(formatter)
self.log.addHandler(ch)
# RECORD OBJECT/PROJECT IDENTITY TO LOG
self.log.info('Ecospold2Matrix Processing')
try:
gitcommand = ["git", "log", "--pretty=format:%H", "-n1"]
githash = subprocess.check_output(gitcommand).decode("utf-8")
self.log.info("Current git commit: {}".format(githash))
except (OSError, subprocess.CalledProcessError):
# Not a git repository, or git not installed: skip commit logging
pass
self.log.info('Project name: ' + self.project_name)
# RECORD PROJECT PARAMETERS TO LOG
self.log.info('Unit process and Master data directory: ' + sys_dir)
self.log.info('Data saved in: ' + self.out_dir)
if self.lci_dir:
self.log.info('Official rolled-up life cycle inventories in: ' +
self.lci_dir)
if self.positive_waste:
self.log.info('Sign conventions changed to make waste flows '
'positive')
if self.prefer_pickles:
self.log.info('When possible, loads pickled data instead of'
' parsing ecospold files')
if self.nan2null:
self.log.info('Replace Not-a-Number instances with 0.0 in all'
' matrices')
if self.save_interm:
self.log.info('Pickle intermediate results to files')
self.log.info('Order processes based on: ' +
', '.join(self.PRO_order))
self.log.info('Order elementary exchanges based on: ' +
', '.join(self.STR_order))
database_name = self.project_name + '_' + self.__DB_CHARACTERISATION
# Remove any stale database from a previous run (portable equivalent
# of the previous 'rm' shell call)
try:
os.remove(database_name)
except FileNotFoundError:
pass
try:
self.conn = sqlite3.connect(database_name)
self.initialize_database()
self.conn.commit()
except sqlite3.Error:
self.log.warning("Could not establish connection to database")
# =========================================================================
# MAIN FUNCTIONS
def ecospold_to_Leontief(self, fileformats=None, with_absolute_flows=False,
lci_check=False, rtol=1e-2, atol=1e-5, imax=3,
characterisation_file=None,
ardaidmatching_file=None):
""" Recasts an full ecospold dataset into normalized symmetric matrices
Args:
-----
* fileformats : List of file formats in which to save data
[Default: None, save to all possible file formats]
Options: 'Pandas' --> pandas dataframes
'csv' --> text with separator = '|'
'SparsePandas' --> sparse pandas dataframes
'SparseMatrix' --> scipy AND matlab sparse
'SparseMatrixForArda' --> with special
background
variable names
* with_absolute_flows: If true, produce not only coefficient matrices
(A and F) but also scale them up to production
volumes to get absolute flows in separate
matrices. [default: false]
* lci_check : If true, and if lci_dir is not None, parse cumulative
lifecycle inventory data as self.E matrix (str x pro),
and use it for sanity check against calculated
cumulative LCI
* rtol : Initial (max) relative tolerance for comparing E with
calculated E
* atol : Initial (max) absolute tolerance for comparing E with
calculated E
* imax : Maximum number of times the comparison tolerances may be
relaxed before giving up on the LCI quality check
* characterisation_file: name of file containing characterisation
factors
* ardaidmatching_file: name of file matching Arda Ids, Ecoinvent2 DSIDs
and ecoinvent3 UUIDs. Only useful for the Arda
project.
Generates:
----------
* Intermediate data: products, activities, flows, labels
* A matrix: Normalized, intermediate exchange Leontief coefficients
(pro x pro)
* F matrix: Normalized extensions, factor requirements (elementary
exchanges) for each process (str x pro)
* E matrix: [optionally] cumulative normalized lci data (str x pro)
(for quality check)
Returns:
-------
* None, save all matrices in the object, and to file
"""
# Read in system description
self.extract_products()
self.extract_activities()
self.get_flows()
self.get_labels()
# Clean up if necessary
self.__find_unsourced_flows()
if self.unsourced_flows is not None:
self.__fix_flow_sources()
self.__fix_missing_activities()
# Once all is well, add extra info to PRO and STR, and order nicely
self.complement_labels()
# Finally, assemble normalized, symmetric matrices
self.build_AF()
if with_absolute_flows:
self.scale_up_AF()
if characterisation_file is not None:
print("starting characterisation")
self.process_inventory_elementary_flows()
self.read_characterisation(characterisation_file)
self.populate_complementary_tables()
self.characterize_flows()
self.generate_characterized_extensions()
if ardaidmatching_file:
self.make_compatible_with_arda(ardaidmatching_file)
# Save system to file
self.save_system(fileformats)
# Read/load cumulative LCI emissions and perform quality check
if lci_check:
self.get_cummulative_lci()
self.cummulative_lci_check(rtol, atol, imax)
self.log.info('Done running ecospold2matrix.ecospold_to_Leontief')
def ecospold_to_sut(self, fileformats=None, make_untraceable=False):
""" Recasts an unallocated ecospold dataset into supply and use tables
Args:
-----
* fileformats : List of file formats in which to save data
[Default: None, save to all possible file formats]
Options: 'Pandas' --> pandas dataframes
'SparsePandas' --> sparse pandas dataframes,
'SparseMatrix' --> scipy AND matlab sparse
'csv' --> text files
* make_untraceable: Whether or not to aggregate away the source
activity dimension, yielding a use table in which
products are no longer linked to their providers
[default: False; don't do it]
Generates:
----------
* Intermediate data: Products, activities, flows, labels
* V table Matrix of supply of product by activities
* U table Matrix of use of products by activities
(recorded for a given supply amount, from V)
* G_act Matrix of factor use by activities
(recorded for a given supply amount, from V)
* V_prodVol Matrix of estimated real production volumes,
arranged as supply table (potentially useful
to rescale U, V and G)
Returns:
-------
* None, save all matrices in the object, and to file
"""
# Extract data on products and activities
self.extract_products()
self.extract_activities()
# Extract or load data on flows and labels
self.get_flows()
self.get_labels()
self.complement_labels()
# Arrange as supply and use
self.build_sut(make_untraceable)
# Save to file
self.save_system(fileformats)
self.log.info("Done running ecospold2matrix.ecospold_to_sut")
# =========================================================================
# INTERMEDIATE WRAPPER METHODS: parse or load data + pickle results or not
def get_flows(self):
""" Wrapper: load from pickle or call extract_flows() to read ecospold
files.
Behaviour determined by:
------------------------
prefer_pickles: Whether or not to load flow lists from previous run
instead of (re)reading XML Ecospold files
save_interm: Whether or not to pickle flows to file for use in
another project run.
Generates:
----------
self.inflows
self.outflows
self.elementary_flows
Returns:
--------
None, only defines within object
"""
filename = os.path.join(self.sys_dir, 'flows.pickle')
# EITHER LOAD FROM PREVIOUS ROUND...
if self.prefer_pickles and os.path.exists(filename):
# Read all flows
with open(filename, 'rb') as f:
[self.inflows,
self.elementary_flows,
self.outflows] = pickle.load(f)
# Log event
sha1 = self.__hash_file(f)
msg = "{} loaded from {} with SHA-1 of {}"
self.log.info(msg.format('Flows', filename, sha1))
# ...OR EXTRACT FROM ECOSPOLD DATA..
else:
self.extract_flows()
# optionally, pickle for further use
if self.save_interm:
with open(filename, 'wb') as f:
pickle.dump([self.inflows,
self.elementary_flows,
self.outflows], f)
# Log event
sha1 = self.__hash_file(filename)
msg = "{} saved in {} with SHA-1 of {}"
self.log.info(msg.format('Flows', filename, sha1))
def get_labels(self):
"""
Wrapper: load from pickle, or call methods to build labels from scratch
Behaviour determined by:
------------------------
* prefer_pickles: Whether or not to load flow lists from previous run
instead of (re)reading XML Ecospold files
* save_interm: Whether or not to pickle flows to file for use in
another project run.
Generates:
----------
* PRO: metadata on each process, i.e. production of each product
by each activity.
* STR: metadata on each stressor (or elementary exchange, factor of
production)
Returns:
--------
* None, only defines within object
NOTE:
-----
* At this stage, labels are at the strict minimum (ID, name) to
facilitate the addition of new processes or stressors, if need be, to
"patch" inconsistencies in the dataset. Once all is sorted out, more
data from product, activities, and elementary_flow descriptions are
added to the labels in self.complement_labels()
"""
filename = os.path.join(self.sys_dir, 'rawlabels.pickle')
# EITHER LOAD FROM PREVIOUS ROUND...
if self.prefer_pickles and os.path.exists(filename):
# Load from pickled file
with open(filename, 'rb') as f:
self.PRO, self.STR = pickle.load(f)
# Log event
sha1 = self.__hash_file(f)
msg = "{} loaded from {} with SHA-1 of {}"
self.log.info(msg.format('Labels', filename, sha1))
# OR EXTRACT FROM ECOSPOLD DATA...
else:
self.build_PRO()
self.build_STR()
# and optionally pickle for further use
if self.save_interm:
with open(filename, 'wb') as f:
pickle.dump([self.PRO, self.STR], f)
# Log event
sha1 = self.__hash_file(filename)
msg = "{} saved in {} with SHA-1 of {}"
self.log.info(msg.format('Labels', filename, sha1))
def get_cummulative_lci(self):
""" Wrapper: load from pickle or call build_E() to read ecospold files.
Behaviour determined by:
------------------------
* prefer_pickles: Whether or not to load flow lists from previous run
instead of (re)reading XML Ecospold files
* save_interm: Whether or not to pickle flows to file for use in
another project run.
* lci_dir: Directory where cumulative LCI ecospold files are
Generates:
----------
* E: cumulative LCI emissions matrix
Returns:
--------
* None, only defines within object
"""
filename = os.path.join(self.lci_dir, 'lci.pickle')
# EITHER LOAD FROM PREVIOUS ROUND...
if self.prefer_pickles and os.path.exists(filename):
with open(filename, 'rb') as f:
self.E = pickle.load(f)
# log event
sha1 = self.__hash_file(f)
msg = "{} loaded from {} with SHA-1 of {}"
self.log.info(msg.format('Cumulative LCI', filename, sha1))
# OR BUILD FROM ECOSPOLD DATA...
else:
self.build_E()
# optionally, pickle for further use
if self.save_interm:
with open(filename, 'wb') as f:
pickle.dump(self.E, f)
# log event
sha1 = self.__hash_file(filename)
msg = "{} saved in {} with SHA-1 of {}"
self.log.info(msg.format('Cumulative LCI', filename, sha1))
# =========================================================================
# PARSING METHODS: the hard work with xml files
def extract_products(self):
""" Parses INTERMEDIATEEXCHANGE file to extract core data on products:
Id's, name, unitID, unitName.
Args: None
----
Returns: None
-------
Generates: self.products
----------
Credit:
------
This function incorporates/adapts code from Brightway2data, i.e., the
method extract_technosphere_metadata from class Ecospold2DataExtractor
"""
# The file to parse
fp = os.path.join(self.sys_dir, 'MasterData', self.__INTERMEXCHANGE)
assert os.path.exists(fp), "Can't find " + self.__INTERMEXCHANGE
def extract_metadata(o):
""" Subfunction to get the data from lxml root object """
# Get list of id, name, unitId, and unitName for all intermediate
# exchanges
return {'productName': o.name.text,
'unitName': o.unitName.text,
'productId': o.get('id'),
'unitId': o.get('unitId')}
# Parse XML file
with open(fp, 'r', encoding="utf-8") as fh:
root = objectify.parse(fh).getroot()
pro_list = [extract_metadata(ds) for ds in root.iterchildren()]
# Convert this list into a dataFrame
self.products = pd.DataFrame(pro_list)
self.products.index = self.products['productId']
# Log event
sha1 = self.__hash_file(fp)
msg = "Products extracted from {} with SHA-1 of {}"
self.log.info(msg.format(self.__INTERMEXCHANGE, sha1))
def extract_activities(self):
""" Parses ACTIVITYINDEX file to extract core data on activities:
Id's, activity type, startDate, endDate
Args: None
----
Returns: None
--------
Generates: self.activities
---------
"""
# Parse XML file describing activities
activity_file = os.path.join(self.sys_dir,
'MasterData',
self.__ACTIVITYINDEX)
root = ET.parse(activity_file).getroot()
# Get list of activities and their core attributes
act_list = []
for act in root:
act_list.append([act.attrib['id'],
act.attrib['activityNameId'],
act.attrib['specialActivityType'],
act.attrib['startDate'],
act.attrib['endDate']])
# Remove any potential duplicates
act_list, _, _, _ = self.__deduplicate(act_list, 0, 'activity_list')
# Convert to dataFrame
self.activities = pd.DataFrame(act_list,
columns=('activityId',
'activityNameId',
'activityType',
'startDate',
'endDate'),
index=[row[0] for row in act_list])
self.activities['activityType'
] = self.activities['activityType'].astype(int)
# Log event
sha1 = self.__hash_file(activity_file)
msg = "{} extracted from {} with SHA-1 of {}"
self.log.info(msg.format('Activities', self.__ACTIVITYINDEX, sha1))
def extract_flows(self):
""" Extracts of all intermediate and elementary flows
Args: None
----
Returns: None
-------
Generates:
----------
self.inflows: normalized product (intermediate) inputs
self.elementary_flows: normalized elementary flows
self.outflows: normalized product (intermediate) outputs
"""
# Initialize empty lists
inflow_list = []
outflow_list = []
elementary_flows = []
# Get list of ecoSpold files to process
data_folder = os.path.join(self.sys_dir, 'datasets')
spold_files = glob.glob(os.path.join(data_folder, '*.spold'))
# Log event
self.log.info('Processing {} files in {}'.format(len(spold_files),
data_folder))
# ONE FILE AT A TIME
for sfile in spold_files:
# Get activityId from file name
current_file = os.path.basename(sfile)
current_id = os.path.splitext(current_file)[0]
# For each file, find flow data
root = ET.parse(sfile).getroot()
child_ds = root.find(self.__PRE + 'childActivityDataset')
if child_ds is None:
child_ds = root.find(self.__PRE + 'activityDataset')
flow_ds = child_ds.find(self.__PRE + 'flowData')
# GO THROUGH EACH FLOW IN TURN
for entry in flow_ds:
# Get magnitude of flow
try:
_amount = float(entry.attrib.get('amount'))
except (TypeError, ValueError):
# Get ID of failed amount
_fail_id = entry.attrib.get('elementaryExchangeId',
'not found')
if _fail_id == 'not found':
_fail_id = entry.attrib.get('intermediateExchangeId',
'not found')
# Log failure, quoting the raw attribute since no float was parsed
self.log.warning("Parser warning: flow in {0} cannot be"
" converted to float. Id: {1} - amount:"
" {2}".format(current_file,
_fail_id,
entry.attrib.get('amount')))
continue
if _amount == 0: # Ignore entries of magnitude zero
continue
# GET OBJECT, DESTINATION AND/OR ORIGIN OF EACH FLOW
# ... for elementary flows
if entry.tag == self.__PRE + 'elementaryExchange':
elementary_flows.append([
current_id,
entry.attrib.get('elementaryExchangeId'),
_amount])
elif entry.tag == self.__PRE + 'intermediateExchange':
# ... or product use
if entry.find(self.__PRE + 'inputGroup') is not None:
inflow_list.append([
current_id,
entry.attrib.get('activityLinkId'),
entry.attrib.get('intermediateExchangeId'),
_amount])
# ... or product supply.
elif entry.find(self.__PRE + 'outputGroup') is not None:
outflow_list.append([
current_id,
entry.attrib.get('intermediateExchangeId'),
_amount,
entry.attrib.get('productionVolumeAmount'),
entry.find(self.__PRE + 'outputGroup').text])
# Check for duplicates in outputflows
# there should really only be one output flow per activity
outflow_list, _, _, _ = self.__deduplicate(outflow_list,
0,
'outflow_list')
# CONVERT TO DATAFRAMES
self.inflows = pd.DataFrame(inflow_list, columns=['fileId',
'sourceActivityId',
'productId',
'amount'])
self.elementary_flows = pd.DataFrame(elementary_flows,
columns=['fileId',
'elementaryExchangeId',
'amount'])
out = pd.DataFrame(outflow_list,
columns=['fileId',
'productId',
'amount',
'productionVolume',
'outputGroup'],
index=[row[0] for row in outflow_list])
out['productionVolume'] = out['productionVolume'].astype(float)
out['outputGroup'] = out['outputGroup'].astype(int)
self.outflows = out
def build_STR(self):
""" Parses ElementaryExchanges.xml to builds stressor labels
Args: None
----
Behaviour influenced by:
------------------------
* self.STR_order: Determines how labels are ordered
Returns: None
-------
Generates: self.STR: DataFrame with stressor Id's for index
----------
Credit:
-------
This function incorporates/adapts code from Brightway2data, that is,
the classmethod extract_biosphere_metadata from Ecospold2DataExtractor
"""
# File to parse
fp = os.path.join(self.sys_dir, 'MasterData', self.__ELEXCHANGE)
assert os.path.exists(fp), "Can't find ElementaryExchanges.xml"
def extract_metadata(o):
""" Subfunction to extract data from lxml root object """
return {
'id': o.get('id'),
'name': o.name.text,
'unit': o.unitName.text,
'cas': o.get('casNumber'),
'comp': o.compartment.compartment.text,
'subcomp': o.compartment.subcompartment.text
}
# Extract data from file
with open(fp, 'r', encoding="utf-8") as fh:
root = objectify.parse(fh).getroot()
el_list = [extract_metadata(ds) for ds in root.iterchildren()]
# organize in pandas DataFrame
STR = pd.DataFrame(el_list)
STR.index = STR['id']
STR = STR.reindex(columns=['name',
'unit',
'cas',
'comp',
'subcomp'])
self.STR = STR.sort_values(by=self.STR_order)
# Log event
sha1 = self.__hash_file(fp)
msg = "{} extracted from {} with SHA-1 of {}"
self.log.info(msg.format('Elementary flows', self.__ELEXCHANGE, sha1))
def build_PRO(self):
""" Builds minimalistic intermediate exchange process labels
This functions parses all files in dataset folder. The list is
returned as pandas DataFrame. The index of the DataFrame is the
filename of the files in the DATASET folder.
Args: None
----
Behaviour influenced by:
------------------------
* self.PRO_order: Determines how labels are ordered
Returns: None
-------
Generates: self.PRO: DataFrame with file_Id's for index
----------
"""