-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
114 lines (95 loc) · 3.32 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 02:21:31 2017
@author: shrey
"""
import calendar
import collections
from itertools import chain
from time import strptime

try:
    from collections.abc import Iterable
except ImportError:  # Python 2 has no collections.abc
    from collections import Iterable

from lxml import etree
from six import string_types
def read_xml(path):
    """
    Parse XML from a path / file-like object, falling back to a raw XML string.

    Parameters
    ----------
    path: object
        first handed to ``etree.parse``; if that fails, the same value is
        handed to ``etree.fromstring``.

    Returns
    -------
    tree:
        whatever ``etree.parse`` returned, or, on fallback, whatever
        ``etree.fromstring`` returned.
        NOTE(review): these two calls presumably return different lxml types
        (a tree vs. its root element) -- confirm before relying on either.

    Raises
    ------
    Exception
        the error from ``etree.fromstring`` is re-raised (after printing a
        message) when both parsing attempts fail.
    """
    try:
        return etree.parse(path)
    except Exception:
        # Deliberately broad: any parse/IO failure triggers the string
        # fallback. `Exception` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate instead of being retried as XML text.
        pass

    try:
        return etree.fromstring(path)
    except Exception:
        print("Error: it was not able to read a path, a file-like object, or a string as an XML")
        raise
def stringify_children(node):
    """
    Concatenate the text directly attached to a node and its direct children.

    The pieces, in order, are: the node's own text, then each direct child's
    text followed by that child's tail, and finally the node's own tail.
    Missing (None) or empty pieces are dropped and the rest are joined with
    no separator. Text nested deeper than one level is not visited.

    ref: http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml

    Parameters
    ----------
    node: element
        an element exposing ``text``, ``tail`` and iteration over children.

    Returns
    -------
    str
        the joined text ('' when there is nothing to join).
    """
    parts = [node.text]
    # Iterate the element directly; getchildren() is deprecated.
    for child in node:
        parts.append(child.text)
        parts.append(child.tail)
    parts.append(node.tail)
    return ''.join(filter(None, parts))
def stringify_affiliation(node):
    """
    Build a space-separated string from a node's direct text pieces,
    leaving out the text of 'label' and 'sup' children.

    The pieces, in order, are: the node's own text, then for each direct
    child its text (skipped when the child's tag is 'label' or 'sup')
    followed by its tail (always kept), and finally the node's own tail.
    Missing (None) or empty pieces are dropped; the rest are joined with a
    single space, so pieces that already carry whitespace keep it.

    ref: http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml

    Parameters
    ----------
    node: element
        an element exposing ``text``, ``tail``, and children with ``tag``.

    Returns
    -------
    str
        the joined text ('' when there is nothing to join).
    """
    skipped_tags = ('label', 'sup')
    parts = [node.text]
    # Iterate the element directly; getchildren() is deprecated.
    for child in node:
        if child.tag not in skipped_tags:
            parts.append(child.text)
        parts.append(child.tail)
    parts.append(node.tail)
    return ' '.join(filter(None, parts))
def stringify_affiliation_rec(node):
    """
    Turn a node and all of its descendants into one space-joined string.

    The nested list of text pieces produced by ``_recur_children`` is
    flattened with ``_flatten``, joined with single spaces, and stripped of
    leading/trailing whitespace.

    ref: http://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists-in-python
    """
    nested_pieces = _recur_children(node)
    flat_pieces = [piece for piece in _flatten(nested_pieces)]
    joined = ' '.join(flat_pieces)
    return joined.strip()
def _flatten(l):
"""
Flatten list into one dimensional
"""
for el in l:
if isinstance(el, collections.Iterable) and not isinstance(el, string_types):
for sub in _flatten(el):
yield sub
else:
yield el
def _recur_children(node):
"""
Recursive through node to when it has multiple children
"""
if len(node.getchildren()) == 0:
parts = ([node.text or ''] + [node.tail or '']) if (node.tag != 'label' and node.tag !='sup') else ([node.tail or ''])
return parts
else:
parts = ([node.text or ''] +
[_recur_children(c) for c in node.getchildren()] +
[node.tail or ''])
return parts
def month_or_day_formater(month_or_day):
    """
    Convert a month abbreviation or a day number to a zero-padded string.

    Parameters
    ----------
    month_or_day: str or int
        must be one of the following:
            (i)  month: a three letter month abbreviation, e.g., 'Jan'
                 (dots are ignored, so 'Jan.' also works). Matching is
                 case-sensitive against ``calendar.month_abbr``, which
                 follows the current locale.
            (ii) day: an integer, or a string of digits.
        Surrounding whitespace is ignored.

    Returns
    -------
    numeric: str
        a month of the form 'MM' or a day of the form 'DD'.
        Note: returns None if:
            (a) the input could not be mapped to a known month abbreviation OR
            (b) the input was not an integer (i.e., a day), OR
            (c) the input was None.
    """
    if month_or_day is None:
        return None
    if isinstance(month_or_day, int):
        # Integer days are part of the documented contract, but the string
        # methods used below only exist on text.
        month_or_day = str(month_or_day)

    text = month_or_day.strip()
    abbreviation = text.replace(".", "")
    # month_abbr[0] is '' so that list positions equal month numbers.
    months = list(calendar.month_abbr)

    if abbreviation and abbreviation in months:
        number = months.index(abbreviation)
    elif text.isdigit():
        try:
            number = int(text)
        except ValueError:
            # Some characters (e.g. superscript digits) satisfy isdigit()
            # but are not accepted by int().
            return None
    else:
        return None

    # Pad to at least two digits; larger numbers are left untouched.
    return "{:02d}".format(number)