Skip to content

Commit

Permalink
Merge pull request #245 from aaccomazzi/master
Browse files Browse the repository at this point in the history
implemented checking of canonical bibcodes in canonicalize_records()
  • Loading branch information
spacemansteve authored Feb 24, 2020
2 parents 38bf42d + 3bebfeb commit 02f9e79
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 39 deletions.
2 changes: 0 additions & 2 deletions aip/classic/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
#!/usr/bin/env python
import sys
sys.path.append('/proj/ads/soft/python/lib/site-packages')


import ads
from ads.Looker import Looker

Expand Down
19 changes: 13 additions & 6 deletions aip/classic/read_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@
try:
from ads.ADSCachedExports import ADSRecords, init_lookers_cache
from ads.ADSCachedExports import LOGGER as export_logger
from aip.classic import conversions
from .conversions import ConvertBibcodes
except ImportError:
sys.path.append('/proj/ads/soft/python/lib/site-packages') #TODO: make it configurable
try:
from ads.ADSCachedExports import ADSRecords, init_lookers_cache
from ads.ADSCachedExports import LOGGER as export_logger
from aip.classic import conversions
from .conversions import ConvertBibcodes
INIT_LOOKERS_CACHE = init_lookers_cache
except ImportError:
print "Unable to import ads.ADSExports.ADSRecords!"
Expand All @@ -37,13 +37,18 @@
logger = utils.setup_logging('read_records')


def canonicalize_records(records, targets=None, ignore_fingerprints=False):
def canonicalize_records(records, targets=None, ignore_fingerprints=False, force_canonical=False):
'''
Takes a dict of {bibcode:fingerprint} and resolves each bibcode to its canonical.
Finds all alternates associated with that bibcode and constructs the full JSON_fingerprint
from all of these associated records
If force_canonical is set to True, check that the returned list of results contains bibcodes
which are present in the original records array; this ensures that every input record will
be processed and avoids a situation in which a mapping of a bibcode to its canonical form
produces an unknown bibcode further downstream (due to premature index mapping). [AA 2/18/20]
Note: Pops from the input dict with no attempt to copy/deepcopy it.
'''

Expand All @@ -53,14 +58,16 @@ def canonicalize_records(records, targets=None, ignore_fingerprints=False):

if not targets:
targets = records
Converter = conversions.ConvertBibcodes()
Converter = ConvertBibcodes()
for bibcode,fingerprint in targets.iteritems():
fingerprints = [fingerprint] #Start constructing the "full" fingerprint
#Check if there is a canonical
canonical=Converter.Canonicalize([bibcode])[0]
canonical = Converter.Canonicalize([bibcode])[0]
#And make sure that there is a canonical record if check_canonical, otherwise keep this bibcode
if force_canonical is False and canonical not in records:
canonical = bibcode
#If we are operating on the canonical, aggregate all of its alternates to form the "full" fingerprint
if canonical == bibcode:
# TODO(rca): decide what to do with canonical != bibcode
if ignore_fingerprints:
results.append((canonical, 'ignore'))
else:
Expand Down
10 changes: 9 additions & 1 deletion run.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,12 @@ def main(*args):
help='ignore json fingerprints when finding new records to update (ie, force update)'
)

parser.add_argument('--force-canonical', default=False,
action='store_true',
dest='force_canonical',
help='only process records with a valid canonical bibcode in input list'
)

parser.add_argument('--process-deletions', default=False,
action='store_true', dest='process_deletions',
help='Find orphaned bibcodes in the storage, then send these bibcodes to delete via rabbitMQ. No updates will be processed with this flag is set.'
Expand Down Expand Up @@ -296,7 +302,9 @@ def main(*args):
return

# TODO(rca): getAlternates is called multiple times unnecessarily
records = read_records.canonicalize_records(records, targets or records, ignore_fingerprints=args.ignore_json_fingerprints)
records = read_records.canonicalize_records(records, targets or records,
ignore_fingerprints=args.ignore_json_fingerprints,
force_canonical=args.force_canonical)
logger.info('Canonicalize %s records', len(records))

if args.replay_deletions:
Expand Down
133 changes: 103 additions & 30 deletions tests/classic/test_read_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,43 +13,116 @@
ADSRecords = None
print "Warning: Fallback to explicit path declaration for import"

from aip.classic import read_records
# Fake bibcode -> canonical-bibcode mapping used to mock the
# canonicalization of records in the tests below.  Entries that map to
# themselves are already canonical.
CANONICALDICT = {
    '2014arXiv1401.2993T': '2014MNRAS.439.1884T',
    '2014MNRAS.439.1884T': '2014MNRAS.439.1884T',

    '2013arXiv1306.3186H': '2013MNRAS.434.1889H',
    '2013MNRAS.434.1889H': '2013MNRAS.434.1889H',

    '1978Natur.275..624M': '1978Natur.275..624M',

    '1988ESASP.281b.287G': '1988ESASP.281b.287G',
    '1988IUE88...2..287G': '1988ESASP.281b.287G',
    '1988IUES....1..287G': '1988ESASP.281b.287G',
    '1988uvai....2..287G': '1988ESASP.281b.287G',

    '2014PhRvD..90d4013F': '2014PhRvD..90d4013F',
    '2013arXiv1311.6899F': '2014PhRvD..90d4013F',

    # maps to a canonical bibcode that is NOT present in RECORDS below;
    # this is the case the strict (force_canonical=True) mode must drop
    '2020slow.bibcode...': '2020fake.canonical.'
}

# a sample of real-world records (except for the last one) with fake
# fingerprints; keys are bibcodes, values are JSON fingerprints
RECORDS = OrderedDict([
    ('2014arXiv1401.2993T','b'), #This is an alternate of the record with fingerprint 'f'
    ('2014MNRAS.439.1884T','f'), #This is the canonical of 'b'

    ('2013MNRAS.434.1889H','d'), #This is the canonical of 'g'
    ('2013arXiv1306.3186H','g'), #This is the alternate of 'd'

    ('1978Natur.275..624M','c'), #No alternates, already canonical

    ('1988ESASP.281b.287G','x1'), #Canonical, the following are alternates
    ('1988IUE88...2..287G','a1'),
    ('1988IUES....1..287G','a2'),
    ('1988uvai....2..287G','a3'),

    ('2014PhRvD..90d4013F','h'), #This is the canonical of 'k'
    ('2013arXiv1311.6899F','k'), #This is the alternate of 'h'

    ('2020slow.bibcode...','fake') #Its canonical is unknown to RECORDS (see CANONICALDICT)
])

# this is what read_records.canonicalize_records() should return for the
# records above when run in strict (force_canonical=True) mode: each
# canonical bibcode paired with the ';'-joined fingerprints of itself and
# all of its alternates (sorted alphabetically in the values below)
EXPECTED_STRICT = [
    ('2014MNRAS.439.1884T', 'b;f'),
    ('2013MNRAS.434.1889H', 'd;g'),
    ('1978Natur.275..624M', 'c'),
    ('1988ESASP.281b.287G','a1;a2;a3;x1'),
    ('2014PhRvD..90d4013F','h;k'),
]

# results when not run in strict mode: the record without a known
# canonical bibcode is kept as-is instead of being dropped
EXPECTED = EXPECTED_STRICT + [('2020slow.bibcode...','fake')]


class TestCanonical(unittest.TestCase):
    """Exercises read_records.canonicalize_records() against the fake
    mappings in CANONICALDICT, with the real bibcode-conversion class
    replaced by a mock so that no ADS backend is required."""

    # here we mock the entire conversion class so that we can use
    # the fake canonical mappings in CANONICALDICT; it mirrors the
    # interface of aip.classic.conversions.ConvertBibcodes
    class mock_ConvertBibcodes(object):

        def __init__(self):
            # create inverse mapping: canonical bibcode -> list of its
            # alternate bibcodes (empty keys and self-mappings skipped)
            self.altdict = dict()
            _ = [ (v,k) for (k,v) in CANONICALDICT.items() ]
            for (k,v) in _:
                if not k: continue
                if k == v: continue
                self.altdict.setdefault(k, [])
                self.altdict[k].append(v)

        def Canonicalize(self, biblist, remove_matches=False):
            # map every input bibcode to its canonical form; bibcodes
            # missing from CANONICALDICT pass through unchanged.
            # NOTE: the set() below deduplicates but loses ordering --
            # canonicalize_records only calls this with single-element
            # lists, so the order does not matter there
            newlist = []
            for bibcode in biblist:
                res = CANONICALDICT.get(bibcode)
                if res:
                    bibcode = res
                newlist.append(bibcode)
            return list(set(newlist))

        def getAlternates(self, bibcode):
            # alternates recorded for a canonical bibcode ([] if none)
            return self.altdict.get(bibcode, [])

    def test_getalternates(self):
        # verify both the lenient (default) and the strict
        # (force_canonical=True) behavior of canonicalize_records()
        from aip.classic import read_records
        import copy
        # ensure read_records resolves ConvertBibcodes to our mock when
        # the real ads export library could not be imported
        if not hasattr(read_records, 'ConvertBibcodes'):
            read_records.ConvertBibcodes = self.mock_ConvertBibcodes
        # here we have to prevent the import of ads and ads.Looker needed by aip.classic.conversions
        # so we can mock the conversion of bibcodes. Ugly as hell but it works
        with mock.patch.dict(sys.modules, { 'ads': mock.Mock(), 'ads.Looker': mock.Mock() } ), \
            mock.patch('aip.classic.conversions.ConvertBibcodes', return_value=self.mock_ConvertBibcodes):
            # deep-copy the fixture: canonicalize_records pops entries
            # from the dict it is given (documented in read_records)
            records = copy.deepcopy(RECORDS)
            results = read_records.canonicalize_records(records)
            self.assertEqual(results, EXPECTED)
            records = copy.deepcopy(RECORDS)
            results = read_records.canonicalize_records(records, force_canonical=True)
            self.assertEqual(results, EXPECTED_STRICT)


class TestADSExports(unittest.TestCase):

@unittest.skipIf(not ADSRecords, "ads.ADSCachedExports not available")
def test_canonicalize_records(self):
from aip.classic import read_records

records = OrderedDict([
('2014arXiv1401.2993T','b'), #This is an alternate to 'f'
('2014MNRAS.439.1884T','f'), #This is the canonical of 'b'

('2013MNRAS.434.1889H','d'), #This is the canonical of 'g'
('2013arXiv1306.3186H','g'), #This is the alternate of 'd'

('1978Natur.275..624M','c'), #No alternates, already canonical

('1988ESASP.281b.287G','x1'), #Canonical, the following are alternates
('1988IUE88...2..287G','a1'),
('1988IUES....1..287G','a2'),
('1988uvai....2..287G','a3'),

('2014PhRvD..90d4013F','h'), #This is the canonical of 'h'
('2013arXiv1311.6899F','k'), #This it the alternate of 'k'
])
expected = [
('2014MNRAS.439.1884T', 'b;f'),
('2013MNRAS.434.1889H', 'd;g'),
('1978Natur.275..624M', 'c'),
('1988ESASP.281b.287G','a1;a2;a3;x1'),
('2014PhRvD..90d4013F','h;k'),
]

results = read_records.canonicalize_records(OrderedDict((k,v) for k,v in records.iteritems()))
self.assertEqual(results, expected)

results = read_records.canonicalize_records(RECORDS)
self.assertEqual(results, EXPECTED)

def test_readRecordsFromADSExports(self):
from aip.classic import read_records
if not hasattr(read_records, 'ADSRecords'):
read_records.ADSRecords = {}

Expand Down

0 comments on commit 02f9e79

Please sign in to comment.