Skip to content

Commit

Permalink
Merge pull request #245 from aaccomazzi/master
Browse files Browse the repository at this point in the history
implemented checking of canonical bibcodes in canonicalize_records()
  • Loading branch information
spacemansteve authored Feb 24, 2020
2 parents 38bf42d + 3bebfeb commit 02f9e79
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 39 deletions.
2 changes: 0 additions & 2 deletions aip/classic/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
#!/usr/bin/env python
import sys
sys.path.append('/proj/ads/soft/python/lib/site-packages')


import ads
from ads.Looker import Looker

Expand Down
19 changes: 13 additions & 6 deletions aip/classic/read_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@
try:
from ads.ADSCachedExports import ADSRecords, init_lookers_cache
from ads.ADSCachedExports import LOGGER as export_logger
from aip.classic import conversions
from .conversions import ConvertBibcodes
except ImportError:
sys.path.append('/proj/ads/soft/python/lib/site-packages') #TODO: make it configurable
try:
from ads.ADSCachedExports import ADSRecords, init_lookers_cache
from ads.ADSCachedExports import LOGGER as export_logger
from aip.classic import conversions
from .conversions import ConvertBibcodes
INIT_LOOKERS_CACHE = init_lookers_cache
except ImportError:
print "Unable to import ads.ADSExports.ADSRecords!"
Expand All @@ -37,13 +37,18 @@
logger = utils.setup_logging('read_records')


def canonicalize_records(records, targets=None, ignore_fingerprints=False):
def canonicalize_records(records, targets=None, ignore_fingerprints=False, force_canonical=False):
'''
Takes a dict of {bibcode:fingerprint} and resolves each bibcode to its canonical.
Finds all alternates associated with that bibcode and constructs the full JSON_fingerprint
from all of these associated records
If force_canonical is set to True, check that the returned list of results contains bibcodes
which are present in the original records array; this ensures that every input record will
be processed and avoids a situation in which a mapping of a bibcode to its canonical form
produces an unknown bibcode further downstream (due to premature index mapping). [AA 2/18/20]
Note: Pops from the input dict with no attempt to copy/deepcopy it.
'''

Expand All @@ -53,14 +58,16 @@ def canonicalize_records(records, targets=None, ignore_fingerprints=False):

if not targets:
targets = records
Converter = conversions.ConvertBibcodes()
Converter = ConvertBibcodes()
for bibcode,fingerprint in targets.iteritems():
fingerprints = [fingerprint] #Start constructing the "full" fingerprint
#Check if there is a canonical
canonical=Converter.Canonicalize([bibcode])[0]
canonical = Converter.Canonicalize([bibcode])[0]
#And make sure that there is a canonical record if check_canonical, otherwise keep this bibcode
if force_canonical is False and canonical not in records:
canonical = bibcode
#If we are operating on the canonical, aggregate all of its alternates to form the "full" fingerprint
if canonical == bibcode:
# TODO(rca): decide what to do with canonical != bibcode
if ignore_fingerprints:
results.append((canonical, 'ignore'))
else:
Expand Down
10 changes: 9 additions & 1 deletion run.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,12 @@ def main(*args):
help='ignore json fingerprints when finding new records to update (ie, force update)'
)

parser.add_argument('--force-canonical', default=False,
action='store_true',
dest='force_canonical',
help='only process records with a valid canonical bibcode in input list'
)

parser.add_argument('--process-deletions', default=False,
action='store_true', dest='process_deletions',
help='Find orphaned bibcodes in the storage, then send these bibcodes to delete via rabbitMQ. No updates will be processed with this flag is set.'
Expand Down Expand Up @@ -296,7 +302,9 @@ def main(*args):
return

# TODO(rca): getAlternates is called multiple times unnecessarily
records = read_records.canonicalize_records(records, targets or records, ignore_fingerprints=args.ignore_json_fingerprints)
records = read_records.canonicalize_records(records, targets or records,
ignore_fingerprints=args.ignore_json_fingerprints,
force_canonical=args.force_canonical)
logger.info('Canonicalize %s records', len(records))

if args.replay_deletions:
Expand Down
133 changes: 103 additions & 30 deletions tests/classic/test_read_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,43 +13,116 @@
ADSRecords = None
print "Warning: Fallback to explicit path declaration for import"

from aip.classic import read_records
# Fake bibcode -> canonical-bibcode mapping used to mock the
# canonicalization of records in the tests below.  Entries that map to
# themselves are already canonical.
CANONICALDICT = {
    '2014arXiv1401.2993T': '2014MNRAS.439.1884T',
    '2014MNRAS.439.1884T': '2014MNRAS.439.1884T',

    '2013arXiv1306.3186H': '2013MNRAS.434.1889H',
    '2013MNRAS.434.1889H': '2013MNRAS.434.1889H',

    '1978Natur.275..624M': '1978Natur.275..624M',

    '1988ESASP.281b.287G': '1988ESASP.281b.287G',
    '1988IUE88...2..287G': '1988ESASP.281b.287G',
    '1988IUES....1..287G': '1988ESASP.281b.287G',
    '1988uvai....2..287G': '1988ESASP.281b.287G',

    '2014PhRvD..90d4013F': '2014PhRvD..90d4013F',
    '2013arXiv1311.6899F': '2014PhRvD..90d4013F',

    # maps to a canonical bibcode that is NOT present in RECORDS below;
    # this is the case the strict (force_canonical=True) mode must drop
    '2020slow.bibcode...': '2020fake.canonical.'
}

# a sample of real-world records (except for the last one) with fake
# fingerprints; keys are bibcodes, values are JSON fingerprints
RECORDS = OrderedDict([
    ('2014arXiv1401.2993T','b'), #This is an alternate of the record with fingerprint 'f'
    ('2014MNRAS.439.1884T','f'), #This is the canonical of 'b'

    ('2013MNRAS.434.1889H','d'), #This is the canonical of 'g'
    ('2013arXiv1306.3186H','g'), #This is the alternate of 'd'

    ('1978Natur.275..624M','c'), #No alternates, already canonical

    ('1988ESASP.281b.287G','x1'), #Canonical, the following are alternates
    ('1988IUE88...2..287G','a1'),
    ('1988IUES....1..287G','a2'),
    ('1988uvai....2..287G','a3'),

    ('2014PhRvD..90d4013F','h'), #This is the canonical of 'k'
    ('2013arXiv1311.6899F','k'), #This is the alternate of 'h'

    ('2020slow.bibcode...','fake') #Its canonical is unknown to RECORDS (see CANONICALDICT)
])

# this is what read_records.canonicalize_records() should return for the
# records above when run in strict (force_canonical=True) mode: each
# canonical bibcode paired with the ';'-joined fingerprints of itself and
# all of its alternates (sorted alphabetically in the values below)
EXPECTED_STRICT = [
    ('2014MNRAS.439.1884T', 'b;f'),
    ('2013MNRAS.434.1889H', 'd;g'),
    ('1978Natur.275..624M', 'c'),
    ('1988ESASP.281b.287G','a1;a2;a3;x1'),
    ('2014PhRvD..90d4013F','h;k'),
]

# results when not run in strict mode: the record without a known
# canonical bibcode is kept as-is instead of being dropped
EXPECTED = EXPECTED_STRICT + [('2020slow.bibcode...','fake')]


class TestCanonical(unittest.TestCase):
    """Exercises read_records.canonicalize_records() against the fake
    mappings in CANONICALDICT, with the real bibcode-conversion class
    replaced by a mock so that no ADS backend is required."""

    # here we mock the entire conversion class so that we can use
    # the fake canonical mappings in CANONICALDICT; it mirrors the
    # interface of aip.classic.conversions.ConvertBibcodes
    class mock_ConvertBibcodes(object):

        def __init__(self):
            # create inverse mapping: canonical bibcode -> list of its
            # alternate bibcodes (empty keys and self-mappings skipped)
            self.altdict = dict()
            _ = [ (v,k) for (k,v) in CANONICALDICT.items() ]
            for (k,v) in _:
                if not k: continue
                if k == v: continue
                self.altdict.setdefault(k, [])
                self.altdict[k].append(v)

        def Canonicalize(self, biblist, remove_matches=False):
            # map every input bibcode to its canonical form; bibcodes
            # missing from CANONICALDICT pass through unchanged.
            # NOTE: the set() below deduplicates but loses ordering --
            # canonicalize_records only calls this with single-element
            # lists, so the order does not matter there
            newlist = []
            for bibcode in biblist:
                res = CANONICALDICT.get(bibcode)
                if res:
                    bibcode = res
                newlist.append(bibcode)
            return list(set(newlist))

        def getAlternates(self, bibcode):
            # alternates recorded for a canonical bibcode ([] if none)
            return self.altdict.get(bibcode, [])

    def test_getalternates(self):
        # verify both the lenient (default) and the strict
        # (force_canonical=True) behavior of canonicalize_records()
        from aip.classic import read_records
        import copy
        # ensure read_records resolves ConvertBibcodes to our mock when
        # the real ads export library could not be imported
        if not hasattr(read_records, 'ConvertBibcodes'):
            read_records.ConvertBibcodes = self.mock_ConvertBibcodes
        # here we have to prevent the import of ads and ads.Looker needed by aip.classic.conversions
        # so we can mock the conversion of bibcodes. Ugly as hell but it works
        with mock.patch.dict(sys.modules, { 'ads': mock.Mock(), 'ads.Looker': mock.Mock() } ), \
            mock.patch('aip.classic.conversions.ConvertBibcodes', return_value=self.mock_ConvertBibcodes):
            # deep-copy the fixture: canonicalize_records pops entries
            # from the dict it is given (documented in read_records)
            records = copy.deepcopy(RECORDS)
            results = read_records.canonicalize_records(records)
            self.assertEqual(results, EXPECTED)
            records = copy.deepcopy(RECORDS)
            results = read_records.canonicalize_records(records, force_canonical=True)
            self.assertEqual(results, EXPECTED_STRICT)


class TestADSExports(unittest.TestCase):

@unittest.skipIf(not ADSRecords, "ads.ADSCachedExports not available")
def test_canonicalize_records(self):
from aip.classic import read_records

records = OrderedDict([
('2014arXiv1401.2993T','b'), #This is an alternate to 'f'
('2014MNRAS.439.1884T','f'), #This is the canonical of 'b'

('2013MNRAS.434.1889H','d'), #This is the canonical of 'g'
('2013arXiv1306.3186H','g'), #This is the alternate of 'd'

('1978Natur.275..624M','c'), #No alternates, already canonical

('1988ESASP.281b.287G','x1'), #Canonical, the following are alternates
('1988IUE88...2..287G','a1'),
('1988IUES....1..287G','a2'),
('1988uvai....2..287G','a3'),

('2014PhRvD..90d4013F','h'), #This is the canonical of 'h'
('2013arXiv1311.6899F','k'), #This it the alternate of 'k'
])
expected = [
('2014MNRAS.439.1884T', 'b;f'),
('2013MNRAS.434.1889H', 'd;g'),
('1978Natur.275..624M', 'c'),
('1988ESASP.281b.287G','a1;a2;a3;x1'),
('2014PhRvD..90d4013F','h;k'),
]

results = read_records.canonicalize_records(OrderedDict((k,v) for k,v in records.iteritems()))
self.assertEqual(results, expected)

results = read_records.canonicalize_records(RECORDS)
self.assertEqual(results, EXPECTED)

def test_readRecordsFromADSExports(self):
from aip.classic import read_records
if not hasattr(read_records, 'ADSRecords'):
read_records.ADSRecords = {}

Expand Down

0 comments on commit 02f9e79

Please sign in to comment.