From 715079e2529b0a9554f4eede1d1cf01cfb8847c5 Mon Sep 17 00:00:00 2001
From: Alberto Accomazzi
Date: Thu, 11 Aug 2022 15:17:02 -0400
Subject: [PATCH 1/2] unique DOIs in the SOLR document

---
 aip/classic/solr_adapter.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/aip/classic/solr_adapter.py b/aip/classic/solr_adapter.py
index df12f49..db02437 100644
--- a/aip/classic/solr_adapter.py
+++ b/aip/classic/solr_adapter.py
@@ -340,7 +340,7 @@ def _doctype_facet_hier(ADS_record):
     @staticmethod
     def _doi(ADS_record):
         result = [i['content'] for i in ADS_record['metadata']['general'].get('doi', [])]
-        return {'doi': result}
+        return {'doi': case_insensitive_unique_list(result)}
 
     @staticmethod
     def _eid(ADS_record):
@@ -580,6 +580,17 @@ def validate(cls, solr_record):
                 assert len(set([type(i) for i in v])) == 1, "{0}: multiple data-types in list: {1}".format(k, v)
                 assert isinstance(v[0], type(SCHEMA[k][0])), "{0}: inner list element has unexpected type ({1}!={2}): {3}".format(k, type(v[0]), type(SCHEMA[k][0]), v)
 
+def case_insensitive_unique_list(array):
+    """
+    Returns the list of unique elements in the input array
+    in a case-insensitive way, preserving order and case
+    """
+    seen, result = set(), []
+    for item in array:
+        if item.lower() not in seen:
+            seen.add(item.lower())
+            result.append(item)
+    return result
 
 def unroll_unique_list(array):
     """

From 7b84ef8ffcf3b9d3d989e665bf42d197044f573f Mon Sep 17 00:00:00 2001
From: Alberto Accomazzi
Date: Tue, 16 Aug 2022 13:21:42 -0400
Subject: [PATCH 2/2] fix bug with merging of metadata blocks with multiple
 origins

---
 aip/classic/merger.py        |  2 +-
 tests/classic/test_merger.py | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/aip/classic/merger.py b/aip/classic/merger.py
index 41068ae..ad535bb 100644
--- a/aip/classic/merger.py
+++ b/aip/classic/merger.py
@@ -275,7 +275,7 @@ def _getOriginPriority(self,f,field):
         origins = f[1]['origin'].split('; ')
         o = origins.pop()
         for i in origins:
-            o = i if p.get(i.upper(),0) >= p.get(o1.upper(),0) else o
+            o = i if p.get(i.upper(),0) >= p.get(o.upper(),0) else o
         # if origin not defined, default to 'PUBLISHER'
         P = p.get(o.upper(),p.get('PUBLISHER',0))
         return P
diff --git a/tests/classic/test_merger.py b/tests/classic/test_merger.py
index 337b11d..dfec66f 100644
--- a/tests/classic/test_merger.py
+++ b/tests/classic/test_merger.py
@@ -80,11 +80,13 @@ def test_doiMerger(self):
         B1 = {'tempdata':{'origin':'PUBLISHER','type':'general'}}
         B2 = {'tempdata':{'origin':'ARXIV','type':'general'}}
         B3 = {'tempdata':{'origin':'SIMBAD','type':'general'}}
+        B4 = {'tempdata':{'origin':'JST; CROSSREF','type':'general'}}
 
         # 2022NatAs...6..331D
         B1['doi'] = [ '10.1038/s41550-021-01558-y' ]
         B2['doi'] = [ '10.48550/arXiv.2201.05617' ]
         B3['doi'] = [ '10.1038/s41550-021-01558-y' ]
+        B4['doi'] = [ '10.1038/S41550-021-01558-Y' ] # note different case
 
         blocks = [B1,B2]
         m = merger.Merger(blocks)
@@ -107,6 +109,14 @@
         expectedResults = { 'doi': [ '10.1038/s41550-021-01558-y', '10.48550/arXiv.2201.05617' ], 'altpublications': [] }
         self.assertEqual(results,expectedResults)
 
+        blocks = [B4,B2,B1,B3]
+        m = merger.Merger(blocks)
+        m.merge()
+        results = m.block
+        # the list gets eventually uniqued in solr_adapter
+        expectedResults = { 'doi': [ '10.1038/S41550-021-01558-Y', '10.1038/s41550-021-01558-y', '10.48550/arXiv.2201.05617' ], 'altpublications': [] }
+        self.assertEqual(results,expectedResults)
+
 
     def test_want_datetime(self):
         m = merger.Merger([])
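
Note (illustration only, not part of either patch): a minimal standalone sketch of how the two changes fit together. The helper body is copied from patch 1 and the DOI values from the new test block in patch 2; the merger keeps both case variants of the Nature DOI, and the SOLR adapter then collapses them case-insensitively, preserving the first spelling it sees.

    # helper added to aip/classic/solr_adapter.py by patch 1
    def case_insensitive_unique_list(array):
        seen, result = set(), []
        for item in array:
            if item.lower() not in seen:
                seen.add(item.lower())
                result.append(item)
        return result

    # merged 'doi' list expected by the new test_doiMerger assertion in patch 2
    merged_dois = ['10.1038/S41550-021-01558-Y',
                   '10.1038/s41550-021-01558-y',
                   '10.48550/arXiv.2201.05617']
    print(case_insensitive_unique_list(merged_dois))
    # -> ['10.1038/S41550-021-01558-Y', '10.48550/arXiv.2201.05617']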