From 1aff3d42a25a7c7b5ee0208288446a4ab58dc65a Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Fri, 31 May 2024 16:14:12 -0400 Subject: [PATCH 1/2] Add final set of Datacite field methods Why these changes are being introduced: * Finish refactoring Datacite transform to use field methods How this addresses that need: * Add field methods and associated private methods for notes, publishers, related_items, rights, subjects, and summary * Add unit tests for new field methods Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-284 --- tests/sources/xml/test_datacite.py | 187 +++++++++++++++++++++++++ transmogrifier/sources/xml/datacite.py | 168 +++++++++++++--------- 2 files changed, 287 insertions(+), 68 deletions(-) diff --git a/tests/sources/xml/test_datacite.py b/tests/sources/xml/test_datacite.py index 18242bf..f649d6f 100644 --- a/tests/sources/xml/test_datacite.py +++ b/tests/sources/xml/test_datacite.py @@ -757,6 +757,193 @@ def test_get_locations_transforms_correctly_if_fields_missing(): assert Datacite.get_locations(source_record) is None +def test_get_notes_success(): + source_record = create_datacite_source_record_stub( + """ + Survey Data + + Stata, 13 + + """ + ) + assert Datacite.get_notes(source_record) == [ + Note(value=["Survey Data"], kind="Datacite resource type"), + Note(value=["Stata, 13"], kind="TechnicalInfo"), + ] + + +def test_get_notes_transforms_correctly_if_fields_blank(): + source_record = create_datacite_source_record_stub( + "" + ) + assert Datacite.get_notes(source_record) is None + + +def test_get_notes_transforms_correctly_if_fields_missing(): + source_record = create_datacite_source_record_stub() + assert Datacite.get_notes(source_record) is None + + +def test_get_publishers_success(): + source_record = create_datacite_source_record_stub( + "Harvard Dataverse" + ) + assert Datacite.get_publishers(source_record) == [Publisher(name="Harvard Dataverse")] + + +def test_get_publishers_transforms_correctly_if_fields_blank(): + source_record = create_datacite_source_record_stub("") + assert Datacite.get_publishers(source_record) is None + + +def test_get_publishers_transforms_correctly_if_fields_missing(): + source_record = create_datacite_source_record_stub() + assert Datacite.get_publishers(source_record) is None + + +def test_get_related_items_success(): + metadata_insert = ( + '10.1257/app.20150390' + '10.5281/zenodo.5524464' + '1234567.5524464' + "https://zenodo.org/communities/astronomy-general" + "" + ) + source_record = create_datacite_source_record_stub(metadata_insert) + assert Datacite.get_related_items(source_record) == [ + RelatedItem(relationship="IsCitedBy", uri="https://doi.org/10.1257/app.20150390"), + RelatedItem(relationship="IsVersionOf", uri="10.5281/zenodo.5524464"), + RelatedItem(relationship="Other", uri="1234567.5524464"), + RelatedItem( + relationship="IsPartOf", + uri="https://zenodo.org/communities/astronomy-general", + ), + ] + + +def test_get_related_items_transforms_correctly_if_fields_blank(): + source_record = create_datacite_source_record_stub( + "" + ) + assert Datacite.get_related_items(source_record) is None + + +def test_get_related_items_transforms_correctly_if_fields_missing(): + source_record = create_datacite_source_record_stub() + assert Datacite.get_related_items(source_record) is None + + +def test_get_rights_success(): + source_record = create_datacite_source_record_stub( + """ + + + CC0 1.0 + + """ + ) + assert Datacite.get_rights(source_record) == [ + Rights(description=None, kind=None, uri="info:eu-repo/semantics/openAccess"), + Rights( + description="CC0 1.0", + kind=None, + uri="http://creativecommons.org/publicdomain/zero/1.0", + ), + ] + + +def test_get_rights_transforms_correctly_if_fields_blank(): + source_record = create_datacite_source_record_stub( + "" + ) + assert Datacite.get_rights(source_record) is None + + +def test_get_rights_transforms_correctly_if_fields_missing(): + source_record = create_datacite_source_record_stub() + assert Datacite.get_rights(source_record) is None + + +def test_get_subjects_success(): + source_record = create_datacite_source_record_stub( + """ + + Social Sciences + Educational materials + Adult education, education inputs, field experiments + Education + + """ + ) + assert Datacite.get_subjects(source_record) == [ + Subject( + value=["Social Sciences", "Educational materials"], + kind="Subject scheme not provided", + ), + Subject( + value=[ + "Adult education, education inputs, field experiments", + "Education", + ], + kind="LCSH", + ), + ] + + +def test_get_subjects_transforms_correctly_if_fields_blank(): + source_record = create_datacite_source_record_stub("") + assert Datacite.get_subjects(source_record) is None + + +def test_get_subjects_transforms_correctly_if_fields_missing(): + source_record = create_datacite_source_record_stub() + assert Datacite.get_subjects(source_record) is None + + +def test_get_summary_success(): + metadata_insert = ( + 'Using a ' + "randomized field experiment in India, we evaluate the effectiveness of adult " + "literacy and parental involvement interventions in improving children's " + "learning. Households were assigned to receive either adult literacy (language " + "and math) classes for mothers, training for mothers on how to enhance their " + "children's learning at home, or a combination of the two programs. All three " + "interventions had significant but modest impacts on childrens math scores. The " + "interventions also increased mothers' test scores in both language and math, as " + "well as a range of other outcomes reflecting greater involvement of mothers in " + "their children's education." + ) + source_record = create_datacite_source_record_stub(metadata_insert) + assert Datacite.get_summary(source_record) == [ + "Using a randomized field experiment in India, we evaluate the effectiveness " + "of adult literacy and parental involvement interventions in improving " + "children's learning. Households were assigned to receive either adult " + "literacy (language and math) classes for mothers, training for mothers on " + "how to enhance their children's learning at home, or a combination of the " + "two programs. All three interventions had significant but modest impacts on " + "childrens math scores. The interventions also increased mothers' test scores" + " in both language and math, as well as a range of other outcomes reflecting " + "greater involvement of mothers in their children's education." + ] + + +def test_get_summarty_transforms_correctly_if_fields_blank(): + source_record = create_datacite_source_record_stub( + '' + ) + assert Datacite.get_summary(source_record) is None + + +def test_get_summary_transforms_correctly_if_fields_missing(): + source_record = create_datacite_source_record_stub() + assert Datacite.get_summary(source_record) is None + + def test_generate_name_identifier_url_orcid_scheme(datacite_record_all_fields): assert next(datacite_record_all_fields).contributors[0].identifier == [ "https://orcid.org/0000-0000-0000-0000" diff --git a/transmogrifier/sources/xml/datacite.py b/transmogrifier/sources/xml/datacite.py index e6e2339..bfde789 100644 --- a/transmogrifier/sources/xml/datacite.py +++ b/transmogrifier/sources/xml/datacite.py @@ -25,7 +25,6 @@ def get_optional_fields(self, source_record: Tag) -> dict | None: oai_datacite XML. """ fields: dict = {} - source_record_id = self.get_source_record_id(source_record) # alternate_titles fields["alternate_titles"] = self.get_alternate_titles(source_record) @@ -64,83 +63,22 @@ def get_optional_fields(self, source_record: Tag) -> dict | None: fields["locations"] = self.get_locations(source_record) # notes - if resource_type := source_record.metadata.find("resourceType", string=True): - fields.setdefault("notes", []).append( - timdex.Note( - value=[str(resource_type.string)], - kind="Datacite resource type", - ) - ) - descriptions = source_record.metadata.find_all("description", string=True) - for description in descriptions: - if "descriptionType" not in description.attrs: - logger.warning( - "Datacite record %s missing required Datacite attribute " - "@descriptionType", - source_record_id, - ) - if description.get("descriptionType") != "Abstract": - fields.setdefault("notes", []).append( - timdex.Note( - value=[description.string], - kind=description.get("descriptionType") or None, - ) - ) + fields["notes"] = self.get_notes(source_record) # publishers - if publisher := source_record.metadata.find("publisher", string=True): - fields["publishers"] = [timdex.Publisher(name=publisher.string)] - else: - logger.warning( - "Datacite record %s missing required Datacite field publisher", - source_record_id, - ) + fields["publishers"] = self.get_publishers(source_record) # related_items, uses related_identifiers retrieved for identifiers - for related_identifier in [ - ri - for ri in source_record.metadata.find_all("relatedIdentifier", string=True) - if ri.get("relationType") != "IsIdenticalTo" - ]: - fields.setdefault("related_items", []).append( - timdex.RelatedItem( - uri=self.generate_related_item_identifier_url(related_identifier), - relationship=related_identifier.get("relationType") - or "Not specified", - ) - ) + fields["related_items"] = self.get_related_items(source_record) # rights - for right in [ - r - for r in source_record.metadata.find_all("rights") - if r.string or r.get("rightsURI") - ]: - fields.setdefault("rights", []).append( - timdex.Rights( - description=right.string or None, uri=right.get("rightsURI") or None - ) - ) + fields["rights"] = self.get_rights(source_record) # subjects - subjects_dict: dict[str, list[str]] = {} - for subject in source_record.metadata.find_all("subject", string=True): - if not subject.get("subjectScheme"): - subjects_dict.setdefault("Subject scheme not provided", []).append( - subject.string - ) - else: - subjects_dict.setdefault(subject["subjectScheme"], []).append( - subject.string - ) - fields["subjects"] = [ - timdex.Subject(value=value, kind=key) for key, value in subjects_dict.items() - ] or None + fields["subjects"] = self.get_subjects(source_record) # summary - fields["summary"] = [ - d.string for d in descriptions if d.get("descriptionType") == "Abstract" - ] or None + fields["summary"] = self.get_summary(source_record) return fields @@ -425,6 +363,100 @@ def get_locations(cls, source_record: Tag) -> list[timdex.Location] | None: ) ] or None + @classmethod + def get_notes(cls, source_record: Tag) -> list[timdex.Note] | None: + notes = [] + notes.extend(list(cls._get_resource_type_note(source_record))) + notes.extend(list(cls._get_description_notes(source_record))) + return notes or None + + @classmethod + def _get_resource_type_note(cls, source_record: Tag) -> Iterator[timdex.Note]: + if resource_type := source_record.metadata.find("resourceType", string=True): + yield timdex.Note( + value=[str(resource_type.string)], + kind="Datacite resource type", + ) + + @classmethod + def _get_description_notes(cls, source_record: Tag) -> Iterator[timdex.Note]: + descriptions = source_record.metadata.find_all("description", string=True) + for description in descriptions: + if "descriptionType" not in description.attrs: + logger.warning( + "Datacite record %s missing required Datacite attribute " + "@descriptionType", + cls.get_source_record_id(source_record), + ) + if description.get("descriptionType") != "Abstract": + yield timdex.Note( + value=[str(description.string)], + kind=description.get("descriptionType") or None, + ) + + @classmethod + def get_publishers(cls, source_record: Tag) -> list[timdex.Publisher] | None: + publishers = [] + if publisher := source_record.metadata.find("publisher", string=True): + publishers.append(timdex.Publisher(name=str(publisher.string))) + else: + logger.warning( + "Datacite record %s missing required Datacite field publisher", + cls.get_source_record_id(source_record), + ) + return publishers or None + + @classmethod + def get_related_items(cls, source_record: Tag) -> list[timdex.RelatedItem] | None: + return [ + timdex.RelatedItem( + uri=cls.generate_related_item_identifier_url(related_identifier), + relationship=related_identifier.get("relationType") or "Not specified", + ) + for related_identifier in source_record.metadata.find_all( + "relatedIdentifier", string=True + ) + if related_identifier.get("relationType") != "IsIdenticalTo" + ] or None + + @classmethod + def get_rights(cls, source_record: Tag) -> list[timdex.Rights] | None: + return [ + timdex.Rights( + description=rights.string or None, + uri=rights.get("rightsURI") or None, + ) + for rights in source_record.metadata.find_all("rights") + if rights.string or rights.get("rightsURI") + ] or None + + @classmethod + def get_subjects(cls, source_record: Tag) -> list[timdex.Subject] | None: + subjects_dict: dict[str, list[str]] = {} + + for subject in source_record.metadata.find_all("subject", string=True): + if not subject.get("subjectScheme"): + subjects_dict.setdefault("Subject scheme not provided", []).append( + str(subject.string) + ) + else: + subjects_dict.setdefault(subject["subjectScheme"], []).append( + str(subject.string) + ) + + return [ + timdex.Subject(value=subject_value, kind=subject_scheme) + for subject_scheme, subject_value in subjects_dict.items() + ] or None + + @classmethod + def get_summary(cls, source_record: Tag) -> list[str] | None: + return [ + str(description.string) + for description in source_record.metadata.find_all("description", string=True) + if description.get("descriptionType") == "Abstract" + ] or None + @classmethod def get_main_titles(cls, source_record: Tag) -> list[str]: """ From aad8eb733a1391428e7e5518296f64eeb4ea154d Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Tue, 4 Jun 2024 12:07:23 -0400 Subject: [PATCH 2/2] Updates based on discussion in PR #189 * Remove unnecessary list pattern from get_content_types, get_languages, and get_publishers methods * Refactor _get_description_notes method to remove unnecessary var and add description_type var * Refactor get_subjects method to use defaultdict --- transmogrifier/sources/xml/datacite.py | 54 ++++++++++---------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/transmogrifier/sources/xml/datacite.py b/transmogrifier/sources/xml/datacite.py index bfde789..e7ee6b6 100644 --- a/transmogrifier/sources/xml/datacite.py +++ b/transmogrifier/sources/xml/datacite.py @@ -1,4 +1,5 @@ import logging +from collections import defaultdict from collections.abc import Iterator from bs4 import Tag # type: ignore[import-untyped] @@ -108,22 +109,18 @@ def _get_additional_titles( @classmethod def get_content_type(cls, source_record: Tag) -> list[str] | None: - content_types = [] if resource_type := source_record.metadata.find("resourceType"): if content_type := resource_type.get("resourceTypeGeneral"): if cls.valid_content_types([content_type]): - content_types.append(str(content_type)) - else: - message = f'Record skipped based on content type: "{content_type}"' - raise SkippedRecordEvent( - message, cls.get_source_record_id(source_record) - ) + return [str(content_type)] + message = f'Record skipped based on content type: "{content_type}"' + raise SkippedRecordEvent(message, cls.get_source_record_id(source_record)) else: logger.warning( "Datacite record %s missing required Datacite field resourceType", cls.get_source_record_id(source_record), ) - return content_types or None + return None @classmethod def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor] | None: @@ -340,10 +337,9 @@ def _get_related_identifiers( @classmethod def get_languages(cls, source_record: Tag) -> list[str] | None: - languages = [] if language := source_record.metadata.find("language", string=True): - languages.append(str(language.string)) - return languages or None + return [str(language.string)] + return None def get_links(self, source_record: Tag) -> list[timdex.Link] | None: return [ @@ -380,31 +376,29 @@ def _get_resource_type_note(cls, source_record: Tag) -> Iterator[timdex.Note]: @classmethod def _get_description_notes(cls, source_record: Tag) -> Iterator[timdex.Note]: - descriptions = source_record.metadata.find_all("description", string=True) - for description in descriptions: + for description in source_record.metadata.find_all("description", string=True): + description_type = description.get("descriptionType") if "descriptionType" not in description.attrs: logger.warning( "Datacite record %s missing required Datacite attribute " "@descriptionType", cls.get_source_record_id(source_record), ) - if description.get("descriptionType") != "Abstract": + if description_type != "Abstract": yield timdex.Note( value=[str(description.string)], - kind=description.get("descriptionType") or None, + kind=description_type or None, ) @classmethod def get_publishers(cls, source_record: Tag) -> list[timdex.Publisher] | None: - publishers = [] if publisher := source_record.metadata.find("publisher", string=True): - publishers.append(timdex.Publisher(name=str(publisher.string))) - else: - logger.warning( - "Datacite record %s missing required Datacite field publisher", - cls.get_source_record_id(source_record), - ) - return publishers or None + return [timdex.Publisher(name=str(publisher.string))] + logger.warning( + "Datacite record %s missing required Datacite field publisher", + cls.get_source_record_id(source_record), + ) + return None @classmethod def get_related_items(cls, source_record: Tag) -> list[timdex.RelatedItem] | None: @@ -432,17 +426,11 @@ def get_rights(cls, source_record: Tag) -> list[timdex.Rights] | None: @classmethod def get_subjects(cls, source_record: Tag) -> list[timdex.Subject] | None: - subjects_dict: dict[str, list[str]] = {} - + subjects_dict = defaultdict(list) for subject in source_record.metadata.find_all("subject", string=True): - if not subject.get("subjectScheme"): - subjects_dict.setdefault("Subject scheme not provided", []).append( - str(subject.string) - ) - else: - subjects_dict.setdefault(subject["subjectScheme"], []).append( - str(subject.string) - ) + subjects_dict[ + subject.get("subjectScheme") or "Subject scheme not provided" + ].append(str(subject.string)) return [ timdex.Subject(value=subject_value, kind=subject_scheme)