From c070953c80a51383ae95a61b4ba2eefe1018f01d Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Thu, 25 Apr 2024 11:46:29 -0400 Subject: [PATCH 1/4] Exploratory field method refactor Why these changes are being introduced: * This commit serves as a starting point for discussing the field method refactor of this application How this addresses that need: * Add get_contents and get_dates field method as examples for future refactoring * Add private methods for get_dates methods as example of breaking up large code blocks * Add unit tests as example of expected tests for all future field methods * Add dspace_dim fixtures as examples for future test suite refactoring * Organize fixtures in conftest.py Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-273 --- tests/conftest.py | 63 +++++++++- ...ord_attribute_and_subfield_variations.xml} | 0 .../dspace/dspace_dim_record_errors.xml | 12 ++ tests/sources/xml/test_dspace_dim.py | 47 +++++++- transmogrifier/sources/xml/dspace_dim.py | 109 +++++++++++------- 5 files changed, 188 insertions(+), 43 deletions(-) rename tests/fixtures/dspace/{dspace_dim_record_attribute_variations.xml => dspace_dim_record_attribute_and_subfield_variations.xml} (100%) create mode 100644 tests/fixtures/dspace/dspace_dim_record_errors.xml diff --git a/tests/conftest.py b/tests/conftest.py index 4e28fb5..4b803e4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,7 @@ from transmogrifier.config import SOURCES, load_external_config from transmogrifier.sources.jsontransformer import JSONTransformer from transmogrifier.sources.xml.datacite import Datacite +from transmogrifier.sources.xml.dspace_dim import DspaceDim from transmogrifier.sources.xmltransformer import XMLTransformer @@ -43,6 +44,14 @@ def runner(): return CliRunner() +# aardvark ########################## + + +@pytest.fixture +def aardvark_records(): + return JSONTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl") + + @pytest.fixture def aardvark_record_all_fields(): return JSONTransformer.parse_source_file( @@ -50,6 +59,9 @@ def aardvark_record_all_fields(): ) +# datacite ########################## + + @pytest.fixture def datacite_records(): return XMLTransformer.parse_source_file( @@ -65,9 +77,50 @@ def datacite_record_all_fields(): return Datacite("cool-repo", source_records) +# dspace_dim ########################## + + @pytest.fixture -def aardvark_records(): - return JSONTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl") +def dspace_dim_record_all_fields(): + source_records = DspaceDim.parse_source_file( + "tests/fixtures/dspace/dspace_dim_record_all_fields.xml" + ) + return next(source_records) + + +@pytest.fixture +def dspace_dim_record_attribute_and_subfield_variations(): + source_records = DspaceDim.parse_source_file( + "tests/fixtures/dspace/dspace_dim_record_attribute_and_subfield_variations.xml" + ) + return next(source_records) + + +@pytest.fixture +def dspace_dim_record_errors(): + source_records = DspaceDim.parse_source_file( + "tests/fixtures/dspace/dspace_dim_record_errors.xml" + ) + return next(source_records) + + +@pytest.fixture +def dspace_dim_record_optional_fields_blank(): + source_records = DspaceDim.parse_source_file( + "tests/fixtures/dspace/dspace_dim_record_optional_fields_blank.xml" + ) + return next(source_records) + + +@pytest.fixture +def dspace_dim_record_optional_fields_missing(): + source_records = DspaceDim.parse_source_file( + "tests/fixtures/dspace/dspace_dim_record_optional_fields_missing.xml" + ) + return next(source_records) + + +# marc ########################## @pytest.fixture @@ -80,11 +133,17 @@ def marc_content_type_crosswalk(): return load_external_config("config/marc_content_type_crosswalk.json", "json") +# oaidc ########################## + + @pytest.fixture def oai_pmh_records(): return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml") +# timdex ########################## + + @pytest.fixture def timdex_record_required_fields(): return timdex.TimdexRecord( diff --git a/tests/fixtures/dspace/dspace_dim_record_attribute_variations.xml b/tests/fixtures/dspace/dspace_dim_record_attribute_and_subfield_variations.xml similarity index 100% rename from tests/fixtures/dspace/dspace_dim_record_attribute_variations.xml rename to tests/fixtures/dspace/dspace_dim_record_attribute_and_subfield_variations.xml diff --git a/tests/fixtures/dspace/dspace_dim_record_errors.xml b/tests/fixtures/dspace/dspace_dim_record_errors.xml new file mode 100644 index 0000000..2307507 --- /dev/null +++ b/tests/fixtures/dspace/dspace_dim_record_errors.xml @@ -0,0 +1,12 @@ + + + + + 2020-01-02/2019-01-01 + + + + \ No newline at end of file diff --git a/tests/sources/xml/test_dspace_dim.py b/tests/sources/xml/test_dspace_dim.py index 55acc77..68a3ead 100644 --- a/tests/sources/xml/test_dspace_dim.py +++ b/tests/sources/xml/test_dspace_dim.py @@ -133,7 +133,7 @@ def test_dspace_dim_transform_with_all_fields_transforms_correctly(): def test_dspace_dim_transform_with_attribute_variations_transforms_correctly(): source_records = DspaceDim.parse_source_file( - "tests/fixtures/dspace/dspace_dim_record_attribute_variations.xml" + "tests/fixtures/dspace/dspace_dim_record_attribute_and_subfield_variations.xml" ) output_records = DspaceDim("cool-repo", source_records) assert next(output_records) == timdex.TimdexRecord( @@ -216,3 +216,48 @@ def test_dspace_dim_transform_with_optional_fields_missing_transforms_correctly( format="electronic resource", content_type=["Not specified"], ) + + +def test_get_contents_success(dspace_dim_record_all_fields): + assert DspaceDim.get_contents(dspace_dim_record_all_fields) == ["Chapter 1"] + + +def test_get_contents_transforms_correctly_if_fields_blank( + dspace_dim_record_optional_fields_blank, +): + assert DspaceDim.get_contents(dspace_dim_record_optional_fields_blank) == [] + + +def test_get_contents_transforms_correctly_if_fields_missing( + dspace_dim_record_optional_fields_missing, +): + assert DspaceDim.get_contents(dspace_dim_record_optional_fields_missing) == [] + + +def test_get_dates_success(dspace_dim_record_all_fields): + assert DspaceDim.get_dates(dspace_dim_record_all_fields, "abc123") == [ + timdex.Date(kind="accessioned", value="2009-01-08T16:24:37Z"), + timdex.Date(kind="available", value="2009-01-08T16:24:37Z"), + timdex.Date(kind="Publication date", value="2002-11"), + timdex.Date(kind="coverage", note="1201-01-01 - 1965-12-21"), + timdex.Date( + kind="coverage", + range=timdex.DateRange(gte="1201-01-01", lte="1965-12-21"), + ), + ] + + +def test_get_dates_transforms_correctly_if_fields_blank( + dspace_dim_record_optional_fields_blank, +): + assert DspaceDim.get_dates(dspace_dim_record_optional_fields_blank, "abc123") == [] + + +def test_get_dates_transforms_correctly_if_fields_missing( + dspace_dim_record_optional_fields_missing, +): + assert DspaceDim.get_dates(dspace_dim_record_optional_fields_missing, "abc123") == [] + + +def test_get_dates_invalid_date_range_skipped(dspace_dim_record_errors): + assert DspaceDim.get_dates(dspace_dim_record_errors, "abc123") == [] diff --git a/transmogrifier/sources/xml/dspace_dim.py b/transmogrifier/sources/xml/dspace_dim.py index 0e4714c..b1550fd 100644 --- a/transmogrifier/sources/xml/dspace_dim.py +++ b/transmogrifier/sources/xml/dspace_dim.py @@ -56,13 +56,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: return None # contents - fields["contents"] = [ - t.string - for t in xml.find_all( - "dim:field", element="description", qualifier="tableofcontents" - ) - if t.string - ] or None + fields["contents"] = self.get_contents(xml) or None # contributors for creator in [ @@ -86,39 +80,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) # dates - for date in xml.find_all("dim:field", element="date", string=True): - date_value = str(date.string.strip()) - if validate_date(date_value, source_record_id): - if date.get("qualifier") == "issued": - d = timdex.Date(value=date_value, kind="Publication date") - else: - d = timdex.Date(value=date_value, kind=date.get("qualifier") or None) - fields.setdefault("dates", []).append(d) - - for coverage in [ - c.string - for c in xml.find_all("dim:field", element="coverage", qualifier="temporal") - if c.string - ]: - if "/" in coverage: - split = coverage.index("/") - gte_date = coverage[:split] - lte_date = coverage[split + 1 :] - if validate_date_range( - gte_date, - lte_date, - source_record_id, - ): - d = timdex.Date( - range=timdex.DateRange( - gte=gte_date, - lte=lte_date, - ), - kind="coverage", - ) - else: - d = timdex.Date(note=coverage.string, kind="coverage") - fields.setdefault("dates", []).append(d) + fields["dates"] = self.get_dates(xml, source_record_id) or None # file_formats fields["file_formats"] = [ @@ -259,6 +221,73 @@ def get_optional_fields(self, xml: Tag) -> dict | None: return fields + @classmethod + def get_contents(cls, xml: Tag) -> list[str]: + return [ + str(contents.string) + for contents in xml.find_all( + "dim:field", + element="description", + qualifier="tableofcontents", + string=True, + ) + ] + + @classmethod + def get_dates(cls, xml: Tag, source_record_id: str) -> list[timdex.Date]: + dates = [] + for date_element in xml.find_all("dim:field", element="date", string=True): + date_value = str(date_element.string.strip()) + if validate_date(date_value, source_record_id): + if date_element.get("qualifier") == "issued": + date_object = timdex.Date(value=date_value, kind="Publication date") + else: + date_object = timdex.Date( + value=date_value, kind=date_element.get("qualifier") or None + ) + dates.append(date_object) + dates.extend(cls._get_coverage_dates(xml, source_record_id)) + return dates + + @classmethod + def _get_coverage_dates(cls, xml: Tag, source_record_id: str) -> list[timdex.Date]: + coverage_dates = [] + for coverage_value in [ + str(coverage_element.string) + for coverage_element in xml.find_all( + "dim:field", element="coverage", qualifier="temporal", string=True + ) + ]: + if "/" in coverage_value: + date_object = cls._parse_date_range(coverage_value, source_record_id) + else: + date_object = timdex.Date(note=coverage_value, kind="coverage") + if date_object: + coverage_dates.append(date_object) + return coverage_dates + + @classmethod + def _parse_date_range( + cls, coverage_value: Tag, source_record_id: str + ) -> timdex.Date | None: + """Parse date range value and return a Date object if it is validated.""" + split = coverage_value.index("/") + gte_date = coverage_value[:split] + lte_date = coverage_value[split + 1 :] + if validate_date_range( + gte_date, + lte_date, + source_record_id, + ): + return timdex.Date( + range=timdex.DateRange( + gte=gte_date, + lte=lte_date, + ), + kind="coverage", + ) + return None + @classmethod def get_content_types(cls, xml: Tag) -> list[str] | None: """ From f29279ccc2545cb848e8d62df58fd6ad4fd3695e Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Tue, 30 Apr 2024 10:18:18 -0400 Subject: [PATCH 2/4] Updates based on discussion in PR # 166 * Rename field method param xml > source_record * Refactor get_dates method for clarity * Remove _errors fixture * Add create_dspace_dim_source_record_stub function and use stubs records in dspace_dim unit tests --- tests/conftest.py | 8 --- .../dspace/dspace_dim_record_errors.xml | 12 ---- tests/sources/xml/test_dspace_dim.py | 57 +++++++++++++++++-- transmogrifier/sources/xml/dspace_dim.py | 33 +++++++---- 4 files changed, 72 insertions(+), 38 deletions(-) delete mode 100644 tests/fixtures/dspace/dspace_dim_record_errors.xml diff --git a/tests/conftest.py b/tests/conftest.py index 4b803e4..4d154be 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -96,14 +96,6 @@ def dspace_dim_record_attribute_and_subfield_variations(): return next(source_records) -@pytest.fixture -def dspace_dim_record_errors(): - source_records = DspaceDim.parse_source_file( - "tests/fixtures/dspace/dspace_dim_record_errors.xml" - ) - return next(source_records) - - @pytest.fixture def dspace_dim_record_optional_fields_blank(): source_records = DspaceDim.parse_source_file( diff --git a/tests/fixtures/dspace/dspace_dim_record_errors.xml b/tests/fixtures/dspace/dspace_dim_record_errors.xml deleted file mode 100644 index 2307507..0000000 --- a/tests/fixtures/dspace/dspace_dim_record_errors.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - 2020-01-02/2019-01-01 - - - - \ No newline at end of file diff --git a/tests/sources/xml/test_dspace_dim.py b/tests/sources/xml/test_dspace_dim.py index 68a3ead..a37460b 100644 --- a/tests/sources/xml/test_dspace_dim.py +++ b/tests/sources/xml/test_dspace_dim.py @@ -1,7 +1,27 @@ +from bs4 import BeautifulSoup + import transmogrifier.models as timdex from transmogrifier.sources.xml.dspace_dim import DspaceDim +def create_dspace_dim_source_record_stub(xml_insert: str) -> BeautifulSoup: + xml_str = f""" + + + + + {xml_insert} + + + + + """ + return BeautifulSoup(xml_str, "xml") + + def test_dspace_dim_transform_with_all_fields_transforms_correctly(): source_records = DspaceDim.parse_source_file( "tests/fixtures/dspace/dspace_dim_record_all_fields.xml" @@ -218,8 +238,14 @@ def test_dspace_dim_transform_with_optional_fields_missing_transforms_correctly( ) -def test_get_contents_success(dspace_dim_record_all_fields): - assert DspaceDim.get_contents(dspace_dim_record_all_fields) == ["Chapter 1"] +def test_get_contents_success(): + source_record = create_dspace_dim_source_record_stub( + """ + Chapter 1 + """ + ) + assert DspaceDim.get_contents(source_record) == ["Chapter 1"] def test_get_contents_transforms_correctly_if_fields_blank( @@ -234,8 +260,21 @@ def test_get_contents_transforms_correctly_if_fields_missing( assert DspaceDim.get_contents(dspace_dim_record_optional_fields_missing) == [] -def test_get_dates_success(dspace_dim_record_all_fields): - assert DspaceDim.get_dates(dspace_dim_record_all_fields, "abc123") == [ +def test_get_dates_success(): + source_record = create_dspace_dim_source_record_stub( + '' + "1201-01-01 - 1965-12-21" + '' + "1201-01-01/1965-12-21" + '' + "2009-01-08T16:24:37Z" + '' + "2009-01-08T16:24:37Z" + '2002-11' + '' + "https://hdl.handle.net/1912/2641" + ) + assert DspaceDim.get_dates(source_record, "abc123") == [ timdex.Date(kind="accessioned", value="2009-01-08T16:24:37Z"), timdex.Date(kind="available", value="2009-01-08T16:24:37Z"), timdex.Date(kind="Publication date", value="2002-11"), @@ -259,5 +298,11 @@ def test_get_dates_transforms_correctly_if_fields_missing( assert DspaceDim.get_dates(dspace_dim_record_optional_fields_missing, "abc123") == [] -def test_get_dates_invalid_date_range_skipped(dspace_dim_record_errors): - assert DspaceDim.get_dates(dspace_dim_record_errors, "abc123") == [] +def test_get_dates_invalid_date_range_skipped(): + source_record = create_dspace_dim_source_record_stub( + """ + 2020-01-02/2019-01-01 + + """ + ) + assert DspaceDim.get_dates(source_record, "abc123") == [] diff --git a/transmogrifier/sources/xml/dspace_dim.py b/transmogrifier/sources/xml/dspace_dim.py index b1550fd..2c7cc20 100644 --- a/transmogrifier/sources/xml/dspace_dim.py +++ b/transmogrifier/sources/xml/dspace_dim.py @@ -1,4 +1,5 @@ import logging +from collections.abc import Iterator from bs4 import Tag # type: ignore[import-untyped] @@ -222,10 +223,10 @@ def get_optional_fields(self, xml: Tag) -> dict | None: return fields @classmethod - def get_contents(cls, xml: Tag) -> list[str]: + def get_contents(cls, source_record: Tag) -> list[str]: return [ str(contents.string) - for contents in xml.find_all( + for contents in source_record.find_all( "dim:field", element="description", qualifier="tableofcontents", @@ -234,9 +235,19 @@ def get_contents(cls, xml: Tag) -> list[str]: ] @classmethod - def get_dates(cls, xml: Tag, source_record_id: str) -> list[timdex.Date]: + def get_dates(cls, source_record: Tag, source_record_id: str) -> list[timdex.Date]: dates = [] - for date_element in xml.find_all("dim:field", element="date", string=True): + dates.extend(list(cls._parse_date_elements(source_record, source_record_id))) + dates.extend(list(cls._parse_coverage_elements(source_record, source_record_id))) + return dates + + @classmethod + def _parse_date_elements( + cls, source_record: Tag, source_record_id: str + ) -> Iterator[timdex.Date]: + for date_element in source_record.find_all( + "dim:field", element="date", string=True + ): date_value = str(date_element.string.strip()) if validate_date(date_value, source_record_id): if date_element.get("qualifier") == "issued": @@ -245,16 +256,15 @@ def get_dates(cls, xml: Tag, source_record_id: str) -> list[timdex.Date]: date_object = timdex.Date( value=date_value, kind=date_element.get("qualifier") or None ) - dates.append(date_object) - dates.extend(cls._get_coverage_dates(xml, source_record_id)) - return dates + yield date_object @classmethod - def _get_coverage_dates(cls, xml: Tag, source_record_id: str) -> list[timdex.Date]: - coverage_dates = [] + def _parse_coverage_elements( + cls, source_record: Tag, source_record_id: str + ) -> Iterator[timdex.Date]: for coverage_value in [ str(coverage_element.string) - for coverage_element in xml.find_all( + for coverage_element in source_record.find_all( "dim:field", element="coverage", qualifier="temporal", string=True ) ]: @@ -263,8 +273,7 @@ def _get_coverage_dates(cls, xml: Tag, source_record_id: str) -> list[timdex.Dat else: date_object = timdex.Date(note=coverage_value, kind="coverage") if date_object: - coverage_dates.append(date_object) - return coverage_dates + yield date_object @classmethod def _parse_date_range( From 7f742e268880fb58e6d6bbbc47e2b5e6ab9cd58d Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Tue, 30 Apr 2024 13:46:37 -0400 Subject: [PATCH 3/4] Update formatting in test_dspace_dim.py --- tests/sources/xml/test_dspace_dim.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/tests/sources/xml/test_dspace_dim.py b/tests/sources/xml/test_dspace_dim.py index a37460b..6dd13fc 100644 --- a/tests/sources/xml/test_dspace_dim.py +++ b/tests/sources/xml/test_dspace_dim.py @@ -1,3 +1,4 @@ +# ruff: noqa: E501 from bs4 import BeautifulSoup import transmogrifier.models as timdex @@ -241,8 +242,7 @@ def test_dspace_dim_transform_with_optional_fields_missing_transforms_correctly( def test_get_contents_success(): source_record = create_dspace_dim_source_record_stub( """ - Chapter 1 + Chapter 1 """ ) assert DspaceDim.get_contents(source_record) == ["Chapter 1"] @@ -262,17 +262,14 @@ def test_get_contents_transforms_correctly_if_fields_missing( def test_get_dates_success(): source_record = create_dspace_dim_source_record_stub( - '' - "1201-01-01 - 1965-12-21" - '' - "1201-01-01/1965-12-21" - '' - "2009-01-08T16:24:37Z" - '' - "2009-01-08T16:24:37Z" - '2002-11' - '' - "https://hdl.handle.net/1912/2641" + """ + 1201-01-01 - 1965-12-21 + 1201-01-01/1965-12-21 + 2009-01-08T16:24:37Z + 2009-01-08T16:24:37Z + 2002-11 + https://hdl.handle.net/1912/2641 + """ ) assert DspaceDim.get_dates(source_record, "abc123") == [ timdex.Date(kind="accessioned", value="2009-01-08T16:24:37Z"), From cee6ffc00cae1f11f454ee9d8b3e156181a57753 Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Thu, 2 May 2024 13:43:23 -0400 Subject: [PATCH 4/4] Rename xml > source_record --- transmogrifier/sources/xml/dspace_dim.py | 73 ++++++++++++++---------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/transmogrifier/sources/xml/dspace_dim.py b/transmogrifier/sources/xml/dspace_dim.py index 2c7cc20..7bac193 100644 --- a/transmogrifier/sources/xml/dspace_dim.py +++ b/transmogrifier/sources/xml/dspace_dim.py @@ -13,23 +13,24 @@ class DspaceDim(XMLTransformer): """DSpace DIM transformer.""" - def get_optional_fields(self, xml: Tag) -> dict | None: + def get_optional_fields(self, source_record: Tag) -> dict | None: """ Retrieve optional TIMDEX fields from a DSpace DIM XML record. Overrides metaclass get_optional_fields() method. Args: - xml: A BeautifulSoup Tag representing a single DSpace DIM XML record. + source_record: A BeautifulSoup Tag representing a single DSpace DIM XML + record. """ fields: dict = {} - source_record_id = self.get_source_record_id(xml) + source_record_id = self.get_source_record_id(source_record) # alternate_titles for alternate_title in [ t - for t in xml.find_all("dim:field", element="title") + for t in source_record.find_all("dim:field", element="title") if "qualifier" in t.attrs and t.string ]: fields.setdefault("alternate_titles", []).append( @@ -39,29 +40,31 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) ) # If the record has more than one main title, add extras to alternate_titles - for index, title in enumerate(self.get_main_titles(xml)): + for index, title in enumerate(self.get_main_titles(source_record)): if index > 0: fields.setdefault("alternate_titles", []).append( timdex.AlternateTitle(value=title) ) # citation - citation = xml.find("dim:field", element="identifier", qualifier="citation") + citation = source_record.find( + "dim:field", element="identifier", qualifier="citation" + ) fields["citation"] = citation.string if citation and citation.string else None # content_type - if content_types := self.get_content_types(xml): + if content_types := self.get_content_types(source_record): if self.valid_content_types(content_types): fields["content_type"] = content_types else: return None # contents - fields["contents"] = self.get_contents(xml) or None + fields["contents"] = self.get_contents(source_record) or None # contributors for creator in [ - c for c in xml.find_all("dim:field", element="creator") if c.string + c for c in source_record.find_all("dim:field", element="creator") if c.string ]: fields.setdefault("contributors", []).append( timdex.Contributor( @@ -71,7 +74,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) for contributor in [ - c for c in xml.find_all("dim:field", element="contributor") if c.string + c + for c in source_record.find_all("dim:field", element="contributor") + if c.string ]: fields.setdefault("contributors", []).append( timdex.Contributor( @@ -81,12 +86,12 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) # dates - fields["dates"] = self.get_dates(xml, source_record_id) or None + fields["dates"] = self.get_dates(source_record, source_record_id) or None # file_formats fields["file_formats"] = [ f.string - for f in xml.find_all("dim:field", element="format") + for f in source_record.find_all("dim:field", element="format") if f.get("qualifier") == "mimetype" and f.string ] or None @@ -96,7 +101,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # funding_information for funding_reference in [ f - for f in xml.find_all( + for f in source_record.find_all( "dim:field", element="description", qualifier="sponsorship" ) if f.string @@ -108,7 +113,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) # identifiers - identifiers = xml.find_all("dim:field", element="identifier") + identifiers = source_record.find_all("dim:field", element="identifier") for identifier in [ i for i in identifiers if i.get("qualifier") != "citation" and i.string ]: @@ -121,7 +126,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # language fields["languages"] = [ - la.string for la in xml.find_all("dim:field", element="language") if la.string + la.string + for la in source_record.find_all("dim:field", element="language") + if la.string ] or None # links, uses identifiers list retrieved for identifiers field @@ -139,12 +146,14 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # locations fields["locations"] = [ timdex.Location(value=lo.string) - for lo in xml.find_all("dim:field", element="coverage", qualifier="spatial") + for lo in source_record.find_all( + "dim:field", element="coverage", qualifier="spatial" + ) if lo.string ] or None # notes - descriptions = xml.find_all("dim:field", element="description") + descriptions = source_record.find_all("dim:field", element="description") for description in [ d for d in descriptions @@ -167,13 +176,13 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # publishers fields["publishers"] = [ timdex.Publisher(name=p.string) - for p in xml.find_all("dim:field", element="publisher") + for p in source_record.find_all("dim:field", element="publisher") if p.string ] or None # related_items for related_item in [ - r for r in xml.find_all("dim:field", element="relation") if r.string + r for r in source_record.find_all("dim:field", element="relation") if r.string ]: if related_item.get("qualifier") == "uri": ri = timdex.RelatedItem( @@ -188,7 +197,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # rights for rights in [ - r for r in xml.find_all("dim:field", element="rights") if r.string + r for r in source_record.find_all("dim:field", element="rights") if r.string ]: if rights.get("qualifier") == "uri": rg = timdex.Rights(uri=rights.string) @@ -201,7 +210,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # subjects subjects_dict: dict[str, list[str]] = {} for subject in [ - s for s in xml.find_all("dim:field", element="subject") if s.string + s for s in source_record.find_all("dim:field", element="subject") if s.string ]: if not subject.get("qualifier"): subjects_dict.setdefault("Subject scheme not provided", []).append( @@ -298,7 +307,7 @@ def _parse_date_range( return None @classmethod - def get_content_types(cls, xml: Tag) -> list[str] | None: + def get_content_types(cls, source_record: Tag) -> list[str] | None: """ Retrieve content types from a DSpace DIM XML record. @@ -306,39 +315,43 @@ def get_content_types(cls, xml: Tag) -> list[str] | None: differently. Args: - xml: A BeautifulSoup Tag representing a single DSpace DIM XML record. + source_record: A BeautifulSoup Tag representing a single DSpace DIM XML + record. """ return [ - t.string for t in xml.find_all("dim:field", element="type", string=True) + t.string + for t in source_record.find_all("dim:field", element="type", string=True) ] or None @classmethod - def get_main_titles(cls, xml: Tag) -> list[str]: + def get_main_titles(cls, source_record: Tag) -> list[str]: """ Retrieve main title(s) from a DSpace DIM XML record. Overrides metaclass get_main_titles() method. Args: - xml: A BeautifulSoup Tag representing a single DSpace DIM XML record. + source_record: A BeautifulSoup Tag representing a single DSpace DIM XML + record. """ return [ t.string - for t in xml.find_all("dim:field", element="title", string=True) + for t in source_record.find_all("dim:field", element="title", string=True) if "qualifier" not in t.attrs ] @classmethod - def get_source_record_id(cls, xml: Tag) -> str: + def get_source_record_id(cls, source_record: Tag) -> str: """ Get the source record ID from a DSpace DIM XML record. Overrides metaclass get_source_record_id() method. Args: - xml: A BeautifulSoup Tag representing a single DSpace DIM XML record. + source_record: A BeautifulSoup Tag representing a single DSpace DIM XML + record. """ - return xml.header.identifier.string.split(":")[2] + return source_record.header.identifier.string.split(":")[2] @classmethod def valid_content_types(cls, _content_type_list: list[str]) -> bool: