diff --git a/tests/conftest.py b/tests/conftest.py
index 4e28fb5..4d154be 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,6 +5,7 @@
from transmogrifier.config import SOURCES, load_external_config
from transmogrifier.sources.jsontransformer import JSONTransformer
from transmogrifier.sources.xml.datacite import Datacite
+from transmogrifier.sources.xml.dspace_dim import DspaceDim
from transmogrifier.sources.xmltransformer import XMLTransformer
@@ -43,6 +44,14 @@ def runner():
return CliRunner()
+# aardvark ##########################
+
+
+@pytest.fixture
+def aardvark_records():
+    """Parse the Aardvark JSON Lines fixture file with JSONTransformer."""
+    fixture_path = "tests/fixtures/aardvark_records.jsonl"
+    return JSONTransformer.parse_source_file(fixture_path)
+
+
@pytest.fixture
def aardvark_record_all_fields():
return JSONTransformer.parse_source_file(
@@ -50,6 +59,9 @@ def aardvark_record_all_fields():
)
+# datacite ##########################
+
+
@pytest.fixture
def datacite_records():
return XMLTransformer.parse_source_file(
@@ -65,9 +77,42 @@ def datacite_record_all_fields():
return Datacite("cool-repo", source_records)
+# dspace_dim ##########################
+
+
@pytest.fixture
-def aardvark_records():
- return JSONTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl")
+def dspace_dim_record_all_fields():
+    """Return the first record parsed from the all-fields DSpace DIM fixture."""
+    return next(
+        DspaceDim.parse_source_file(
+            "tests/fixtures/dspace/dspace_dim_record_all_fields.xml"
+        )
+    )
+
+
+@pytest.fixture
+def dspace_dim_record_attribute_and_subfield_variations():
+    """Return the first record from the attribute and subfield variations fixture."""
+    parsed_records = DspaceDim.parse_source_file(
+        "tests/fixtures/dspace/dspace_dim_record_attribute_and_subfield_variations.xml"
+    )
+    first_record = next(parsed_records)
+    return first_record
+
+
+@pytest.fixture
+def dspace_dim_record_optional_fields_blank():
+    """Return the first record parsed from the blank optional fields fixture."""
+    return next(
+        DspaceDim.parse_source_file(
+            "tests/fixtures/dspace/dspace_dim_record_optional_fields_blank.xml"
+        )
+    )
+
+
+@pytest.fixture
+def dspace_dim_record_optional_fields_missing():
+    """Return the first record parsed from the missing optional fields fixture."""
+    return next(
+        DspaceDim.parse_source_file(
+            "tests/fixtures/dspace/dspace_dim_record_optional_fields_missing.xml"
+        )
+    )
+
+
+# marc ##########################
@pytest.fixture
@@ -80,11 +125,17 @@ def marc_content_type_crosswalk():
return load_external_config("config/marc_content_type_crosswalk.json", "json")
+# oaidc ##########################
+
+
@pytest.fixture
def oai_pmh_records():
return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml")
+# timdex ##########################
+
+
@pytest.fixture
def timdex_record_required_fields():
return timdex.TimdexRecord(
diff --git a/tests/fixtures/dspace/dspace_dim_record_attribute_variations.xml b/tests/fixtures/dspace/dspace_dim_record_attribute_and_subfield_variations.xml
similarity index 100%
rename from tests/fixtures/dspace/dspace_dim_record_attribute_variations.xml
rename to tests/fixtures/dspace/dspace_dim_record_attribute_and_subfield_variations.xml
diff --git a/tests/sources/xml/test_dspace_dim.py b/tests/sources/xml/test_dspace_dim.py
index 55acc77..6dd13fc 100644
--- a/tests/sources/xml/test_dspace_dim.py
+++ b/tests/sources/xml/test_dspace_dim.py
@@ -1,7 +1,28 @@
+# ruff: noqa: E501
+from bs4 import BeautifulSoup
+
import transmogrifier.models as timdex
from transmogrifier.sources.xml.dspace_dim import DspaceDim
+def create_dspace_dim_source_record_stub(xml_insert: str) -> BeautifulSoup:
+ """Interpolate xml_insert into the XML template below and parse it with the "xml" parser."""
+ xml_str = f"""
+
+
+
+
+ {xml_insert}
+
+
+
+
+ """
+ return BeautifulSoup(xml_str, "xml")
+
+
def test_dspace_dim_transform_with_all_fields_transforms_correctly():
source_records = DspaceDim.parse_source_file(
"tests/fixtures/dspace/dspace_dim_record_all_fields.xml"
@@ -133,7 +154,7 @@ def test_dspace_dim_transform_with_all_fields_transforms_correctly():
def test_dspace_dim_transform_with_attribute_variations_transforms_correctly():
source_records = DspaceDim.parse_source_file(
- "tests/fixtures/dspace/dspace_dim_record_attribute_variations.xml"
+ "tests/fixtures/dspace/dspace_dim_record_attribute_and_subfield_variations.xml"
)
output_records = DspaceDim("cool-repo", source_records)
assert next(output_records) == timdex.TimdexRecord(
@@ -216,3 +237,69 @@ def test_dspace_dim_transform_with_optional_fields_missing_transforms_correctly(
format="electronic resource",
content_type=["Not specified"],
)
+
+
+def test_get_contents_success():
+ """Check that get_contents returns ["Chapter 1"] for a stub record built from the inline snippet."""
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ Chapter 1
+ """
+ )
+ assert DspaceDim.get_contents(source_record) == ["Chapter 1"]
+
+
+def test_get_contents_transforms_correctly_if_fields_blank(
+    dspace_dim_record_optional_fields_blank,
+):
+    """get_contents returns an empty list for the blank optional fields fixture."""
+    contents = DspaceDim.get_contents(dspace_dim_record_optional_fields_blank)
+    assert contents == []
+
+
+def test_get_contents_transforms_correctly_if_fields_missing(
+    dspace_dim_record_optional_fields_missing,
+):
+    """get_contents returns an empty list for the missing optional fields fixture."""
+    contents = DspaceDim.get_contents(dspace_dim_record_optional_fields_missing)
+    assert contents == []
+
+
+def test_get_dates_success():
+ """Check the Date objects, and their order, that get_dates returns for the inline stub snippet."""
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ 1201-01-01 - 1965-12-21
+ 1201-01-01/1965-12-21
+ 2009-01-08T16:24:37Z
+ 2009-01-08T16:24:37Z
+ 2002-11
+ https://hdl.handle.net/1912/2641
+ """
+ )
+ assert DspaceDim.get_dates(source_record, "abc123") == [
+ timdex.Date(kind="accessioned", value="2009-01-08T16:24:37Z"),
+ timdex.Date(kind="available", value="2009-01-08T16:24:37Z"),
+ timdex.Date(kind="Publication date", value="2002-11"),
+ timdex.Date(kind="coverage", note="1201-01-01 - 1965-12-21"),
+ timdex.Date(
+ kind="coverage",
+ range=timdex.DateRange(gte="1201-01-01", lte="1965-12-21"),
+ ),
+ ]
+
+
+def test_get_dates_transforms_correctly_if_fields_blank(
+    dspace_dim_record_optional_fields_blank,
+):
+    """get_dates returns an empty list for the blank optional fields fixture."""
+    dates = DspaceDim.get_dates(dspace_dim_record_optional_fields_blank, "abc123")
+    assert dates == []
+
+
+def test_get_dates_transforms_correctly_if_fields_missing(
+    dspace_dim_record_optional_fields_missing,
+):
+    """get_dates returns an empty list for the missing optional fields fixture."""
+    dates = DspaceDim.get_dates(dspace_dim_record_optional_fields_missing, "abc123")
+    assert dates == []
+
+
+def test_get_dates_invalid_date_range_skipped():
+ """Check that get_dates returns [] for a stub built from a range whose start date (2020-01-02) is later than its end date (2019-01-01)."""
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ 2020-01-02/2019-01-01
+
+ """
+ )
+ assert DspaceDim.get_dates(source_record, "abc123") == []
diff --git a/transmogrifier/sources/xml/dspace_dim.py b/transmogrifier/sources/xml/dspace_dim.py
index 0e4714c..7bac193 100644
--- a/transmogrifier/sources/xml/dspace_dim.py
+++ b/transmogrifier/sources/xml/dspace_dim.py
@@ -1,4 +1,5 @@
import logging
+from collections.abc import Iterator
from bs4 import Tag # type: ignore[import-untyped]
@@ -12,23 +13,24 @@
class DspaceDim(XMLTransformer):
"""DSpace DIM transformer."""
- def get_optional_fields(self, xml: Tag) -> dict | None:
+ def get_optional_fields(self, source_record: Tag) -> dict | None:
"""
Retrieve optional TIMDEX fields from a DSpace DIM XML record.
Overrides metaclass get_optional_fields() method.
Args:
- xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
+ source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+ record.
"""
fields: dict = {}
- source_record_id = self.get_source_record_id(xml)
+ source_record_id = self.get_source_record_id(source_record)
# alternate_titles
for alternate_title in [
t
- for t in xml.find_all("dim:field", element="title")
+ for t in source_record.find_all("dim:field", element="title")
if "qualifier" in t.attrs and t.string
]:
fields.setdefault("alternate_titles", []).append(
@@ -38,35 +40,31 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)
)
# If the record has more than one main title, add extras to alternate_titles
- for index, title in enumerate(self.get_main_titles(xml)):
+ for index, title in enumerate(self.get_main_titles(source_record)):
if index > 0:
fields.setdefault("alternate_titles", []).append(
timdex.AlternateTitle(value=title)
)
# citation
- citation = xml.find("dim:field", element="identifier", qualifier="citation")
+ citation = source_record.find(
+ "dim:field", element="identifier", qualifier="citation"
+ )
fields["citation"] = citation.string if citation and citation.string else None
# content_type
- if content_types := self.get_content_types(xml):
+ if content_types := self.get_content_types(source_record):
if self.valid_content_types(content_types):
fields["content_type"] = content_types
else:
return None
# contents
- fields["contents"] = [
- t.string
- for t in xml.find_all(
- "dim:field", element="description", qualifier="tableofcontents"
- )
- if t.string
- ] or None
+ fields["contents"] = self.get_contents(source_record) or None
# contributors
for creator in [
- c for c in xml.find_all("dim:field", element="creator") if c.string
+ c for c in source_record.find_all("dim:field", element="creator") if c.string
]:
fields.setdefault("contributors", []).append(
timdex.Contributor(
@@ -76,7 +74,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)
for contributor in [
- c for c in xml.find_all("dim:field", element="contributor") if c.string
+ c
+ for c in source_record.find_all("dim:field", element="contributor")
+ if c.string
]:
fields.setdefault("contributors", []).append(
timdex.Contributor(
@@ -86,44 +86,12 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)
# dates
- for date in xml.find_all("dim:field", element="date", string=True):
- date_value = str(date.string.strip())
- if validate_date(date_value, source_record_id):
- if date.get("qualifier") == "issued":
- d = timdex.Date(value=date_value, kind="Publication date")
- else:
- d = timdex.Date(value=date_value, kind=date.get("qualifier") or None)
- fields.setdefault("dates", []).append(d)
-
- for coverage in [
- c.string
- for c in xml.find_all("dim:field", element="coverage", qualifier="temporal")
- if c.string
- ]:
- if "/" in coverage:
- split = coverage.index("/")
- gte_date = coverage[:split]
- lte_date = coverage[split + 1 :]
- if validate_date_range(
- gte_date,
- lte_date,
- source_record_id,
- ):
- d = timdex.Date(
- range=timdex.DateRange(
- gte=gte_date,
- lte=lte_date,
- ),
- kind="coverage",
- )
- else:
- d = timdex.Date(note=coverage.string, kind="coverage")
- fields.setdefault("dates", []).append(d)
+ fields["dates"] = self.get_dates(source_record, source_record_id) or None
# file_formats
fields["file_formats"] = [
f.string
- for f in xml.find_all("dim:field", element="format")
+ for f in source_record.find_all("dim:field", element="format")
if f.get("qualifier") == "mimetype" and f.string
] or None
@@ -133,7 +101,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# funding_information
for funding_reference in [
f
- for f in xml.find_all(
+ for f in source_record.find_all(
"dim:field", element="description", qualifier="sponsorship"
)
if f.string
@@ -145,7 +113,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)
# identifiers
- identifiers = xml.find_all("dim:field", element="identifier")
+ identifiers = source_record.find_all("dim:field", element="identifier")
for identifier in [
i for i in identifiers if i.get("qualifier") != "citation" and i.string
]:
@@ -158,7 +126,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# language
fields["languages"] = [
- la.string for la in xml.find_all("dim:field", element="language") if la.string
+ la.string
+ for la in source_record.find_all("dim:field", element="language")
+ if la.string
] or None
# links, uses identifiers list retrieved for identifiers field
@@ -176,12 +146,14 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# locations
fields["locations"] = [
timdex.Location(value=lo.string)
- for lo in xml.find_all("dim:field", element="coverage", qualifier="spatial")
+ for lo in source_record.find_all(
+ "dim:field", element="coverage", qualifier="spatial"
+ )
if lo.string
] or None
# notes
- descriptions = xml.find_all("dim:field", element="description")
+ descriptions = source_record.find_all("dim:field", element="description")
for description in [
d
for d in descriptions
@@ -204,13 +176,13 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# publishers
fields["publishers"] = [
timdex.Publisher(name=p.string)
- for p in xml.find_all("dim:field", element="publisher")
+ for p in source_record.find_all("dim:field", element="publisher")
if p.string
] or None
# related_items
for related_item in [
- r for r in xml.find_all("dim:field", element="relation") if r.string
+ r for r in source_record.find_all("dim:field", element="relation") if r.string
]:
if related_item.get("qualifier") == "uri":
ri = timdex.RelatedItem(
@@ -225,7 +197,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# rights
for rights in [
- r for r in xml.find_all("dim:field", element="rights") if r.string
+ r for r in source_record.find_all("dim:field", element="rights") if r.string
]:
if rights.get("qualifier") == "uri":
rg = timdex.Rights(uri=rights.string)
@@ -238,7 +210,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# subjects
subjects_dict: dict[str, list[str]] = {}
for subject in [
- s for s in xml.find_all("dim:field", element="subject") if s.string
+ s for s in source_record.find_all("dim:field", element="subject") if s.string
]:
if not subject.get("qualifier"):
subjects_dict.setdefault("Subject scheme not provided", []).append(
@@ -260,7 +232,82 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
return fields
@classmethod
- def get_content_types(cls, xml: Tag) -> list[str] | None:
+    def get_contents(cls, source_record: Tag) -> list[str]:
+        """
+        Retrieve table of contents values from a DSpace DIM XML record.
+
+        Collects the string of every "dim:field" element whose element attribute is
+        "description" and whose qualifier attribute is "tableofcontents". The search
+        uses string=True, so only elements matched by that filter are included.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+        """
+        toc_fields = source_record.find_all(
+            "dim:field",
+            element="description",
+            qualifier="tableofcontents",
+            string=True,
+        )
+        return [str(toc_field.string) for toc_field in toc_fields]
+
+    @classmethod
+    def get_dates(cls, source_record: Tag, source_record_id: str) -> list[timdex.Date]:
+        """
+        Retrieve dates from a DSpace DIM XML record.
+
+        Dates parsed from date elements come first, followed by dates parsed from
+        temporal coverage elements.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+            source_record_id: Source record ID passed on to the parsing helpers.
+        """
+        # Unpack both generators straight into the result rather than copying each
+        # into a temporary list first.
+        return [
+            *cls._parse_date_elements(source_record, source_record_id),
+            *cls._parse_coverage_elements(source_record, source_record_id),
+        ]
+
+    @classmethod
+    def _parse_date_elements(
+        cls, source_record: Tag, source_record_id: str
+    ) -> Iterator[timdex.Date]:
+        """
+        Yield a Date object for each date element that passes validate_date().
+
+        The element's qualifier attribute becomes the Date kind, except that the
+        "issued" qualifier is reported as "Publication date" and a missing or empty
+        qualifier becomes None.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+            source_record_id: Source record ID passed to validate_date().
+        """
+        for date_element in source_record.find_all(
+            "dim:field", element="date", string=True
+        ):
+            date_value = str(date_element.string.strip())
+            if not validate_date(date_value, source_record_id):
+                continue
+            qualifier = date_element.get("qualifier")
+            kind = "Publication date" if qualifier == "issued" else (qualifier or None)
+            yield timdex.Date(value=date_value, kind=kind)
+
+    @classmethod
+    def _parse_coverage_elements(
+        cls, source_record: Tag, source_record_id: str
+    ) -> Iterator[timdex.Date]:
+        """
+        Yield a Date object of kind "coverage" for temporal coverage elements.
+
+        A value containing "/" is handed to _parse_date_range() and is skipped when
+        that returns None; any other value is kept as a note.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+            source_record_id: Source record ID passed to _parse_date_range().
+        """
+        coverage_elements = source_record.find_all(
+            "dim:field", element="coverage", qualifier="temporal", string=True
+        )
+        for coverage_element in coverage_elements:
+            coverage_value = str(coverage_element.string)
+            date_object = (
+                cls._parse_date_range(coverage_value, source_record_id)
+                if "/" in coverage_value
+                else timdex.Date(note=coverage_value, kind="coverage")
+            )
+            if date_object:
+                yield date_object
+
+    @classmethod
+    def _parse_date_range(
+        cls, coverage_value: str, source_record_id: str
+    ) -> timdex.Date | None:
+        """
+        Parse a date range value and return a Date object if it is validated.
+
+        The value is split on its first "/" into a start (gte) and an end (lte) date.
+
+        Args:
+            coverage_value: Date range string containing a "/" separator.
+            source_record_id: Source record ID passed to validate_date_range().
+
+        Returns:
+            A Date of kind "coverage" holding the range, or None if
+            validate_date_range() does not accept the dates.
+
+        Raises:
+            ValueError: If coverage_value does not contain "/".
+        """
+        # maxsplit=1 keeps any later "/" in the end date, matching a split at the
+        # first separator.
+        gte_date, lte_date = coverage_value.split("/", 1)
+        if not validate_date_range(gte_date, lte_date, source_record_id):
+            return None
+        return timdex.Date(
+            range=timdex.DateRange(gte=gte_date, lte=lte_date),
+            kind="coverage",
+        )
+
+ @classmethod
+ def get_content_types(cls, source_record: Tag) -> list[str] | None:
"""
Retrieve content types from a DSpace DIM XML record.
@@ -268,39 +315,43 @@ def get_content_types(cls, xml: Tag) -> list[str] | None:
differently.
Args:
- xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
+ source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+ record.
"""
return [
- t.string for t in xml.find_all("dim:field", element="type", string=True)
+ t.string
+ for t in source_record.find_all("dim:field", element="type", string=True)
] or None
@classmethod
- def get_main_titles(cls, xml: Tag) -> list[str]:
+ def get_main_titles(cls, source_record: Tag) -> list[str]:
"""
Retrieve main title(s) from a DSpace DIM XML record.
Overrides metaclass get_main_titles() method.
Args:
- xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
+ source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+ record.
"""
return [
t.string
- for t in xml.find_all("dim:field", element="title", string=True)
+ for t in source_record.find_all("dim:field", element="title", string=True)
if "qualifier" not in t.attrs
]
@classmethod
- def get_source_record_id(cls, xml: Tag) -> str:
+ def get_source_record_id(cls, source_record: Tag) -> str:
"""
Get the source record ID from a DSpace DIM XML record.
Overrides metaclass get_source_record_id() method.
Args:
- xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
+ source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+ record.
"""
- return xml.header.identifier.string.split(":")[2]
+ return source_record.header.identifier.string.split(":")[2]
@classmethod
def valid_content_types(cls, _content_type_list: list[str]) -> bool: