Skip to content

Commit

Permalink
Merge pull request #124 from MITLibraries/GDT-109-mitaardvark-adjustm…
Browse files Browse the repository at this point in the history
…ents

Validate date strings when transforming MITAardvark records
  • Loading branch information
ghukill authored Feb 14, 2024
2 parents 91980e8 + aa84d19 commit 06de472
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 20 deletions.
6 changes: 3 additions & 3 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "ogm:456", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 3", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": true, "id": "ogm:789", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "{\"http://schema.org/url\": \"https://geodata.libraries.mit.edu/record/abc:123\"}", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "{\"http://schema.org/url\": \"https://geodata.libraries.mit.edu/record/abc:123\"}", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "ogm:456", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "{\"http://schema.org/url\": \"https://geodata.libraries.mit.edu/record/abc:123\"}", "dct_title_s": "Test title 3", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": true, "id": "ogm:789", "locn_geometry": ""}
85 changes: 82 additions & 3 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from pathlib import Path

import pytest
Expand Down Expand Up @@ -32,7 +33,7 @@ def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert transformer.get_required_fields(next(aardvark_records)) == {
"source": "A Cool Repository",
"source_link": "https://example.com/gismit:123",
"source_link": "https://geodata.libraries.mit.edu/record/abc:123",
"timdex_record_id": "cool-repo:123",
"title": "Test title 1",
}
Expand All @@ -42,12 +43,21 @@ def test_aardvark_transform_returns_timdex_record(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert next(transformer) == timdex.TimdexRecord(
source="A Cool Repository",
source_link="https://example.com/gismit:123",
source_link="https://geodata.libraries.mit.edu/record/abc:123",
timdex_record_id="cool-repo:123",
title="Test title 1",
citation="Test title 1. Geospatial data. https://example.com/gismit:123",
citation="Test title 1. Geospatial data. "
"https://geodata.libraries.mit.edu/record/abc:123",
content_type=["Geospatial data"],
rights=[timdex.Rights(description="Access rights", kind="Access")],
links=[
timdex.Link(
url="https://geodata.libraries.mit.edu/record/abc:123",
kind="Website",
restrictions=None,
text="Website",
)
],
)


Expand Down Expand Up @@ -137,6 +147,39 @@ def test_aardvark_get_dates_success(aardvark_record_all_fields):
]


def test_aardvark_get_dates_drops_dates_with_invalid_strings(
caplog, aardvark_record_all_fields
):
caplog.set_level("DEBUG")
record = next(aardvark_record_all_fields)
record["dct_issued_s"] = "1933?" # dropped
record["dct_temporal_sm"] = [
"2000-01-01",
"1999",
"approximately 1569", # dropped
"absolute junky date", # dropped
]
record["gbl_dateRange_drsim"] = [
"[1943 TO 1946]",
"[apples TO oranges]", # logged and dropped
]
assert MITAardvark.get_dates(record, "123") == [
timdex.Date(kind="Coverage", note=None, range=None, value="2000-01-01"),
timdex.Date(kind="Coverage", note=None, range=None, value="1999"),
timdex.Date(kind="Coverage", note=None, range=None, value="1943"),
timdex.Date(kind="Coverage", note=None, range=None, value="1944"),
timdex.Date(kind="Coverage", note=None, range=None, value="1945"),
timdex.Date(kind="Coverage", note=None, range=None, value="1946"),
timdex.Date(
kind="Coverage",
note=None,
range=timdex.DateRange(gt=None, gte="1943", lt=None, lte="1946"),
value=None,
),
]
assert "Unable to parse date range string" in caplog.text


def test_aardvark_parse_solr_date_range_string_success():
assert MITAardvark.parse_solr_date_range_string("[1932 TO 1937]", "123") == (
"1932",
Expand Down Expand Up @@ -277,3 +320,39 @@ def test_aardvark_get_subjects_success(aardvark_record_all_fields):
timdex.Subject(value=["Dataset"], kind="Subject scheme not provided"),
timdex.Subject(value=["Vector data"], kind="Subject scheme not provided"),
]


def test_aardvark_record_get_source_link_success(
aardvark_record_all_fields,
):
record = next(aardvark_record_all_fields)
url_from_aardvark_record = "https://geodata.libraries.mit.edu/record/abc:123"
record["dct_references_s"] = json.dumps(
{"http://schema.org/url": url_from_aardvark_record}
)
assert (
MITAardvark.get_source_link(
"None",
"abc:123",
record,
)
== url_from_aardvark_record
)


def test_aardvark_record_get_source_link_bad_dct_references_s_raises_error(
aardvark_record_all_fields,
):
record = next(aardvark_record_all_fields)
record["dct_references_s"] = json.dumps(
{"missing data": "from aardvark from geoharvester"}
)
with pytest.raises(
ValueError,
match="Could not locate a kind=Website link to pull the source link from.",
):
MITAardvark.get_source_link(
"None",
"abc:123",
record,
)
6 changes: 3 additions & 3 deletions transmogrifier/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,13 @@
},
"gismit": {
"name": "MIT GIS Resources",
"base-url": "https://search.libraries.mit.edu/record/",
"base-url": "None",
"transform-class": "transmogrifier.sources.json.aardvark.MITAardvark",
},
"gisogm": {
"name": "OpenGeoMetadata GIS Resources",
"base-url": "https://search.libraries.mit.edu/record/",
"transform-class": "transmogrifier.sources.json.aardvark.OGMAardvark",
"base-url": "None",
"transform-class": "transmogrifier.sources.json.aardvark.MITAardvark",
},
"researchdatabases": {
"name": "Research Databases",
Expand Down
49 changes: 38 additions & 11 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re

import transmogrifier.models as timdex
from transmogrifier.helpers import validate_date
from transmogrifier.sources.transformer import JSON, JSONTransformer

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -31,25 +32,33 @@ def get_main_titles(cls, source_record: dict) -> list[str]:

@classmethod
def get_source_link(
cls, source_base_url: str, source_record_id: str, source_record: dict[str, JSON]
cls,
_source_base_url: str,
source_record_id: str,
source_record: dict[str, JSON],
) -> str:
"""
Class method to set the source link for the item.
May be overridden by source subclasses if needed.
Default behavior is to concatenate the source base URL + source record id.
Unlike other Transmogrifier sources that dynamically build a source link,
MITAardvark files are expected to have a fully formed and appropriate source link
in the metadata already. This method relies on that data.
Args:
source_base_url: Source base URL.
_source_base_url: Source base URL. Not used for MITAardvark transforms.
source_record_id: Record identifier for the source record.
source_record: A BeautifulSoup Tag representing a single XML record.
- not used by default implementation, but could be useful for subclass
overrides
"""
return source_base_url + cls.get_timdex_record_id(
"gismit", source_record_id, source_record
)
links = cls.get_links(source_record, source_record_id)
url_links = [link for link in links if link.kind == "Website"]
if len(url_links) == 1:
return url_links[0].url
message = "Could not locate a kind=Website link to pull the source link from."
raise ValueError(message)

@classmethod
def get_timdex_record_id(
Expand Down Expand Up @@ -183,12 +192,25 @@ def get_contributors(source_record: dict) -> list[timdex.Contributor]:

@classmethod
def get_dates(cls, source_record: dict, source_record_id: str) -> list[timdex.Date]:
"""Get values from source record for TIMDEX dates field."""
return (
"""Get values from source record for TIMDEX dates field.
This method aggregates dates from a variety of Aardvark fields. Once aggregated,
the results are filtered to allow only well formed DateRanges or validated date
strings.
"""
dates = (
cls._issued_dates(source_record)
+ cls._coverage_dates(source_record)
+ cls._range_dates(source_record, source_record_id)
)
return [
date
for date in dates
# skip value validation for DateRange type dates
if isinstance(date.range, timdex.DateRange)
# validate date string if not None
or (date.value is not None and validate_date(date.value, source_record_id))
]

@classmethod
def _issued_dates(cls, source_record: dict) -> list[timdex.Date]:
Expand Down Expand Up @@ -228,9 +250,13 @@ def _range_dates(
"""Get values for issued dates."""
range_dates = []
for date_range_string in source_record.get("gbl_dateRange_drsim", []):
date_range_values = cls.parse_solr_date_range_string(
date_range_string, source_record_id
)
try:
date_range_values = cls.parse_solr_date_range_string(
date_range_string, source_record_id
)
except ValueError as exc:
logger.warning(exc)
continue
range_dates.append(
timdex.Date(
kind="Coverage",
Expand Down Expand Up @@ -292,6 +318,7 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
url=link.get("url"), kind="Download", text=link.get("label")
)
for link in links_object.get("http://schema.org/downloadUrl", [])
if isinstance(link.get("url", {}), str)
]
)
if schema_url := links_object.get("http://schema.org/url"):
Expand Down

0 comments on commit 06de472

Please sign in to comment.