Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate date strings when transforming MITAardvark records #124

Merged
merged 5 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "ogm:456", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 3", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": true, "id": "ogm:789", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "{\"http://schema.org/url\": \"https://geodata.libraries.mit.edu/record/abc:123\"}", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "{\"http://schema.org/url\": \"https://geodata.libraries.mit.edu/record/abc:123\"}", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "ogm:456", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "{\"http://schema.org/url\": \"https://geodata.libraries.mit.edu/record/abc:123\"}", "dct_title_s": "Test title 3", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": true, "id": "ogm:789", "locn_geometry": ""}
85 changes: 82 additions & 3 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from pathlib import Path

import pytest
Expand Down Expand Up @@ -32,7 +33,7 @@ def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert transformer.get_required_fields(next(aardvark_records)) == {
"source": "A Cool Repository",
"source_link": "https://example.com/gismit:123",
"source_link": "https://geodata.libraries.mit.edu/record/abc:123",
"timdex_record_id": "cool-repo:123",
"title": "Test title 1",
}
Expand All @@ -42,12 +43,21 @@ def test_aardvark_transform_returns_timdex_record(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert next(transformer) == timdex.TimdexRecord(
source="A Cool Repository",
source_link="https://example.com/gismit:123",
source_link="https://geodata.libraries.mit.edu/record/abc:123",
timdex_record_id="cool-repo:123",
title="Test title 1",
citation="Test title 1. Geospatial data. https://example.com/gismit:123",
citation="Test title 1. Geospatial data. "
"https://geodata.libraries.mit.edu/record/abc:123",
content_type=["Geospatial data"],
rights=[timdex.Rights(description="Access rights", kind="Access")],
links=[
timdex.Link(
url="https://geodata.libraries.mit.edu/record/abc:123",
kind="Website",
restrictions=None,
text="Website",
)
],
)


Expand Down Expand Up @@ -137,6 +147,39 @@ def test_aardvark_get_dates_success(aardvark_record_all_fields):
]


def test_aardvark_get_dates_drops_dates_with_invalid_strings(
    caplog, aardvark_record_all_fields
):
    """Invalid date strings are dropped; unparseable ranges are logged and dropped."""
    caplog.set_level("DEBUG")
    record = next(aardvark_record_all_fields)

    # Seed the record with a mix of valid and invalid date data.
    record["dct_issued_s"] = "1933?"  # invalid -> dropped
    record["dct_temporal_sm"] = [
        "2000-01-01",
        "1999",
        "approximately 1569",  # invalid -> dropped
        "absolute junky date",  # invalid -> dropped
    ]
    record["gbl_dateRange_drsim"] = [
        "[1943 TO 1946]",
        "[apples TO oranges]",  # unparseable range -> logged and dropped
    ]

    # Expected output: only the valid single-value dates plus the parsed range.
    expected_values = ["2000-01-01", "1999", "1943", "1944", "1945", "1946"]
    expected = [
        timdex.Date(kind="Coverage", note=None, range=None, value=value)
        for value in expected_values
    ]
    expected.append(
        timdex.Date(
            kind="Coverage",
            note=None,
            range=timdex.DateRange(gt=None, gte="1943", lt=None, lte="1946"),
            value=None,
        )
    )

    assert MITAardvark.get_dates(record, "123") == expected
    assert "Unable to parse date range string" in caplog.text


def test_aardvark_parse_solr_date_range_string_success():
assert MITAardvark.parse_solr_date_range_string("[1932 TO 1937]", "123") == (
"1932",
Expand Down Expand Up @@ -277,3 +320,39 @@ def test_aardvark_get_subjects_success(aardvark_record_all_fields):
timdex.Subject(value=["Dataset"], kind="Subject scheme not provided"),
timdex.Subject(value=["Vector data"], kind="Subject scheme not provided"),
]


def test_aardvark_record_get_source_link_success(
    aardvark_record_all_fields,
):
    """get_source_link returns the URL embedded in dct_references_s."""
    expected_url = "https://geodata.libraries.mit.edu/record/abc:123"
    record = next(aardvark_record_all_fields)
    record["dct_references_s"] = json.dumps({"http://schema.org/url": expected_url})

    source_link = MITAardvark.get_source_link("None", "abc:123", record)

    assert source_link == expected_url


def test_aardvark_record_get_source_link_bad_dct_references_s_raises_error(
    aardvark_record_all_fields,
):
    """A dct_references_s payload lacking a Website link raises ValueError."""
    record = next(aardvark_record_all_fields)
    record["dct_references_s"] = json.dumps(
        {"missing data": "from aardvark from geoharvester"}
    )

    expected_message = (
        "Could not locate a kind=Website link to pull the source link from."
    )
    with pytest.raises(ValueError, match=expected_message):
        MITAardvark.get_source_link("None", "abc:123", record)
6 changes: 3 additions & 3 deletions transmogrifier/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,13 @@
},
"gismit": {
"name": "MIT GIS Resources",
"base-url": "https://search.libraries.mit.edu/record/",
"base-url": "None",
"transform-class": "transmogrifier.sources.json.aardvark.MITAardvark",
},
"gisogm": {
"name": "OpenGeoMetadata GIS Resources",
"base-url": "https://search.libraries.mit.edu/record/",
"transform-class": "transmogrifier.sources.json.aardvark.OGMAardvark",
"base-url": "None",
"transform-class": "transmogrifier.sources.json.aardvark.MITAardvark",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, OGMAardvark was referenced here before being defined which we were expecting to need because of the get_source_link method including gismit, are we thinking differently about that approach now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Really glad you raised this up, for a couple of reasons.

First of all, looking more closely at the get_source_link which I had not thought to check, I believe that we can update that for MITAardvark transformations to use data from dct_references_s. In this way, it also helps explain why MIT and OGM records can share the same transformer.

Every record that comes out of GeoHarvester is an "MIT" Aardvark record in the sense that, regardless of origin institution or metadata format, we have crafted the Aardvark file in a way that meets our TIMDEX needs. During that work in GeoHarvester, quite a bit of care is taken to craft the dct_references_s field which contains URLs.

The value for dct_references_s['http://schema.org/url'] is what the "source link" for the record should be. For MIT records this will be https://geodata.libraries.mit.edu/record/<IDENTIFIER>, and for OGM records it will be an external URL that we extracted from the source metadata; it is guaranteed to be present, or the record does not get included in the harvester output.

Taking all this together, will work on another commit that:

  1. updates get_source_link to actually read data from the record
  2. will remove base_url from the gismit and gisogm configurations, as it's not needed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ehanson8 - just pushed this commit and have re-requested review.

},
"researchdatabases": {
"name": "Research Databases",
Expand Down
49 changes: 38 additions & 11 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re

import transmogrifier.models as timdex
from transmogrifier.helpers import validate_date
from transmogrifier.sources.transformer import JSON, JSONTransformer

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -31,25 +32,33 @@ def get_main_titles(cls, source_record: dict) -> list[str]:

@classmethod
def get_source_link(
cls, source_base_url: str, source_record_id: str, source_record: dict[str, JSON]
cls,
_source_base_url: str,
source_record_id: str,
source_record: dict[str, JSON],
) -> str:
"""
Class method to set the source link for the item.

May be overridden by source subclasses if needed.

Default behavior is to concatenate the source base URL + source record id.
Unlike other Transmogrifier sources that dynamically build a source link,
MITAardvark files are expected to have a fully formed and appropriate source link
in the metadata already. This method relies on that data.

Args:
source_base_url: Source base URL.
_source_base_url: Source base URL. Not used for MITAardvark transforms.
source_record_id: Record identifier for the source record.
source_record: A BeautifulSoup Tag representing a single XML record.
- not used by default implementation, but could be useful for subclass
overrides
"""
return source_base_url + cls.get_timdex_record_id(
"gismit", source_record_id, source_record
)
links = cls.get_links(source_record, source_record_id)
url_links = [link for link in links if link.kind == "Website"]
if len(url_links) == 1:
return url_links[0].url
message = "Could not locate a kind=Website link to pull the source link from."
raise ValueError(message)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perfect, much better than what we were doing before


@classmethod
def get_timdex_record_id(
Expand Down Expand Up @@ -183,12 +192,25 @@ def get_contributors(source_record: dict) -> list[timdex.Contributor]:

@classmethod
def get_dates(cls, source_record: dict, source_record_id: str) -> list[timdex.Date]:
"""Get values from source record for TIMDEX dates field."""
return (
"""Get values from source record for TIMDEX dates field.

This method aggregates dates from a variety of Aardvark fields. Once aggregated,
the results are filtered to allow only well formed DateRanges or validated date
strings.
"""
dates = (
cls._issued_dates(source_record)
+ cls._coverage_dates(source_record)
+ cls._range_dates(source_record, source_record_id)
)
return [
date
for date in dates
# skip value validation for DateRange type dates
if isinstance(date.range, timdex.DateRange)
# validate date string if not None
or (date.value is not None and validate_date(date.value, source_record_id))
]

@classmethod
def _issued_dates(cls, source_record: dict) -> list[timdex.Date]:
Expand Down Expand Up @@ -228,9 +250,13 @@ def _range_dates(
"""Get values for issued dates."""
range_dates = []
for date_range_string in source_record.get("gbl_dateRange_drsim", []):
date_range_values = cls.parse_solr_date_range_string(
date_range_string, source_record_id
)
try:
date_range_values = cls.parse_solr_date_range_string(
date_range_string, source_record_id
)
except ValueError as exc:
logger.warning(exc)
continue
range_dates.append(
timdex.Date(
kind="Coverage",
Expand Down Expand Up @@ -292,6 +318,7 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
url=link.get("url"), kind="Download", text=link.get("label")
)
for link in links_object.get("http://schema.org/downloadUrl", [])
if isinstance(link.get("url", {}), str)
]
)
if schema_url := links_object.get("http://schema.org/url"):
Expand Down
Loading