diff --git a/tests/fixtures/aardvark_records.jsonl b/tests/fixtures/aardvark_records.jsonl index 7b54bb8..bbe33ee 100644 --- a/tests/fixtures/aardvark_records.jsonl +++ b/tests/fixtures/aardvark_records.jsonl @@ -1,3 +1,3 @@ -{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "mit:123", "locn_geometry": ""} -{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "ogm:456", "locn_geometry": ""} -{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 3", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": true, "id": "ogm:789", "locn_geometry": ""} +{"dct_accessRights_s": "Access rights", "dct_references_s": "{\"http://schema.org/url\": \"https://geodata.libraries.mit.edu/record/abc:123\"}", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "mit:123", "locn_geometry": ""} +{"dct_accessRights_s": "Access rights", "dct_references_s": "{\"http://schema.org/url\": \"https://geodata.libraries.mit.edu/record/abc:123\"}", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "ogm:456", "locn_geometry": ""} +{"dct_accessRights_s": "Access rights", "dct_references_s": "{\"http://schema.org/url\": \"https://geodata.libraries.mit.edu/record/abc:123\"}", "dct_title_s": "Test title 3", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": true, "id": "ogm:789", "locn_geometry": ""} \ No newline at end of file diff --git a/tests/sources/json/test_aardvark.py b/tests/sources/json/test_aardvark.py index 424cc5f..e4ab4fd 100644 --- a/tests/sources/json/test_aardvark.py +++ b/tests/sources/json/test_aardvark.py @@ -1,3 +1,4 @@ +import json from pathlib import Path import pytest @@ -32,7 +33,7 @@ def test_aardvark_get_required_fields_returns_expected_values(aardvark_records): transformer = MITAardvark("cool-repo", aardvark_records) assert transformer.get_required_fields(next(aardvark_records)) == { "source": "A Cool Repository", - "source_link": "https://example.com/gismit:123", + "source_link": "https://geodata.libraries.mit.edu/record/abc:123", "timdex_record_id": "cool-repo:123", "title": "Test title 1", } @@ -42,12 +43,21 @@ def test_aardvark_transform_returns_timdex_record(aardvark_records): transformer = MITAardvark("cool-repo", aardvark_records) assert next(transformer) == timdex.TimdexRecord( source="A Cool Repository", - source_link="https://example.com/gismit:123", + source_link="https://geodata.libraries.mit.edu/record/abc:123", timdex_record_id="cool-repo:123", title="Test title 1", - citation="Test title 1. Geospatial data. https://example.com/gismit:123", + citation="Test title 1. Geospatial data. " + "https://geodata.libraries.mit.edu/record/abc:123", content_type=["Geospatial data"], rights=[timdex.Rights(description="Access rights", kind="Access")], + links=[ + timdex.Link( + url="https://geodata.libraries.mit.edu/record/abc:123", + kind="Website", + restrictions=None, + text="Website", + ) + ], ) @@ -137,6 +147,39 @@ def test_aardvark_get_dates_success(aardvark_record_all_fields): ] +def test_aardvark_get_dates_drops_dates_with_invalid_strings( + caplog, aardvark_record_all_fields +): + caplog.set_level("DEBUG") + record = next(aardvark_record_all_fields) + record["dct_issued_s"] = "1933?" # dropped + record["dct_temporal_sm"] = [ + "2000-01-01", + "1999", + "approximately 1569", # dropped + "absolute junky date", # dropped + ] + record["gbl_dateRange_drsim"] = [ + "[1943 TO 1946]", + "[apples TO oranges]", # logged and dropped + ] + assert MITAardvark.get_dates(record, "123") == [ + timdex.Date(kind="Coverage", note=None, range=None, value="2000-01-01"), + timdex.Date(kind="Coverage", note=None, range=None, value="1999"), + timdex.Date(kind="Coverage", note=None, range=None, value="1943"), + timdex.Date(kind="Coverage", note=None, range=None, value="1944"), + timdex.Date(kind="Coverage", note=None, range=None, value="1945"), + timdex.Date(kind="Coverage", note=None, range=None, value="1946"), + timdex.Date( + kind="Coverage", + note=None, + range=timdex.DateRange(gt=None, gte="1943", lt=None, lte="1946"), + value=None, + ), + ] + assert "Unable to parse date range string" in caplog.text + + def test_aardvark_parse_solr_date_range_string_success(): assert MITAardvark.parse_solr_date_range_string("[1932 TO 1937]", "123") == ( "1932", @@ -277,3 +320,39 @@ def test_aardvark_get_subjects_success(aardvark_record_all_fields): timdex.Subject(value=["Dataset"], kind="Subject scheme not provided"), timdex.Subject(value=["Vector data"], kind="Subject scheme not provided"), ] + + +def test_aardvark_record_get_source_link_success( + aardvark_record_all_fields, +): + record = next(aardvark_record_all_fields) + url_from_aardvark_record = "https://geodata.libraries.mit.edu/record/abc:123" + record["dct_references_s"] = json.dumps( + {"http://schema.org/url": url_from_aardvark_record} + ) + assert ( + MITAardvark.get_source_link( + "None", + "abc:123", + record, + ) + == url_from_aardvark_record + ) + + +def test_aardvark_record_get_source_link_bad_dct_references_s_raises_error( + aardvark_record_all_fields, +): + record = next(aardvark_record_all_fields) + record["dct_references_s"] = json.dumps( + {"missing data": "from aardvark from geoharvester"} + ) + with pytest.raises( + ValueError, + match="Could not locate a kind=Website link to pull the source link from.", + ): + MITAardvark.get_source_link( + "None", + "abc:123", + record, + ) diff --git a/transmogrifier/config.py b/transmogrifier/config.py index 33f2f93..deeab5a 100644 --- a/transmogrifier/config.py +++ b/transmogrifier/config.py @@ -106,13 +106,13 @@ }, "gismit": { "name": "MIT GIS Resources", - "base-url": "https://search.libraries.mit.edu/record/", + "base-url": "None", "transform-class": "transmogrifier.sources.json.aardvark.MITAardvark", }, "gisogm": { "name": "OpenGeoMetadata GIS Resources", - "base-url": "https://search.libraries.mit.edu/record/", - "transform-class": "transmogrifier.sources.json.aardvark.OGMAardvark", + "base-url": "None", + "transform-class": "transmogrifier.sources.json.aardvark.MITAardvark", }, "researchdatabases": { "name": "Research Databases", diff --git a/transmogrifier/sources/json/aardvark.py b/transmogrifier/sources/json/aardvark.py index 149f679..55d512e 100644 --- a/transmogrifier/sources/json/aardvark.py +++ b/transmogrifier/sources/json/aardvark.py @@ -3,6 +3,7 @@ import re import transmogrifier.models as timdex +from transmogrifier.helpers import validate_date from transmogrifier.sources.transformer import JSON, JSONTransformer logger = logging.getLogger(__name__) @@ -31,25 +32,33 @@ def get_main_titles(cls, source_record: dict) -> list[str]: @classmethod def get_source_link( - cls, source_base_url: str, source_record_id: str, source_record: dict[str, JSON] + cls, + _source_base_url: str, + source_record_id: str, + source_record: dict[str, JSON], ) -> str: """ Class method to set the source link for the item. May be overridden by source subclasses if needed. - Default behavior is to concatenate the source base URL + source record id. + Unlike other Transmogrifier sources that dynamically build a source link, + MITAardvark files are expected to have a fully formed and appropriate source link + in the metadata already. This method relies on that data. Args: - source_base_url: Source base URL. + _source_base_url: Source base URL. Not used for MITAardvark transforms. source_record_id: Record identifier for the source record. source_record: A BeautifulSoup Tag representing a single XML record. - not used by default implementation, but could be useful for subclass overrides """ - return source_base_url + cls.get_timdex_record_id( - "gismit", source_record_id, source_record - ) + links = cls.get_links(source_record, source_record_id) + url_links = [link for link in links if link.kind == "Website"] + if len(url_links) == 1: + return url_links[0].url + message = "Could not locate a kind=Website link to pull the source link from." + raise ValueError(message) @classmethod def get_timdex_record_id( @@ -183,12 +192,25 @@ def get_contributors(source_record: dict) -> list[timdex.Contributor]: @classmethod def get_dates(cls, source_record: dict, source_record_id: str) -> list[timdex.Date]: - """Get values from source record for TIMDEX dates field.""" - return ( + """Get values from source record for TIMDEX dates field. + + This method aggregates dates from a variety of Aardvark fields. Once aggregated, + the results are filtered to allow only well formed DateRanges or validated date + strings. + """ + dates = ( cls._issued_dates(source_record) + cls._coverage_dates(source_record) + cls._range_dates(source_record, source_record_id) ) + return [ + date + for date in dates + # skip value validation for DateRange type dates + if isinstance(date.range, timdex.DateRange) + # validate date string if not None + or (date.value is not None and validate_date(date.value, source_record_id)) + ] @classmethod def _issued_dates(cls, source_record: dict) -> list[timdex.Date]: @@ -228,9 +250,13 @@ def _range_dates( """Get values for issued dates.""" range_dates = [] for date_range_string in source_record.get("gbl_dateRange_drsim", []): - date_range_values = cls.parse_solr_date_range_string( - date_range_string, source_record_id - ) + try: + date_range_values = cls.parse_solr_date_range_string( + date_range_string, source_record_id + ) + except ValueError as exc: + logger.warning(exc) + continue range_dates.append( timdex.Date( kind="Coverage", @@ -292,6 +318,7 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]: url=link.get("url"), kind="Download", text=link.get("label") ) for link in links_object.get("http://schema.org/downloadUrl", []) + if isinstance(link.get("url", {}), str) ] ) if schema_url := links_object.get("http://schema.org/url"):