Skip to content

Commit

Permalink
Update Location class to use WKT strings
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
* OpenSearch can parse the WKT strings that are found in MITAardvark records so the values don't require the previously expected parsing.

How this addresses that need:
* Rename Location.geodata attribute to geoshape as well as update corresponding unit tests and fixtures
* Refactor get_locations method to store WKT strings as well as update corresponding unit tests and fixtures
* Remove parse_geodata_string from helpers.py and corresponding unit tests
* Add default kind value to get_identifiers method

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-116
  • Loading branch information
ehanson8 committed Jan 11, 2024
1 parent 9e8a516 commit 9dde779
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 72 deletions.
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def timdex_record_all_fields_and_subfields():
timdex.Location(
value="A point on the globe",
kind="Data was gathered here",
geodata=[-77.025955, 38.942142],
geoshape="BBOX(-77.025955, 38.942142)",
)
],
notes=[timdex.Note(value=["This book is awesome"], kind="opinion")],
Expand Down
29 changes: 24 additions & 5 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,11 +182,30 @@ def test_aardvark_get_links_logs_warning_for_invalid_json(caplog):
)


def test_aardvark_get_locations_success(caplog, aardvark_record_all_fields):
caplog.set_level("DEBUG")
assert "Geometry field 'dcat_bbox' found, but currently not mapped."
assert "Geometry field 'locn_geometry' found, but currently not mapped."
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == []
def test_aardvark_get_locations_success(aardvark_record_all_fields):
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == [
timdex.Location(
kind="Bounding Box", geoshape="BBOX(-111.1, -104.0, 45.0, 40.9)"
),
timdex.Location(kind="Geometry", geoshape="BBOX(-111.1, -104.0, 45.0, 40.9)"),
]


def test_parse_get_locations_string_invalid_geostring_logs_warning(
aardvark_record_all_fields, caplog
):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["dcat_bbox"] = "Invalid"
aardvark_record["locn_geometry"] = "Invalid"
assert MITAardvark.get_locations(aardvark_record, "123") == []
assert (
"Record ID '123': Unable to parse geodata string 'Invalid' in 'dcat_bbox'"
in caplog.text
)
assert (
"Record ID '123': Unable to parse geodata string 'Invalid' in 'locn_geometry'"
in caplog.text
)


def test_aardvark_get_notes_success(aardvark_record_all_fields):
Expand Down
20 changes: 0 additions & 20 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from datetime import datetime

import pytest

import transmogrifier.models as timdex
from transmogrifier.helpers import (
generate_citation,
parse_date_from_string,
parse_geodata_string,
validate_date,
validate_date_range,
)
Expand Down Expand Up @@ -259,23 +256,6 @@ def test_parse_date_from_string_invalid_date_returns_none():
assert parse_date_from_string("circa 1930s") is None


def test_parse_geodata_string_success():
assert parse_geodata_string("ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "123") == [
-111.1,
-104.0,
45.0,
40.9,
]


def test_parse_geodata_string_invalid_geodata_string_raises_error():
with pytest.raises(
ValueError,
match="Record ID '123': Unable to parse geodata string 'Invalid'",
):
parse_geodata_string("Invalid", "123")


def test_validate_date_success():
assert validate_date("1930", "1234") is True

Expand Down
10 changes: 5 additions & 5 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ def test_timdex_record_all_fields_and_subfields(timdex_record_all_fields_and_sub
timdex_record_all_fields_and_subfields.locations[0].kind
== "Data was gathered here"
)
assert timdex_record_all_fields_and_subfields.locations[0].geodata == [
-77.025955,
38.942142,
]
assert (
timdex_record_all_fields_and_subfields.locations[0].geoshape
== "BBOX(-77.025955, 38.942142)"
)
assert timdex_record_all_fields_and_subfields.notes[0].value == [
"This book is awesome"
]
Expand Down Expand Up @@ -305,7 +305,7 @@ def test_record_asdict_includes_all_fields(timdex_record_all_fields_and_subfield
"literary_form": "nonfiction",
"locations": [
{
"geodata": [-77.025955, 38.942142],
"geoshape": "BBOX(-77.025955, 38.942142)",
"kind": "Data was gathered here",
"value": "A point on the globe",
}
Expand Down
25 changes: 0 additions & 25 deletions transmogrifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,31 +75,6 @@ def parse_date_from_string(
return None


def parse_geodata_string(geodata_string: str, source_record_id: str) -> list[float]:
"""Get list of values from a formatted geodata string.
Example:
- "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"
Args:
geodata_string: Formatted geodata string to parse.
source_record_id: The ID of the record containing the string to parse.
"""
geodata_points = []
try:
raw_geodata_points = geodata_string.split("(")[-1].split(")")[0].split(",")
stripped_geodata_points = map(str.strip, raw_geodata_points)
geodata_floats = list(map(float, stripped_geodata_points))
geodata_points.extend(geodata_floats)
except ValueError:
message = (
f"Record ID '{source_record_id}': "
f"Unable to parse geodata string '{geodata_string}'"
)
raise ValueError(message)
return geodata_points


def validate_date(
date_string: str,
source_record_id: str,
Expand Down
4 changes: 1 addition & 3 deletions transmogrifier/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,7 @@ class Link:
class Location:
value: Optional[str] = field(default=None, validator=optional(instance_of(str)))
kind: Optional[str] = field(default=None, validator=optional(instance_of(str)))
geodata: Optional[list[float]] = field(
default=None, validator=optional(list_of(float))
)
geoshape: Optional[str] = field(default=None, validator=optional(instance_of(str)))


@define
Expand Down
29 changes: 16 additions & 13 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,13 +294,8 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
def get_locations(
source_record: dict, source_record_id: str
) -> list[timdex.Location]:
"""Get values from source record for TIMDEX locations field.
WIP: Currently in the process of determining our approach for storing geographic
geometry data in the TIMDEX record and how this dovetails with the OpenSearch
mapping. At this time, this method returns an empty list of Locations.
"""
locations: list[timdex.Location] = []
"""Get values from source record for TIMDEX locations field."""
locations = []

aardvark_location_fields = {
"dcat_bbox": "Bounding Box",
Expand All @@ -309,14 +304,22 @@ def get_locations(
for aardvark_location_field, kind_value in aardvark_location_fields.items():
if aardvark_location_field not in source_record:
continue
try:
if (
geodata_string := source_record[aardvark_location_field]
) and "ENVELOPE" in source_record[aardvark_location_field]:
locations.append(
timdex.Location(
geoshape=geodata_string.replace("ENVELOPE", "BBOX"),
kind=kind_value,
)
)
else:
message = (
f"Geometry field '{aardvark_location_field}' found, but "
f"currently not mapped."
f"Record ID '{source_record_id}': "
f"Unable to parse geodata string '{geodata_string}' "
f"in '{aardvark_location_field}'"
)
logger.debug(message)
except ValueError as exception:
logger.warning(exception)
logger.warning(message)
return locations

@staticmethod
Expand Down

0 comments on commit 9dde779

Please sign in to comment.