Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gdt 54 aardvark transform #108

Merged
merged 5 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def datacite_record_all_fields():


@pytest.fixture()
def json_records():
return JsonTransformer.parse_source_file("tests/fixtures/json_records.jsonl")
def aardvark_records():
return JsonTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl")


@pytest.fixture()
Expand Down
2 changes: 2 additions & 0 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "123", "dct_title_s": "Test title 1"}
{"id": "456", "dct_title_s": "Test title 2"}
2 changes: 0 additions & 2 deletions tests/fixtures/json_records.jsonl

This file was deleted.

24 changes: 12 additions & 12 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,39 @@
import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import Aardvark
from transmogrifier.sources.json.aardvark import MITAardvark


def test_aardvark_get_required_fields_returns_expected_values(json_records):
transformer = Aardvark("cool-repo", json_records)
assert transformer.get_required_fields(next(json_records)) == {
def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert transformer.get_required_fields(next(aardvark_records)) == {
"source": "A Cool Repository",
"source_link": "https://example.com/123",
"timdex_record_id": "cool-repo:123",
"title": "Title not provided",
"title": "Test title 1",
}


def test_jsontransformer_transform_returns_timdex_record(json_records):
transformer = Aardvark("cool-repo", json_records)
def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert next(transformer) == timdex.TimdexRecord(
source="A Cool Repository",
source_link="https://example.com/123",
timdex_record_id="cool-repo:123",
title="Title not provided",
citation="Title not provided. Geospatial data. https://example.com/123",
title="Test title 1",
citation="Test title 1. Geospatial data. https://example.com/123",
content_type=["Geospatial data"],
)


def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
assert Aardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"]
assert MITAardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"]


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
assert Aardvark.get_source_record_id(aardvark_record_all_fields) == "123"
assert MITAardvark.get_source_record_id(aardvark_record_all_fields) == "123"


def test_aardvark_get_subjects_success(aardvark_record_all_fields):
assert Aardvark.get_subjects(aardvark_record_all_fields) == [
assert MITAardvark.get_subjects(aardvark_record_all_fields) == [
timdex.Subject(value=["Country"], kind="DCAT Keyword"),
timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),
Expand Down
31 changes: 21 additions & 10 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,26 @@
logger = logging.getLogger(__name__)


class Aardvark(JsonTransformer):
"""Aardvark transformer."""
class MITAardvark(JsonTransformer):
"""MITAardvark transformer.

MIT Aardvark records have more required fields than standard Aardvark records
as detailed here in the geo-harvester's MITAardvark class:

https://github.com/MITLibraries/geo-harvester/blob/main/harvester/records/record.py
"""

@classmethod
def get_main_titles(cls, source_record: dict) -> list[str]:
"""
Retrieve main title(s) from a Aardvark JSON record.
Retrieve main title(s) from an MITAardvark JSON record.

Overrides the base class get_main_titles() method.

Args:
source_record: A JSON object representing a source record.
"""
titles = []
if title := "dct_title_s" in source_record and source_record["dct_title_s"]:
titles.append(title)
return titles
return [source_record["dct_title_s"]]

@classmethod
def get_source_record_id(cls, source_record: dict) -> str:
Expand All @@ -39,7 +42,7 @@ def record_is_deleted(cls, source_record: dict) -> bool:
"""
Determine whether record has a status of deleted.

## WIP - defining to enable instantiation of Aardvark instance.
## WIP - defining to enable instantiation of MITAardvark instance.

Args:
source_record: A JSON object representing a source record.
Expand All @@ -58,7 +61,7 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
"""
fields: dict = {}

# alternate_titles field not used in Aardvark
# alternate_titles

# content_type
fields["content_type"] = ["Geospatial data"]
Expand All @@ -76,7 +79,7 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
# identifiers

# languages
fields["languages"] = source_record.get("dct_langauge_sm")
fields["languages"] = source_record.get("dct_language_sm")

# links

Expand All @@ -100,6 +103,14 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
def get_subjects(source_record: dict) -> list[timdex.Subject]:
"""Get values from source record for TIMDEX subjects field.

Unlike other TIMDEX sources, the subject scheme is not known
for each term. The kind here represents the uncontrolled field
in which the term was found.

DCAT Keyword: https://www.w3.org/TR/vocab-dcat-2/#Property:resource_keyword
DCAT Theme: https://www.w3.org/TR/vocab-dcat-2/#Property:resource_theme
Dublin Core Subject: http://purl.org/dc/terms/subject
ehanson8 marked this conversation as resolved.
Show resolved Hide resolved

Args:
source_record: A JSON object representing a source record.
"""
Expand Down
89 changes: 42 additions & 47 deletions transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,10 +224,45 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON] | Tag]:
"""
pass

@final
def _transform(
    self, source_record: dict[str, JSON] | Tag
) -> Optional[TimdexRecord]:
    """
    Transform a source record into a TIMDEX record.

    Private helper holding the shared transformation logic; the public
    transform() methods call it with the source record.

    May not be overridden.

    Args:
        source_record: A single source record.

    Returns:
        A TimdexRecord built from the required and optional fields, or None
        when get_optional_fields() returns None for the record.

    Raises:
        DeletedRecord: If record_is_deleted() reports the record as deleted.
    """
    # Deleted records are signalled with an exception carrying the TIMDEX id.
    if self.record_is_deleted(source_record):
        record_id = self.get_source_record_id(source_record)
        raise DeletedRecord(
            self.get_timdex_record_id(self.source, record_id, source_record)
        )

    optional = self.get_optional_fields(source_record)
    if optional is None:
        return None

    # Optional fields are merged last, so they win on any key collision.
    record_fields = {**self.get_required_fields(source_record), **optional}

    # If citation field was not present, generate citation from other fields
    if record_fields.get("citation") is None:
        record_fields["citation"] = generate_citation(record_fields)
    if record_fields.get("content_type") is None:
        record_fields["content_type"] = ["Not specified"]

    return TimdexRecord(**record_fields)

@abstractmethod
def transform(self, source_record: dict[str, JSON] | Tag) -> Optional[TimdexRecord]:
"""
Transform a source record into a TIMDEX record.
Call Transformer._transform method to transform source record to TIMDEX record.
ehanson8 marked this conversation as resolved.
Show resolved Hide resolved

Must be overridden by format subclasses.

Expand Down Expand Up @@ -349,6 +384,8 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON]]:

May not be overridden.

Validates that records in the file are dicts for proper processing.

Args:
source_file: A file containing source records to be transformed.
"""
Expand All @@ -359,35 +396,14 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON]]:
@final
def transform(self, source_record: dict[str, JSON]) -> Optional[TimdexRecord]:
ehanson8 marked this conversation as resolved.
Show resolved Hide resolved
"""
Transform a JSON record into a TIMDEX record.
Call Transformer._transform method to transform JSON record to TIMDEX record.

May not be overridden.

Args:
source_record: A JSON object representing a source record.
"""
if self.record_is_deleted(source_record):
source_record_id = self.get_source_record_id(source_record)
timdex_record_id = self.get_timdex_record_id(
self.source, source_record_id, source_record
)
raise DeletedRecord(timdex_record_id)
optional_fields = self.get_optional_fields(source_record)
if optional_fields is None:
return None
else:
fields = {
**self.get_required_fields(source_record),
**optional_fields,
}

# If citation field was not present, generate citation from other fields
if fields.get("citation") is None:
fields["citation"] = generate_citation(fields)
if fields.get("content_type") is None:
fields["content_type"] = ["Not specified"]

return TimdexRecord(**fields)
return self._transform(source_record)

@final
def get_required_fields(self, source_record: dict[str, JSON]) -> dict:
Expand Down Expand Up @@ -537,35 +553,14 @@ def parse_source_file(cls, source_file: str) -> Iterator[Tag]:
@final
def transform(self, source_record: Tag) -> Optional[TimdexRecord]:
"""
Transform an XML record into a TIMDEX record.
Call Transformer._transform method to transform XML record to TIMDEX record.

May not be overridden.

Args:
source_record: A BeautifulSoup Tag representing a single XML record.
"""
if self.record_is_deleted(source_record):
source_record_id = self.get_source_record_id(source_record)
timdex_record_id = self.get_timdex_record_id(
self.source, source_record_id, source_record
)
raise DeletedRecord(timdex_record_id)
optional_fields = self.get_optional_fields(source_record)
if optional_fields is None:
return None
else:
fields = {
**self.get_required_fields(source_record),
**optional_fields,
}

# If citation field was not present, generate citation from other fields
if fields.get("citation") is None:
fields["citation"] = generate_citation(fields)
if fields.get("content_type") is None:
fields["content_type"] = ["Not specified"]

return TimdexRecord(**fields)
return self._transform(source_record)

@final
def get_required_fields(self, source_record: Tag) -> dict:
Expand Down
Loading