Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gdt 54 aardvark transform #108

Merged
merged 5 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ name = "pypi"
attrs = "*"
beautifulsoup4 = "*"
click = "*"
jsonlines = "*"
lxml = "*"
python-dateutil = "*"
sentry-sdk = "*"
smart-open = {version = "*", extras = ["s3"]}
python-dateutil = "*"
types-python-dateutil = "*"

[dev-packages]
Expand Down
568 changes: 461 additions & 107 deletions Pipfile.lock

Large diffs are not rendered by default.

16 changes: 15 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import transmogrifier.models as timdex
from transmogrifier.config import SOURCES, load_external_config
from transmogrifier.sources.transformer import XmlTransformer
from transmogrifier.sources.transformer import JsonTransformer, XmlTransformer
from transmogrifier.sources.xml.datacite import Datacite


Expand Down Expand Up @@ -46,6 +46,15 @@ def runner():
return CliRunner()


@pytest.fixture()
def aardvark_record_all_fields():
    """Return the first record parsed from the all-fields Aardvark JSONL fixture."""
    return next(
        JsonTransformer.parse_source_file(
            "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
        )
    )


@pytest.fixture()
def datacite_records():
return XmlTransformer.parse_source_file(
Expand All @@ -61,6 +70,11 @@ def datacite_record_all_fields():
return Datacite("cool-repo", source_records)


@pytest.fixture()
def json_records():
    """Provide the records parsed from the generic JSONL test fixture."""
    records = JsonTransformer.parse_source_file("tests/fixtures/json_records.jsonl")
    return records


@pytest.fixture()
def loc_country_crosswalk():
    """Load the LOC countries crosswalk from its XML config file."""
    crosswalk = load_external_config("config/loc-countries.xml", "xml")
    return crosswalk
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
2 changes: 2 additions & 0 deletions tests/fixtures/json_records.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "123"}
{"id": "456"}
43 changes: 43 additions & 0 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import Aardvark


def test_aardvark_get_required_fields_returns_expected_values(json_records):
    """Required fields are built from the first record of the JSONL fixture."""
    expected_fields = {
        "source": "A Cool Repository",
        "source_link": "https://example.com/123",
        "timdex_record_id": "cool-repo:123",
        "title": "Title not provided",
    }
    transformer = Aardvark("cool-repo", json_records)
    first_record = next(json_records)
    assert transformer.get_required_fields(first_record) == expected_fields


def test_jsontransformer_transform_returns_timdex_record(json_records):
    """Iterating the transformer yields a TimdexRecord for the first record."""
    expected_record = timdex.TimdexRecord(
        source="A Cool Repository",
        source_link="https://example.com/123",
        timdex_record_id="cool-repo:123",
        title="Title not provided",
        citation="Title not provided. Geospatial data. https://example.com/123",
        content_type=["Geospatial data"],
    )
    transformer = Aardvark("cool-repo", json_records)
    first_output = next(transformer)
    assert first_output == expected_record


def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
    """The record's "dct_title_s" value is returned as a one-item list."""
    titles = Aardvark.get_main_titles(aardvark_record_all_fields)
    assert titles == ["Test title 1"]


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
    """The record's "id" value is returned as the source record ID."""
    record_id = Aardvark.get_source_record_id(aardvark_record_all_fields)
    assert record_id == "123"


def test_aardvark_get_subjects_success(aardvark_record_all_fields):
    """Each subject-bearing field yields one Subject per value, in field order."""
    expected_value_kind_pairs = [
        ("Country", "DCAT Keyword"),
        ("Political boundaries", "DCAT Theme"),
        ("Geography", "Dublin Core Subject"),
        ("Earth", "Dublin Core Subject"),
        ("Dataset", "Subject scheme not provided"),
        ("Vector data", "Subject scheme not provided"),
    ]
    expected_subjects = [
        timdex.Subject(value=[value], kind=kind)
        for value, kind in expected_value_kind_pairs
    ]
    assert Aardvark.get_subjects(aardvark_record_all_fields) == expected_subjects
121 changes: 121 additions & 0 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import logging

import transmogrifier.models as timdex
from transmogrifier.sources.transformer import JsonTransformer

logger = logging.getLogger(__name__)


class Aardvark(JsonTransformer):
    """Aardvark transformer."""

    @classmethod
    def get_main_titles(cls, source_record: dict) -> list[str]:
        """
        Retrieve main title(s) from an Aardvark JSON record.

        Overrides the parent class get_main_titles() method.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            A one-item list holding the "dct_title_s" value, or an empty list
            when that key is missing or its value is falsy.
        """
        title = source_record.get("dct_title_s")
        return [title] if title else []

    @classmethod
    def get_source_record_id(cls, source_record: dict) -> str:
        """
        Get source record ID from a JSON record.

        Args:
            source_record: A JSON object representing a source record.

        Raises:
            KeyError: If the record has no "id" key.
        """
        return source_record["id"]

    @classmethod
    def record_is_deleted(cls, source_record: dict) -> bool:
        """
        Determine whether record has a status of deleted.

        ## WIP - defining to enable instantiation of Aardvark instance.
        Currently always reports the record as not deleted.

        Args:
            source_record: A JSON object representing a source record.
        """
        return False

    def get_optional_fields(self, source_record: dict) -> dict | None:
        """
        Retrieve optional TIMDEX fields from an Aardvark JSON record.

        Overrides the parent class get_optional_fields() method.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            A dict of optional field values. "languages" and "subjects" are set
            to None when the record provides no values for them.
        """
        fields: dict = {}

        # alternate_titles field not used in Aardvark

        # content_type
        fields["content_type"] = ["Geospatial data"]

        # contributors

        # dates

        # edition

        # format

        # funding_information

        # identifiers

        # languages
        # The key was previously misspelled "dct_langauge_sm", so the lookup
        # could never match a record's language field.
        fields["languages"] = source_record.get("dct_language_sm")

        # links

        # locations

        # notes

        # publication_information

        # related_items

        # rights

        # subjects
        # An empty subject list is stored as None rather than [].
        fields["subjects"] = self.get_subjects(source_record) or None

        # summary field
        return fields

    @staticmethod
    def get_subjects(source_record: dict) -> list[timdex.Subject]:
        """Get values from source record for TIMDEX subjects field.

        Subjects are emitted in the order of the field mapping below, one
        Subject per value found in each field.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            A list of Subject objects; empty when no subject field is present.
        """
        # Maps each subject-bearing record field to the Subject "kind" label.
        aardvark_subject_fields = {
            "dcat_keyword_sm": "DCAT Keyword",
            "dcat_theme_sm": "DCAT Theme",
            "dct_subject_sm": "Dublin Core Subject",
            "gbl_resourceClass_sm": "Subject scheme not provided",
            "gbl_resourceType_sm": "Subject scheme not provided",
        }
        subjects = []
        for field_name, kind_value in aardvark_subject_fields.items():
            # "or []" skips fields that are absent or explicitly null instead
            # of raising TypeError when iterating None.
            for subject in source_record.get(field_name) or []:
                subjects.append(timdex.Subject(value=[subject], kind=kind_value))
        return subjects
Loading
Loading