MITLibraries · ehanson8 · Dec 18, 2023 · Dec 12, 2023 · Dec 14, 2023 · Dec 14, 2023
diff --git a/Pipfile b/Pipfile
@@ -7,10 +7,11 @@ name = "pypi"
 attrs = "*"
 beautifulsoup4 = "*"
 click = "*"
+jsonlines = "*"
 lxml = "*"
+python-dateutil = "*"
 sentry-sdk = "*"
 smart-open = {version = "*", extras = ["s3"]}
-python-dateutil = "*"
 types-python-dateutil = "*"
 
 [dev-packages]

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -5,7 +5,7 @@
 
 import transmogrifier.models as timdex
 from transmogrifier.config import SOURCES, load_external_config
-from transmogrifier.sources.transformer import XmlTransformer
+from transmogrifier.sources.transformer import JsonTransformer, XmlTransformer
 from transmogrifier.sources.xml.datacite import Datacite
 
 
@@ -46,6 +46,15 @@ def runner():
     return CliRunner()
 
 
+@pytest.fixture()
+def aardvark_record_all_fields():
+    """Return the first record parsed from the all-fields Aardvark fixture."""
+    # The parsed records are consumed with next(), so only the first line of
+    # the JSONL fixture is handed to the test.
+    fixture_path = "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
+    return next(JsonTransformer.parse_source_file(fixture_path))
+
+
 @pytest.fixture()
 def datacite_records():
     return XmlTransformer.parse_source_file(
@@ -61,6 +70,11 @@ def datacite_record_all_fields():
     return Datacite("cool-repo", source_records)
 
 
+@pytest.fixture()
+def json_records():
+    """Return the parsed records of the two-line generic JSONL fixture."""
+    fixture_path = "tests/fixtures/json_records.jsonl"
+    parsed_records = JsonTransformer.parse_source_file(fixture_path)
+    return parsed_records
+
+
 @pytest.fixture()
 def loc_country_crosswalk():
     return load_external_config("config/loc-countries.xml", "xml")

diff --git a/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl b/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
@@ -0,0 +1 @@
+{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
diff --git a/tests/fixtures/json_records.jsonl b/tests/fixtures/json_records.jsonl
@@ -0,0 +1,2 @@
+{"id": "123"}
+{"id": "456"}
diff --git a/tests/sources/json/test_aardvark.py b/tests/sources/json/test_aardvark.py
@@ -0,0 +1,43 @@
+import transmogrifier.models as timdex
+from transmogrifier.sources.json.aardvark import Aardvark
+
+
+def test_aardvark_get_required_fields_returns_expected_values(json_records):
+    """Required fields are built from the first record of the JSONL fixture."""
+    expected_fields = {
+        "source": "A Cool Repository",
+        "source_link": "https://example.com/123",
+        "timdex_record_id": "cool-repo:123",
+        "title": "Title not provided",
+    }
+    transformer = Aardvark("cool-repo", json_records)
+    first_record = next(json_records)
+    assert transformer.get_required_fields(first_record) == expected_fields
+
+
+def test_jsontransformer_transform_returns_timdex_record(json_records):
+    """The first transformed record equals the expected TimdexRecord."""
+    transformer = Aardvark("cool-repo", json_records)
+    transformed_record = next(transformer)
+    expected_record = timdex.TimdexRecord(
+        source="A Cool Repository",
+        source_link="https://example.com/123",
+        timdex_record_id="cool-repo:123",
+        title="Title not provided",
+        citation="Title not provided. Geospatial data. https://example.com/123",
+        content_type=["Geospatial data"],
+    )
+    assert transformed_record == expected_record
+
+
+def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
+    """The fixture's "dct_title_s" value is returned as the only main title."""
+    main_titles = Aardvark.get_main_titles(aardvark_record_all_fields)
+    assert main_titles == ["Test title 1"]
+
+
+def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
+    """The fixture's "id" value is returned as the source record ID."""
+    source_record_id = Aardvark.get_source_record_id(aardvark_record_all_fields)
+    assert source_record_id == "123"
+
+
+def test_aardvark_get_subjects_success(aardvark_record_all_fields):
+    """Each subject value in the fixture becomes one Subject with its kind."""
+    expected_pairs = (
+        ("Country", "DCAT Keyword"),
+        ("Political boundaries", "DCAT Theme"),
+        ("Geography", "Dublin Core Subject"),
+        ("Earth", "Dublin Core Subject"),
+        ("Dataset", "Subject scheme not provided"),
+        ("Vector data", "Subject scheme not provided"),
+    )
+    expected_subjects = [
+        timdex.Subject(value=[subject_value], kind=subject_kind)
+        for subject_value, subject_kind in expected_pairs
+    ]
+    assert Aardvark.get_subjects(aardvark_record_all_fields) == expected_subjects
diff --git a/transmogrifier/sources/json/aardvark.py b/transmogrifier/sources/json/aardvark.py
@@ -0,0 +1,121 @@
+import logging
+
+import transmogrifier.models as timdex
+from transmogrifier.sources.transformer import JsonTransformer
+
+logger = logging.getLogger(__name__)
+
+
+class Aardvark(JsonTransformer):
+    """Transformer for Aardvark JSON source records."""
+
+    @classmethod
+    def get_main_titles(cls, source_record: dict) -> list[str]:
+        """
+        Retrieve main title(s) from an Aardvark JSON record.
+
+        Overrides the base class get_main_titles() method.
+
+        Args:
+            source_record: A JSON object representing a source record.
+
+        Returns:
+            A one-item list holding the "dct_title_s" value, or an empty list
+            when that key is missing or its value is falsy.
+        """
+        # A missing key and a falsy value are treated alike: no title.
+        if title := source_record.get("dct_title_s"):
+            return [title]
+        return []
+
+    @classmethod
+    def get_source_record_id(cls, source_record: dict) -> str:
+        """
+        Get source record ID from a JSON record.
+
+        Args:
+            source_record: A JSON object representing a source record.
+
+        Returns:
+            The value stored under the record's "id" key.
+
+        Raises:
+            KeyError: If the record has no "id" key.
+        """
+        return source_record["id"]
+
+    @classmethod
+    def record_is_deleted(cls, source_record: dict) -> bool:
+        """
+        Determine whether record has a status of deleted.
+
+        ## WIP - defining to enable instantiation of Aardvark instance.
+
+        Args:
+            source_record: A JSON object representing a source record.
+
+        Returns:
+            Always False for now; the record is not inspected.
+        """
+        return False
+
+    def get_optional_fields(self, source_record: dict) -> dict | None:
+        """
+        Retrieve optional TIMDEX fields from an Aardvark JSON record.
+
+        Overrides the base class get_optional_fields() method.
+
+        Args:
+            source_record: A JSON object representing a source record.
+
+        Returns:
+            A dict keyed by optional field name. "languages" and "subjects" are
+            set to None when the record provides no values for them.
+        """
+        fields: dict = {}
+
+        # alternate_titles field not used in Aardvark
+
+        # content_type
+        fields["content_type"] = ["Geospatial data"]
+
+        # contributors
+
+        # dates
+
+        # edition
+
+        # format
+
+        # funding_information
+
+        # identifiers
+
+        # languages
+        fields["languages"] = source_record.get("dct_language_sm")
+
+        # links
+
+        # locations
+
+        # notes
+
+        # publication_information
+
+        # related_items
+
+        # rights
+
+        # subjects
+        # An empty subject list is stored as None rather than [].
+        fields["subjects"] = self.get_subjects(source_record) or None
+
+        # summary field
+        return fields
+
+    @staticmethod
+    def get_subjects(source_record: dict) -> list[timdex.Subject]:
+        """Get values from source record for TIMDEX subjects field.
+
+        Args:
+            source_record: A JSON object representing a source record.
+
+        Returns:
+            One Subject per value found under the known subject fields, in the
+            order the fields are listed below; empty when none are present.
+        """
+        # Maps each source field to the "kind" recorded on its Subject objects.
+        aardvark_subject_fields = {
+            "dcat_keyword_sm": "DCAT Keyword",
+            "dcat_theme_sm": "DCAT Theme",
+            "dct_subject_sm": "Dublin Core Subject",
+            "gbl_resourceClass_sm": "Subject scheme not provided",
+            "gbl_resourceType_sm": "Subject scheme not provided",
+        }
+        subjects: list[timdex.Subject] = []
+        for field_name, kind_value in aardvark_subject_fields.items():
+            # Fields absent from the record contribute no subjects.
+            if field_name not in source_record:
+                continue
+            subjects.extend(
+                timdex.Subject(value=[subject], kind=kind_value)
+                for subject in source_record[field_name]
+            )
+        return subjects