Skip to content

Commit

Permalink
Gdt 54 aardvark transform (#108)
Browse files Browse the repository at this point in the history
* Reorganize source around XML and JSON folders

* GDT-54 Create Aardvark transform

Why these changes are being introduced:
* This is the initial structure for the Aardvark transform class. The class will be expanded with new methods in subsequent commits.

How this addresses that need:
* Add jsonlines to Pipfile
* Add fixtures for aardvark and generic JSONLines files
* Update argument type hinting for Transformer and JsonTransformer classes to clarify expected content types
* Update JsonTransformer.parse_source_file method to use jsonlines library
* Add Aardvark class with get_main_titles, get_source_record_id, record_is_deleted (in progress), get_optional_fields (in progress), and get_subjects methods and corresponding unit tests

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-54

* Refactor unit test to resolve CI error

* Updates based on discussion in PR #108

* Update json_records fixture to aardvark_records for more accurate unit tests
* Rename Aardvark > MITAardvark to unify terminology across repos
* Update get_main_titles method to reflect it is a required field
* Update Aardvark method docstrings to provide greater context
* Add Transformer._transform method to minimize code duplication between JsonTransformer and XmlTransformer methods

* Update _transform method docstring
  • Loading branch information
ehanson8 authored Dec 18, 2023
1 parent 62d8e79 commit 1aba987
Show file tree
Hide file tree
Showing 30 changed files with 763 additions and 209 deletions.
3 changes: 2 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ name = "pypi"
attrs = "*"
beautifulsoup4 = "*"
click = "*"
jsonlines = "*"
lxml = "*"
python-dateutil = "*"
sentry-sdk = "*"
smart-open = {version = "*", extras = ["s3"]}
python-dateutil = "*"
types-python-dateutil = "*"

[dev-packages]
Expand Down
568 changes: 461 additions & 107 deletions Pipfile.lock

Large diffs are not rendered by default.

20 changes: 17 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import transmogrifier.models as timdex
from transmogrifier.config import SOURCES, load_external_config
from transmogrifier.sources.datacite import Datacite
from transmogrifier.sources.transformer import XmlTransformer
from transmogrifier.sources.transformer import JsonTransformer, XmlTransformer
from transmogrifier.sources.xml.datacite import Datacite


@pytest.fixture(autouse=True)
Expand All @@ -29,7 +29,7 @@ def bad_config():
SOURCES["bad-class-name"] = {
"name": "Some Repository",
"base-url": "https://example.com/",
"transform-class": "transmogrifier.sources.datacite.WrongClass",
"transform-class": "transmogrifier.sources.xml.datacite.WrongClass",
}
SOURCES["bad-module-path"] = {
"name": "Some Repository",
Expand All @@ -46,6 +46,15 @@ def runner():
return CliRunner()


@pytest.fixture()
def aardvark_record_all_fields():
    """Return the first record parsed from the all-fields Aardvark fixture.

    The JSONLines fixture file is read with
    ``JsonTransformer.parse_source_file`` and only the first parsed record
    is handed to the test.
    """
    source_records = JsonTransformer.parse_source_file(
        "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
    )
    return next(source_records)


@pytest.fixture()
def datacite_records():
return XmlTransformer.parse_source_file(
Expand All @@ -61,6 +70,11 @@ def datacite_record_all_fields():
return Datacite("cool-repo", source_records)


@pytest.fixture()
def aardvark_records():
    """Return the result of parsing the ``aardvark_records.jsonl`` fixture."""
    fixture_path = "tests/fixtures/aardvark_records.jsonl"
    return JsonTransformer.parse_source_file(fixture_path)


@pytest.fixture()
def loc_country_crosswalk():
    """Return ``config/loc-countries.xml`` loaded via ``load_external_config``."""
    config_path = "config/loc-countries.xml"
    return load_external_config(config_path, "xml")
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
2 changes: 2 additions & 0 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "123", "dct_title_s": "Test title 1"}
{"id": "456", "dct_title_s": "Test title 2"}
43 changes: 43 additions & 0 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import MITAardvark


def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
    """Check ``get_required_fields`` output for the first fixture record."""
    transformer = MITAardvark("cool-repo", aardvark_records)
    first_record = next(aardvark_records)
    required_fields = transformer.get_required_fields(first_record)
    assert required_fields == {
        "source": "A Cool Repository",
        "source_link": "https://example.com/123",
        "timdex_record_id": "cool-repo:123",
        "title": "Test title 1",
    }


def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
    """Check that advancing an ``MITAardvark`` transformer yields the expected record."""
    transformer = MITAardvark("cool-repo", aardvark_records)
    actual_record = next(transformer)
    expected_record = timdex.TimdexRecord(
        source="A Cool Repository",
        source_link="https://example.com/123",
        timdex_record_id="cool-repo:123",
        title="Test title 1",
        citation="Test title 1. Geospatial data. https://example.com/123",
        content_type=["Geospatial data"],
    )
    assert actual_record == expected_record


def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
    """Check that ``get_main_titles`` returns the fixture title in a list."""
    main_titles = MITAardvark.get_main_titles(aardvark_record_all_fields)
    assert main_titles == ["Test title 1"]


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
    """Check that ``get_source_record_id`` returns the fixture record's ``id``."""
    source_record_id = MITAardvark.get_source_record_id(aardvark_record_all_fields)
    assert source_record_id == "123"


def test_aardvark_get_subjects_success(aardvark_record_all_fields):
    """Check the ``Subject`` objects that ``get_subjects`` builds from the fixture."""
    subjects = MITAardvark.get_subjects(aardvark_record_all_fields)
    expected_subjects = [
        timdex.Subject(value=["Country"], kind="DCAT Keyword"),
        timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
        timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),
        timdex.Subject(value=["Earth"], kind="Dublin Core Subject"),
        timdex.Subject(value=["Dataset"], kind="Subject scheme not provided"),
        timdex.Subject(value=["Vector data"], kind="Subject scheme not provided"),
    ]
    assert subjects == expected_subjects
11 changes: 6 additions & 5 deletions tests/test_transformer.py → tests/sources/test_transformer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from pathlib import Path
from unittest.mock import patch

import pytest

from transmogrifier.models import TimdexRecord
from transmogrifier.sources.datacite import Datacite
from transmogrifier.sources.transformer import Transformer, XmlTransformer
from transmogrifier.sources.xml.datacite import Datacite


def test_transformer_get_transformer_returns_correct_class_name():
Expand Down Expand Up @@ -62,11 +63,11 @@ def test_xmltransformer_transform_and_write_output_files_writes_output_files(
):
output_file = str(tmp_path / "output_file.json")
transformer = XmlTransformer("cool-repo", oai_pmh_records)
assert not Path(tmp_path / "output_file.json").exists()
assert not Path(tmp_path / "output_file.txt").exists()
transformer.transform_and_write_output_files(output_file)
output_files = list(tmp_path.iterdir())
assert len(output_files) == 2
assert output_files[0].name == "output_file.json"
assert output_files[1].name == "output_file.txt"
assert Path(tmp_path / "output_file.json").exists()
assert Path(tmp_path / "output_file.txt").exists()


def test_xmltransformer_transform_and_write_output_files_no_txt_file_if_not_needed(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
Subject,
TimdexRecord,
)
from transmogrifier.sources.datacite import Datacite
from transmogrifier.sources.xml.datacite import Datacite


def test_datacite_transform_with_all_fields_transforms_correctly(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import transmogrifier.models as timdex
from transmogrifier.sources.dspace_dim import DspaceDim
from transmogrifier.sources.xml.dspace_dim import DspaceDim


def test_dspace_dim_transform_with_all_fields_transforms_correctly():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import transmogrifier.models as timdex
from transmogrifier.sources.dspace_mets import DspaceMets
from transmogrifier.sources.xml.dspace_mets import DspaceMets


def test_dspace_mets_transform_with_missing_optional_fields_transforms_correctly():
Expand Down
6 changes: 3 additions & 3 deletions tests/test_ead.py → tests/sources/xml/test_ead.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging

import transmogrifier.models as timdex
from transmogrifier.sources.ead import Ead
from transmogrifier.sources.xml.ead import Ead


def test_ead_record_all_fields_transform_correctly():
Expand Down Expand Up @@ -224,7 +224,7 @@ def test_ead_record_with_missing_archdesc_logs_error(caplog):
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
assert (
"transmogrifier.sources.ead",
"transmogrifier.sources.xml.ead",
logging.ERROR,
"Record ID repositories/2/resources/4 is missing archdesc element",
) in caplog.record_tuples
Expand All @@ -238,7 +238,7 @@ def test_ead_record_with_missing_archdesc_did_logs_error(caplog):
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
assert (
"transmogrifier.sources.ead",
"transmogrifier.sources.xml.ead",
logging.ERROR,
"Record ID repositories/2/resources/3 is missing archdesc > did element",
) in caplog.record_tuples
Expand Down
6 changes: 3 additions & 3 deletions tests/test_marc.py → tests/sources/xml/test_marc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from bs4 import BeautifulSoup

import transmogrifier.models as timdex
from transmogrifier.sources.marc import Marc
from transmogrifier.sources.xml.marc import Marc


def test_marc_record_all_fields_transform_correctly():
Expand Down Expand Up @@ -752,7 +752,7 @@ def test_marc_record_missing_leader_logs_error(caplog):
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
assert (
"transmogrifier.sources.marc",
"transmogrifier.sources.xml.marc",
logging.ERROR,
"Record ID 990027185640106761 is missing MARC leader",
) in caplog.record_tuples
Expand All @@ -766,7 +766,7 @@ def test_marc_record_missing_008_logs_error(caplog):
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
assert (
"transmogrifier.sources.marc",
"transmogrifier.sources.xml.marc",
logging.ERROR,
"Record ID 990027185640106761 is missing MARC 008 field",
) in caplog.record_tuples
Expand Down
2 changes: 1 addition & 1 deletion tests/test_oai_dc.py → tests/sources/xml/test_oai_dc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import transmogrifier.models as timdex
from transmogrifier.sources.oaidc import OaiDc
from transmogrifier.sources.xml.oaidc import OaiDc

FIXTURES_PREFIX = "tests/fixtures/oai_dc"

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import transmogrifier.models as timdex
from transmogrifier.sources.springshare import SpringshareOaiDc
from transmogrifier.sources.xml.springshare import SpringshareOaiDc

SPRINGSHARE_FIXTURES_PREFIX = "tests/fixtures/oai_dc/springshare"

Expand Down
2 changes: 1 addition & 1 deletion tests/test_whoas.py → tests/sources/xml/test_whoas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from transmogrifier.sources.whoas import Whoas
from transmogrifier.sources.xml.whoas import Whoas


def test_valid_content_types_with_all_invalid():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_zenodo.py → tests/sources/xml/test_zenodo.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from transmogrifier.sources.zenodo import Zenodo
from transmogrifier.sources.xml.zenodo import Zenodo


def test_zenodo_create_source_record_id_generates_correct_id():
Expand Down
16 changes: 8 additions & 8 deletions transmogrifier/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,42 +81,42 @@
"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?"
"vid=01MIT_INST:MIT&docid=alma"
),
"transform-class": "transmogrifier.sources.marc.Marc",
"transform-class": "transmogrifier.sources.xml.marc.Marc",
},
"aspace": {
"name": "MIT ArchivesSpace",
"base-url": "https://archivesspace.mit.edu/",
"transform-class": "transmogrifier.sources.ead.Ead",
"transform-class": "transmogrifier.sources.xml.ead.Ead",
},
"dspace": {
"name": "DSpace@MIT",
"base-url": "https://dspace.mit.edu/handle/",
"transform-class": "transmogrifier.sources.dspace_mets.DspaceMets",
"transform-class": "transmogrifier.sources.xml.dspace_mets.DspaceMets",
},
"jpal": {
"name": "Abdul Latif Jameel Poverty Action Lab Dataverse",
"base-url": "https://dataverse.harvard.edu/dataset.xhtml?persistentId=",
"transform-class": "transmogrifier.sources.datacite.Datacite",
"transform-class": "transmogrifier.sources.xml.datacite.Datacite",
},
"libguides": {
"name": "LibGuides",
"base-url": "https://libguides.mit.edu/",
"transform-class": "transmogrifier.sources.springshare.SpringshareOaiDc",
"transform-class": "transmogrifier.sources.xml.springshare.SpringshareOaiDc",
},
"researchdatabases": {
"name": "Research Databases",
"base-url": "https://libguides.mit.edu/",
"transform-class": "transmogrifier.sources.springshare.SpringshareOaiDc",
"transform-class": "transmogrifier.sources.xml.springshare.SpringshareOaiDc",
},
"whoas": {
"name": "Woods Hole Open Access Server",
"base-url": "https://darchive.mblwhoilibrary.org/handle/",
"transform-class": "transmogrifier.sources.whoas.Whoas",
"transform-class": "transmogrifier.sources.xml.whoas.Whoas",
},
"zenodo": {
"name": "Zenodo",
"base-url": "https://zenodo.org/record/",
"transform-class": "transmogrifier.sources.zenodo.Zenodo",
"transform-class": "transmogrifier.sources.xml.zenodo.Zenodo",
},
}

Expand Down
Empty file.
Loading

0 comments on commit 1aba987

Please sign in to comment.