Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gdt 54 aardvark transform #108

Merged
merged 5 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ name = "pypi"
attrs = "*"
beautifulsoup4 = "*"
click = "*"
jsonlines = "*"
lxml = "*"
python-dateutil = "*"
sentry-sdk = "*"
smart-open = {version = "*", extras = ["s3"]}
python-dateutil = "*"
types-python-dateutil = "*"

[dev-packages]
Expand Down
568 changes: 461 additions & 107 deletions Pipfile.lock

Large diffs are not rendered by default.

20 changes: 17 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import transmogrifier.models as timdex
from transmogrifier.config import SOURCES, load_external_config
from transmogrifier.sources.datacite import Datacite
from transmogrifier.sources.transformer import XmlTransformer
from transmogrifier.sources.transformer import JsonTransformer, XmlTransformer
from transmogrifier.sources.xml.datacite import Datacite


@pytest.fixture(autouse=True)
Expand All @@ -29,7 +29,7 @@ def bad_config():
SOURCES["bad-class-name"] = {
"name": "Some Repository",
"base-url": "https://example.com/",
"transform-class": "transmogrifier.sources.datacite.WrongClass",
"transform-class": "transmogrifier.sources.xml.datacite.WrongClass",
}
SOURCES["bad-module-path"] = {
"name": "Some Repository",
Expand All @@ -46,6 +46,15 @@ def runner():
return CliRunner()


@pytest.fixture()
def aardvark_record_all_fields():
    """Return the first source record parsed from the all-fields Aardvark fixture.

    Only the first item produced by ``JsonTransformer.parse_source_file`` is
    returned, so tests receive a single record rather than the whole iterator.
    """
    # Decorator written with parentheses to match the other fixtures in this file.
    return next(
        JsonTransformer.parse_source_file(
            "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
        )
    )


@pytest.fixture()
def datacite_records():
return XmlTransformer.parse_source_file(
Expand All @@ -61,6 +70,11 @@ def datacite_record_all_fields():
return Datacite("cool-repo", source_records)


@pytest.fixture()
def json_records():
    """Parse the generic JSON Lines fixture file into source records."""
    parsed_records = JsonTransformer.parse_source_file(
        "tests/fixtures/json_records.jsonl"
    )
    return parsed_records


@pytest.fixture()
def loc_country_crosswalk():
    """Load the LOC countries XML config through ``load_external_config``."""
    crosswalk = load_external_config("config/loc-countries.xml", "xml")
    return crosswalk
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
2 changes: 2 additions & 0 deletions tests/fixtures/json_records.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "123"}
{"id": "456"}
43 changes: 43 additions & 0 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import Aardvark


def test_aardvark_get_required_fields_returns_expected_values(json_records):
    """Check the required-field dict built for the first JSON fixture record."""
    aardvark = Aardvark("cool-repo", json_records)
    # Pull the record only after the transformer exists, as the original did.
    first_record = next(json_records)
    required_fields = aardvark.get_required_fields(first_record)
    assert required_fields == {
        "source": "A Cool Repository",
        "source_link": "https://example.com/123",
        "timdex_record_id": "cool-repo:123",
        "title": "Title not provided",
    }


def test_jsontransformer_transform_returns_timdex_record(json_records):
    """Check that iterating the transformer yields the expected TimdexRecord."""
    aardvark = Aardvark("cool-repo", json_records)
    first_output = next(aardvark)
    expected_record = timdex.TimdexRecord(
        source="A Cool Repository",
        source_link="https://example.com/123",
        timdex_record_id="cool-repo:123",
        title="Title not provided",
        citation="Title not provided. Geospatial data. https://example.com/123",
        content_type=["Geospatial data"],
    )
    assert first_output == expected_record


def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
    """Check that the main titles are extracted from the all-fields record."""
    main_titles = Aardvark.get_main_titles(aardvark_record_all_fields)
    assert main_titles == ["Test title 1"]


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
    """Check that the source record id is read from the all-fields record."""
    record_id = Aardvark.get_source_record_id(aardvark_record_all_fields)
    assert record_id == "123"


def test_aardvark_get_subjects_success(aardvark_record_all_fields):
    """Check the subjects, and their kinds, built from the all-fields record."""
    subjects = Aardvark.get_subjects(aardvark_record_all_fields)
    # (value, kind) pairs in the order the transformer is expected to emit them.
    expected_pairs = [
        ("Country", "DCAT Keyword"),
        ("Political boundaries", "DCAT Theme"),
        ("Geography", "Dublin Core Subject"),
        ("Earth", "Dublin Core Subject"),
        ("Dataset", "Subject scheme not provided"),
        ("Vector data", "Subject scheme not provided"),
    ]
    assert subjects == [
        timdex.Subject(value=[term], kind=scheme) for term, scheme in expected_pairs
    ]
11 changes: 6 additions & 5 deletions tests/test_transformer.py → tests/sources/test_transformer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from pathlib import Path
from unittest.mock import patch

import pytest

from transmogrifier.models import TimdexRecord
from transmogrifier.sources.datacite import Datacite
from transmogrifier.sources.transformer import Transformer, XmlTransformer
from transmogrifier.sources.xml.datacite import Datacite


def test_transformer_get_transformer_returns_correct_class_name():
Expand Down Expand Up @@ -62,11 +63,11 @@ def test_xmltransformer_transform_and_write_output_files_writes_output_files(
):
output_file = str(tmp_path / "output_file.json")
transformer = XmlTransformer("cool-repo", oai_pmh_records)
assert not Path(tmp_path / "output_file.json").exists()
assert not Path(tmp_path / "output_file.txt").exists()
transformer.transform_and_write_output_files(output_file)
output_files = list(tmp_path.iterdir())
assert len(output_files) == 2
assert output_files[0].name == "output_file.json"
assert output_files[1].name == "output_file.txt"
assert Path(tmp_path / "output_file.json").exists()
assert Path(tmp_path / "output_file.txt").exists()


def test_xmltransformer_transform_and_write_output_files_no_txt_file_if_not_needed(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
Subject,
TimdexRecord,
)
from transmogrifier.sources.datacite import Datacite
from transmogrifier.sources.xml.datacite import Datacite


def test_datacite_transform_with_all_fields_transforms_correctly(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import transmogrifier.models as timdex
from transmogrifier.sources.dspace_dim import DspaceDim
from transmogrifier.sources.xml.dspace_dim import DspaceDim


def test_dspace_dim_transform_with_all_fields_transforms_correctly():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import transmogrifier.models as timdex
from transmogrifier.sources.dspace_mets import DspaceMets
from transmogrifier.sources.xml.dspace_mets import DspaceMets


def test_dspace_mets_transform_with_missing_optional_fields_transforms_correctly():
Expand Down
6 changes: 3 additions & 3 deletions tests/test_ead.py → tests/sources/xml/test_ead.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging

import transmogrifier.models as timdex
from transmogrifier.sources.ead import Ead
from transmogrifier.sources.xml.ead import Ead


def test_ead_record_all_fields_transform_correctly():
Expand Down Expand Up @@ -224,7 +224,7 @@ def test_ead_record_with_missing_archdesc_logs_error(caplog):
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
assert (
"transmogrifier.sources.ead",
"transmogrifier.sources.xml.ead",
logging.ERROR,
"Record ID repositories/2/resources/4 is missing archdesc element",
) in caplog.record_tuples
Expand All @@ -238,7 +238,7 @@ def test_ead_record_with_missing_archdesc_did_logs_error(caplog):
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
assert (
"transmogrifier.sources.ead",
"transmogrifier.sources.xml.ead",
logging.ERROR,
"Record ID repositories/2/resources/3 is missing archdesc > did element",
) in caplog.record_tuples
Expand Down
6 changes: 3 additions & 3 deletions tests/test_marc.py → tests/sources/xml/test_marc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from bs4 import BeautifulSoup

import transmogrifier.models as timdex
from transmogrifier.sources.marc import Marc
from transmogrifier.sources.xml.marc import Marc


def test_marc_record_all_fields_transform_correctly():
Expand Down Expand Up @@ -752,7 +752,7 @@ def test_marc_record_missing_leader_logs_error(caplog):
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
assert (
"transmogrifier.sources.marc",
"transmogrifier.sources.xml.marc",
logging.ERROR,
"Record ID 990027185640106761 is missing MARC leader",
) in caplog.record_tuples
Expand All @@ -766,7 +766,7 @@ def test_marc_record_missing_008_logs_error(caplog):
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
assert (
"transmogrifier.sources.marc",
"transmogrifier.sources.xml.marc",
logging.ERROR,
"Record ID 990027185640106761 is missing MARC 008 field",
) in caplog.record_tuples
Expand Down
2 changes: 1 addition & 1 deletion tests/test_oai_dc.py → tests/sources/xml/test_oai_dc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import transmogrifier.models as timdex
from transmogrifier.sources.oaidc import OaiDc
from transmogrifier.sources.xml.oaidc import OaiDc

FIXTURES_PREFIX = "tests/fixtures/oai_dc"

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import transmogrifier.models as timdex
from transmogrifier.sources.springshare import SpringshareOaiDc
from transmogrifier.sources.xml.springshare import SpringshareOaiDc

SPRINGSHARE_FIXTURES_PREFIX = "tests/fixtures/oai_dc/springshare"

Expand Down
2 changes: 1 addition & 1 deletion tests/test_whoas.py → tests/sources/xml/test_whoas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from transmogrifier.sources.whoas import Whoas
from transmogrifier.sources.xml.whoas import Whoas


def test_valid_content_types_with_all_invalid():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_zenodo.py → tests/sources/xml/test_zenodo.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from transmogrifier.sources.zenodo import Zenodo
from transmogrifier.sources.xml.zenodo import Zenodo


def test_zenodo_create_source_record_id_generates_correct_id():
Expand Down
16 changes: 8 additions & 8 deletions transmogrifier/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,42 +81,42 @@
"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?"
"vid=01MIT_INST:MIT&docid=alma"
),
"transform-class": "transmogrifier.sources.marc.Marc",
"transform-class": "transmogrifier.sources.xml.marc.Marc",
},
"aspace": {
"name": "MIT ArchivesSpace",
"base-url": "https://archivesspace.mit.edu/",
"transform-class": "transmogrifier.sources.ead.Ead",
"transform-class": "transmogrifier.sources.xml.ead.Ead",
},
"dspace": {
"name": "DSpace@MIT",
"base-url": "https://dspace.mit.edu/handle/",
"transform-class": "transmogrifier.sources.dspace_mets.DspaceMets",
"transform-class": "transmogrifier.sources.xml.dspace_mets.DspaceMets",
},
"jpal": {
"name": "Abdul Latif Jameel Poverty Action Lab Dataverse",
"base-url": "https://dataverse.harvard.edu/dataset.xhtml?persistentId=",
"transform-class": "transmogrifier.sources.datacite.Datacite",
"transform-class": "transmogrifier.sources.xml.datacite.Datacite",
},
"libguides": {
"name": "LibGuides",
"base-url": "https://libguides.mit.edu/",
"transform-class": "transmogrifier.sources.springshare.SpringshareOaiDc",
"transform-class": "transmogrifier.sources.xml.springshare.SpringshareOaiDc",
},
"researchdatabases": {
"name": "Research Databases",
"base-url": "https://libguides.mit.edu/",
"transform-class": "transmogrifier.sources.springshare.SpringshareOaiDc",
"transform-class": "transmogrifier.sources.xml.springshare.SpringshareOaiDc",
},
"whoas": {
"name": "Woods Hole Open Access Server",
"base-url": "https://darchive.mblwhoilibrary.org/handle/",
"transform-class": "transmogrifier.sources.whoas.Whoas",
"transform-class": "transmogrifier.sources.xml.whoas.Whoas",
},
"zenodo": {
"name": "Zenodo",
"base-url": "https://zenodo.org/record/",
"transform-class": "transmogrifier.sources.zenodo.Zenodo",
"transform-class": "transmogrifier.sources.xml.zenodo.Zenodo",
},
}

Expand Down
Empty file.
Loading
Loading