diff --git a/Pipfile.lock b/Pipfile.lock
index afd4e68..f8cf30a 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -691,41 +691,36 @@
},
"cryptography": {
"hashes": [
- "sha256:013629ae70b40af70c9a7a5db40abe5d9054e6f4380e50ce769947b73bf3caad",
- "sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583",
- "sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b",
- "sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c",
- "sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1",
- "sha256:343728aac38decfdeecf55ecab3264b015be68fc2816ca800db649607aeee648",
- "sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949",
- "sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba",
- "sha256:5a94eccb2a81a309806027e1670a358b99b8fe8bfe9f8d329f27d72c094dde8c",
- "sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9",
- "sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d",
- "sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c",
- "sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e",
- "sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2",
- "sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d",
- "sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7",
- "sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70",
- "sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2",
- "sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7",
- "sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14",
- "sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe",
- "sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e",
- "sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71",
- "sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961",
- "sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7",
- "sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c",
- "sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28",
- "sha256:dec9b018df185f08483f294cae6ccac29e7a6e0678996587363dc352dc65c842",
- "sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902",
- "sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801",
- "sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a",
- "sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e"
+ "sha256:0663585d02f76929792470451a5ba64424acc3cd5227b03921dab0e2f27b1709",
+ "sha256:08a24a7070b2b6804c1940ff0f910ff728932a9d0e80e7814234269f9d46d069",
+ "sha256:232ce02943a579095a339ac4b390fbbe97f5b5d5d107f8a08260ea2768be8cc2",
+ "sha256:2905ccf93a8a2a416f3ec01b1a7911c3fe4073ef35640e7ee5296754e30b762b",
+ "sha256:299d3da8e00b7e2b54bb02ef58d73cd5f55fb31f33ebbf33bd00d9aa6807df7e",
+ "sha256:2c6d112bf61c5ef44042c253e4859b3cbbb50df2f78fa8fae6747a7814484a70",
+ "sha256:31e44a986ceccec3d0498e16f3d27b2ee5fdf69ce2ab89b52eaad1d2f33d8778",
+ "sha256:3d9a1eca329405219b605fac09ecfc09ac09e595d6def650a437523fcd08dd22",
+ "sha256:3dcdedae5c7710b9f97ac6bba7e1052b95c7083c9d0e9df96e02a1932e777895",
+ "sha256:47ca71115e545954e6c1d207dd13461ab81f4eccfcb1345eac874828b5e3eaaf",
+ "sha256:4a997df8c1c2aae1e1e5ac49c2e4f610ad037fc5a3aadc7b64e39dea42249431",
+ "sha256:51956cf8730665e2bdf8ddb8da0056f699c1a5715648c1b0144670c1ba00b48f",
+ "sha256:5bcb8a5620008a8034d39bce21dc3e23735dfdb6a33a06974739bfa04f853947",
+ "sha256:64c3f16e2a4fc51c0d06af28441881f98c5d91009b8caaff40cf3548089e9c74",
+ "sha256:6e2b11c55d260d03a8cf29ac9b5e0608d35f08077d8c087be96287f43af3ccdc",
+ "sha256:7b3f5fe74a5ca32d4d0f302ffe6680fcc5c28f8ef0dc0ae8f40c0f3a1b4fca66",
+ "sha256:844b6d608374e7d08f4f6e6f9f7b951f9256db41421917dfb2d003dde4cd6b66",
+ "sha256:9a8d6802e0825767476f62aafed40532bd435e8a5f7d23bd8b4f5fd04cc80ecf",
+ "sha256:aae4d918f6b180a8ab8bf6511a419473d107df4dbb4225c7b48c5c9602c38c7f",
+ "sha256:ac1955ce000cb29ab40def14fd1bbfa7af2017cca696ee696925615cafd0dce5",
+ "sha256:b88075ada2d51aa9f18283532c9f60e72170041bba88d7f37e49cbb10275299e",
+ "sha256:cb013933d4c127349b3948aa8aaf2f12c0353ad0eccd715ca789c8a0f671646f",
+ "sha256:cc70b4b581f28d0a254d006f26949245e3657d40d8857066c2ae22a61222ef55",
+ "sha256:e9c5266c432a1e23738d178e51c2c7a5e2ddf790f248be939448c0ba2021f9d1",
+ "sha256:ea9e57f8ea880eeea38ab5abf9fbe39f923544d7884228ec67d666abd60f5a47",
+ "sha256:ee0c405832ade84d4de74b9029bedb7b31200600fa524d218fc29bfa371e97f5",
+ "sha256:fdcb265de28585de5b859ae13e3846a8e805268a823a12a4da2597f1f5afc9f0"
],
"markers": "python_version >= '3.7'",
- "version": "==42.0.8"
+ "version": "==43.0.0"
},
"decorator": {
"hashes": [
@@ -921,8 +916,33 @@
},
"mypy": {
"hashes": [
+ "sha256:0bea2a0e71c2a375c9fa0ede3d98324214d67b3cbbfcbd55ac8f750f85a414e3",
+ "sha256:104e9c1620c2675420abd1f6c44bab7dd33cc85aea751c985006e83dcd001095",
+ "sha256:14f9294528b5f5cf96c721f231c9f5b2733164e02c1c018ed1a0eff8a18005ac",
+ "sha256:1a5d8d8dd8613a3e2be3eae829ee891b6b2de6302f24766ff06cb2875f5be9c6",
+ "sha256:1d44c1e44a8be986b54b09f15f2c1a66368eb43861b4e82573026e04c48a9e20",
"sha256:25bcfa75b9b5a5f8d67147a54ea97ed63a653995a82798221cca2a315c0238c1",
- "sha256:56913ec8c7638b0091ef4da6fcc9136896914a9d60d54670a75880c3e5b99ace"
+ "sha256:35ce88b8ed3a759634cb4eb646d002c4cef0a38f20565ee82b5023558eb90c00",
+ "sha256:56913ec8c7638b0091ef4da6fcc9136896914a9d60d54670a75880c3e5b99ace",
+ "sha256:65f190a6349dec29c8d1a1cd4aa71284177aee5949e0502e6379b42873eddbe7",
+ "sha256:6801319fe76c3f3a3833f2b5af7bd2c17bb93c00026a2a1b924e6762f5b19e13",
+ "sha256:72596a79bbfb195fd41405cffa18210af3811beb91ff946dbcb7368240eed6be",
+ "sha256:93743608c7348772fdc717af4aeee1997293a1ad04bc0ea6efa15bf65385c538",
+ "sha256:940bfff7283c267ae6522ef926a7887305945f716a7704d3344d6d07f02df850",
+ "sha256:96f8dbc2c85046c81bcddc246232d500ad729cb720da4e20fce3b542cab91287",
+ "sha256:98790025861cb2c3db8c2f5ad10fc8c336ed2a55f4daf1b8b3f877826b6ff2eb",
+ "sha256:a3824187c99b893f90c845bab405a585d1ced4ff55421fdf5c84cb7710995229",
+ "sha256:a83ec98ae12d51c252be61521aa5731f5512231d0b738b4cb2498344f0b840cd",
+ "sha256:becc9111ca572b04e7e77131bc708480cc88a911adf3d0239f974c034b78085c",
+ "sha256:c1a184c64521dc549324ec6ef7cbaa6b351912be9cb5edb803c2808a0d7e85ac",
+ "sha256:c7b73a856522417beb78e0fb6d33ef89474e7a622db2653bc1285af36e2e3e3d",
+ "sha256:cea3d0fb69637944dd321f41bc896e11d0fb0b0aa531d887a6da70f6e7473aba",
+ "sha256:d2b3d36baac48e40e3064d2901f2fbd2a2d6880ec6ce6358825c85031d7c0d4d",
+ "sha256:d7b54c27783991399046837df5c7c9d325d921394757d09dbcbf96aee4649fe9",
+ "sha256:d8e2e43977f0e09f149ea69fd0556623919f816764e26d74da0c8a7b48f3e18a",
+ "sha256:dbe286303241fea8c2ea5466f6e0e6a046a135a7e7609167b07fd4e7baf151bf",
+ "sha256:f006e955718ecd8d159cee9932b64fba8f86ee6f7728ca3ac66c3a54b0062abe",
+ "sha256:f2268d9fcd9686b61ab64f077be7ffbc6fbcdfb4103e5dd0cc5eaab53a8886c2"
],
"index": "pypi",
"markers": "python_version >= '3.8'",
@@ -1018,10 +1038,10 @@
},
"pure-eval": {
"hashes": [
- "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350",
- "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"
+ "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0",
+ "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"
],
- "version": "==0.2.2"
+ "version": "==0.2.3"
},
"pycparser": {
"hashes": [
@@ -1144,12 +1164,12 @@
},
"pytest": {
"hashes": [
- "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343",
- "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"
+ "sha256:7e8e5c5abd6e93cb1cc151f23e57adc31fcf8cfd2a3ff2da63e23f732de35db6",
+ "sha256:e9600ccf4f563976e2c99fa02c7624ab938296551f280835ee6516df8bc4ae8c"
],
"index": "pypi",
"markers": "python_version >= '3.8'",
- "version": "==8.2.2"
+ "version": "==8.3.1"
},
"pyyaml": {
"hashes": [
@@ -1290,28 +1310,28 @@
},
"ruff": {
"hashes": [
- "sha256:03bfe9ab5bdc0b08470c3b261643ad54ea86edc32b64d1e080892d7953add3ad",
- "sha256:05fbd2cb404775d6cd7f2ff49504e2d20e13ef95fa203bd1ab22413af70d420b",
- "sha256:08058d077e21b856d32ebf483443390e29dc44d927608dc8f092ff6776519da9",
- "sha256:2a3eb4f1841771fa5b67a56be9c2d16fd3cc88e378bd86aaeaec2f7e6bcdd0a2",
- "sha256:642cbff6cbfa38d2566d8db086508d6f472edb136cbfcc4ea65997745368c29e",
- "sha256:76bb5a87fd397520b91a83eae8a2f7985236d42dd9459f09eef58e7f5c1d8316",
- "sha256:7704582a026fa02cca83efd76671a98ee6eb412c4230209efe5e2a006c06db62",
- "sha256:77d49484429ed7c7e6e2e75a753f153b7b58f875bdb4158ad85af166a1ec1822",
- "sha256:96066c4328a49fce2dd40e80f7117987369feec30ab771516cf95f1cc2db923c",
- "sha256:a8cfc7a26422c78e94f1ec78ec02501bbad2df5834907e75afe474cc6b83a8c1",
- "sha256:b12424d9db7347fa63c5ed9af010003338c63c629fb9c9c6adb2aa4f5699729b",
- "sha256:b8d72c5684bbd4ed304a9a955ee2e67f57b35f6193222ade910cca8a805490e3",
- "sha256:bc697ec874fdd7c7ba0a85ec76ab38f8595224868d67f097c5ffc21136e72fcd",
- "sha256:cbaec2ddf4f78e5e9ecf5456ea0f496991358a1d883862ed0b9e947e2b6aea93",
- "sha256:cf4bc751240b2fab5d19254571bcacb315c7b0b00bf3c912d52226a82bbec073",
- "sha256:d2fc2cdb85ccac1e816cc9d5d8cedefd93661bd957756d902543af32a6b04a71",
- "sha256:e791d34d3557a3819b3704bc1f087293c821083fa206812842fa363f6018a192",
- "sha256:eafc45dd8bdc37a00b28e68cc038daf3ca8c233d73fea276dcd09defb1352841"
+ "sha256:029454e2824eafa25b9df46882f7f7844d36fd8ce51c1b7f6d97e2615a57bbcc",
+ "sha256:09c14ed6a72af9ccc8d2e313d7acf7037f0faff43cde4b507e66f14e812e37f7",
+ "sha256:0cf497a47751be8c883059c4613ba2f50dd06ec672692de2811f039432875278",
+ "sha256:2795726d5f71c4f4e70653273d1c23a8182f07dd8e48c12de5d867bfb7557eed",
+ "sha256:3520a00c0563d7a7a7c324ad7e2cde2355733dafa9592c671fb2e9e3cd8194c1",
+ "sha256:4c55efbecc3152d614cfe6c2247a3054cfe358cefbf794f8c79c8575456efe19",
+ "sha256:58b54459221fd3f661a7329f177f091eb35cf7a603f01d9eb3eb11cc348d38c4",
+ "sha256:628f6b8f97b8bad2490240aa84f3e68f390e13fabc9af5c0d3b96b485921cd60",
+ "sha256:768fa9208df2bec4b2ce61dbc7c2ddd6b1be9fb48f1f8d3b78b3332c7d71c1ff",
+ "sha256:82acef724fc639699b4d3177ed5cc14c2a5aacd92edd578a9e846d5b5ec18ddf",
+ "sha256:93789f14ca2244fb91ed481456f6d0bb8af1f75a330e133b67d08f06ad85b516",
+ "sha256:9492320eed573a13a0bc09a2957f17aa733fff9ce5bf00e66e6d4a88ec33813f",
+ "sha256:a6e1f62a92c645e2919b65c02e79d1f61e78a58eddaebca6c23659e7c7cb4ac7",
+ "sha256:bd53da65f1085fb5b307c38fd3c0829e76acf7b2a912d8d79cadcdb4875c1eb7",
+ "sha256:da62e87637c8838b325e65beee485f71eb36202ce8e3cdbc24b9fcb8b99a37be",
+ "sha256:e1e7393e9c56128e870b233c82ceb42164966f25b30f68acbb24ed69ce9c3a4e",
+ "sha256:e98ad088edfe2f3b85a925ee96da652028f093d6b9b56b76fc242d8abb8e2059",
+ "sha256:f9b85eaa1f653abd0a70603b8b7008d9e00c9fa1bbd0bf40dad3f0c0bdd06793"
],
"index": "pypi",
"markers": "python_version >= '3.7'",
- "version": "==0.5.3"
+ "version": "==0.5.4"
},
"safety": {
"hashes": [
@@ -1332,11 +1352,11 @@
},
"setuptools": {
"hashes": [
- "sha256:3d8531791a27056f4a38cd3e54084d8b1c4228ff9cf3f2d7dd075ec99f9fd70d",
- "sha256:f501b6e6db709818dc76882582d9c516bf3b67b948864c5fa1d1624c09a49207"
+ "sha256:032d42ee9fb536e33087fb66cac5f840eb9391ed05637b3f2a76a7c8fb477936",
+ "sha256:33874fdc59b3188304b2e7c80d9029097ea31627180896fb549c578ceb8a0855"
],
"markers": "python_version >= '3.8'",
- "version": "==71.0.3"
+ "version": "==71.1.0"
},
"shellingham": {
"hashes": [
@@ -1409,4 +1429,4 @@
"version": "==0.2.13"
}
}
-}
+}
\ No newline at end of file
diff --git a/tests/sources/xml/test_marc.py b/tests/sources/xml/test_marc.py
index a6913a9..6651903 100644
--- a/tests/sources/xml/test_marc.py
+++ b/tests/sources/xml/test_marc.py
@@ -1,11 +1,56 @@
+# ruff: noqa: E501, SLF001
import logging
+import pytest
from bs4 import BeautifulSoup # type: ignore[import-untyped]
import transmogrifier.models as timdex
+from transmogrifier.exceptions import SkippedRecordEvent
from transmogrifier.sources.xml.marc import Marc
+def create_marc_source_record_stub(
+ leader_field_insert: str = "03282nam 2200721Ki 4500",
+ control_field_insert: str = (
+ '170906s2016 fr mun| o e zxx d'
+ ),
+ datafield_insert: str = "",
+):
+ """
+ Create source record for unit tests.
+
+ Args:
+ leader_field_insert (str): A string representing a MARC fixed length 'leader'
+ XML element. Defaults to a dummy value.
+ control_field_insert (str): A string representing a MARC fixed length
+ 'general info control field' (i.e., code 008) XML element.
+ Defaults to a dummy value.
+ datafield_insert (str): A string representing a MARC 'datafield' XML element.
+
+ Note: A source record for "missing" field method tests can be created by
+ setting datafield_insert = "" (the default).
+ """
+ xml_string = """
+
+
+ {leader_field_insert}
+ {control_field_insert}
+ 990027185640106761
+ {datafield_insert}
+
+
+ """
+
+ return BeautifulSoup(
+ xml_string.format(
+ leader_field_insert=leader_field_insert,
+ control_field_insert=control_field_insert,
+ datafield_insert=datafield_insert,
+ ),
+ "xml",
+ )
+
+
def test_marc_record_all_fields_transform_correctly():
marc_xml_records = Marc.parse_source_file(
"tests/fixtures/marc/marc_record_all_fields.xml"
@@ -748,32 +793,192 @@ def test_marc_record_with_missing_optional_fields_transforms_correctly():
)
-def test_marc_record_missing_leader_logs_error(caplog):
+def test_get_leader_field_success():
+ source_record = create_marc_source_record_stub()
+ assert Marc._get_leader_field(source_record) == "03282nam 2200721Ki 4500"
+
+
+def test_get_leader_field_raises_skipped_record_event_if_field_blank():
+ source_record = create_marc_source_record_stub(
+ leader_field_insert=""
+ )
+ with pytest.raises(
+ SkippedRecordEvent,
+ match=("Record skipped because key information is missing: ."),
+ ):
+ Marc._get_leader_field(source_record)
+
+
+def test_get_leader_field_raises_skipped_record_event_if_field_missing():
+ source_record = create_marc_source_record_stub(leader_field_insert="")
+ with pytest.raises(
+ SkippedRecordEvent,
+ match=("Record skipped because key information is missing: ."),
+ ):
+ Marc._get_leader_field(source_record)
+
+
+def test_get_control_field_success():
+ source_record = create_marc_source_record_stub()
+ assert Marc._get_control_field(source_record) == (
+ "170906s2016 fr mun| o e zxx d"
+ )
+
+
+def test_get_control_field_raises_skipped_record_event_if_field_blank():
+ source_record = create_marc_source_record_stub(
+ control_field_insert=''
+ )
+ with pytest.raises(
+ SkippedRecordEvent,
+ match=(
+ 'Record skipped because key information is missing: .'
+ ),
+ ):
+ Marc._get_control_field(source_record)
+
+
+def test_get_control_field_raises_skipped_record_event_if_field_missing():
+ source_record = create_marc_source_record_stub(control_field_insert="")
+ with pytest.raises(
+ SkippedRecordEvent,
+ match=(
+ 'Record skipped because key information is missing: .'
+ ),
+ ):
+ Marc._get_control_field(source_record)
+
+
+def test_get_alternate_titles_success():
+ source_record = create_marc_source_record_stub(
+ datafield_insert=(
+ """
+
+ Main Entry
+ Date 1
+ Date 2
+
+
+ Uniform
+ Date 1
+ Date 2
+
+
+ Varying Form
+ Of Title 1.
+
+
+ Added Entry 2
+ Part 1
+ Part 2
+
+
+ Added Entry 1
+ Part 1
+ Part 2
+
+ """
+ )
+ )
+ assert Marc.get_alternate_titles(source_record) == [
+ timdex.AlternateTitle(value="Main Entry Date 1 Date 2", kind="Preferred Title"),
+ timdex.AlternateTitle(value="Uniform Date 1 Date 2", kind="Preferred Title"),
+ timdex.AlternateTitle(
+ value="Varying Form Of Title 1", kind="Varying Form of Title"
+ ),
+ timdex.AlternateTitle(
+ value="Added Entry 2 Part 1 Part 2", kind="Preferred Title"
+ ),
+ timdex.AlternateTitle(
+ value="Added Entry 1 Part 1 Part 2",
+ kind="Uncontrolled Related/Analytical Title",
+ ),
+ ]
+
+
+def test_get_alternate_titles_transforms_correctly_if_fields_blank():
+ source_record = create_marc_source_record_stub(
+ datafield_insert=(
+ """
+
+
+
+
+
+ """
+ )
+ )
+ assert Marc.get_alternate_titles(source_record) is None
+
+
+def test_get_alternate_titles_transforms_correctly_if_fields_missing():
+ source_record = create_marc_source_record_stub()
+ assert Marc.get_alternate_titles(source_record) is None
+
+
+def test_get_call_numbers_success():
+ source_record = create_marc_source_record_stub(
+ datafield_insert=(
+ """
+
+ MA123.4
+ LC Call Number 2
+
+
+ 123.45
+ Dewey Call Number 2
+
+
+ Dewey Call Number 3
+
+ """
+ )
+ )
+ assert Marc.get_call_numbers(source_record) == [
+ "MA123.4",
+ "LC Call Number 2",
+ "123.45",
+ "Dewey Call Number 2",
+ "Dewey Call Number 3",
+ ]
+
+
+def test_get_call_numbers_transforms_correctly_if_fields_blank():
+ source_record = create_marc_source_record_stub(
+ datafield_insert=(
+ """
+
+
+
+ """
+ )
+ )
+ assert Marc.get_call_numbers(source_record) is None
+
+
+def test_get_call_numbers_transforms_correctly_if_fields_missing():
+ source_record = create_marc_source_record_stub()
+ assert Marc.get_call_numbers(source_record) is None
+
+
+def test_marc_record_missing_leader_skips_record(caplog):
marc_xml_records = Marc.parse_source_file(
"tests/fixtures/marc/marc_record_missing_leader.xml"
)
output_records = Marc("alma", marc_xml_records)
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
- assert (
- "transmogrifier.sources.xml.marc",
- logging.ERROR,
- "Record ID 990027185640106761 is missing MARC leader",
- ) in caplog.record_tuples
+ assert output_records.skipped_record_count == 1
-def test_marc_record_missing_008_logs_error(caplog):
+def test_marc_record_missing_008_skips_record(caplog):
marc_xml_records = Marc.parse_source_file(
"tests/fixtures/marc/marc_record_missing_008.xml"
)
output_records = Marc("alma", marc_xml_records)
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 1
- assert (
- "transmogrifier.sources.xml.marc",
- logging.ERROR,
- "Record ID 990027185640106761 is missing MARC 008 field",
- ) in caplog.record_tuples
+ assert output_records.skipped_record_count == 1
def test_create_subfield_value_list_from_datafield_with_values():
diff --git a/transmogrifier/sources/xml/marc.py b/transmogrifier/sources/xml/marc.py
index 098fffc..4b22fae 100644
--- a/transmogrifier/sources/xml/marc.py
+++ b/transmogrifier/sources/xml/marc.py
@@ -4,137 +4,63 @@
import transmogrifier.models as timdex
from transmogrifier.config import load_external_config
+from transmogrifier.exceptions import SkippedRecordEvent
from transmogrifier.helpers import validate_date
from transmogrifier.sources.xmltransformer import XMLTransformer
logger = logging.getLogger(__name__)
-country_code_crosswalk = load_external_config("config/loc-countries.xml", "xml")
-
-holdings_collection_crosswalk = load_external_config(
- "config/holdings_collection_crosswalk.json", "json"
-)
-holdings_format_crosswalk = load_external_config(
- "config/holdings_format_crosswalk.json", "json"
-)
-holdings_location_crosswalk = load_external_config(
- "config/holdings_location_crosswalk.json", "json"
-)
-
-language_code_crosswalk = load_external_config("config/loc-languages.xml", "xml")
-
-marc_content_type_crosswalk = load_external_config(
- "config/marc_content_type_crosswalk.json", "json"
-)
-
-
class Marc(XMLTransformer):
"""Marc transformer."""
- def get_optional_fields(self, xml: Tag) -> dict | None:
+ country_code_crosswalk = load_external_config("config/loc-countries.xml", "xml")
+ holdings_collection_crosswalk = load_external_config(
+ "config/holdings_collection_crosswalk.json", "json"
+ )
+ holdings_format_crosswalk = load_external_config(
+ "config/holdings_format_crosswalk.json", "json"
+ )
+ holdings_location_crosswalk = load_external_config(
+ "config/holdings_location_crosswalk.json", "json"
+ )
+ language_code_crosswalk = load_external_config("config/loc-languages.xml", "xml")
+ marc_content_type_crosswalk = load_external_config(
+ "config/marc_content_type_crosswalk.json", "json"
+ )
+
+ def get_optional_fields(self, source_record: Tag) -> dict | None:
"""
Retrieve optional TIMDEX fields from a MARC XML record.
Overrides metaclass get_optional_fields() method.
Args:
- xml: A BeautifulSoup Tag representing a single MARC XML record.
+ source_record: A BeautifulSoup Tag representing a single MARC XML record.
"""
fields: dict = {}
- source_record_id = Marc.get_source_record_id(xml)
-
- fixed_length_data = xml.find("controlfield", tag="008", string=True)
- if fixed_length_data is None:
- message = f"Record ID {source_record_id} is missing MARC 008 field"
- logger.error(message)
- return None
+ source_record_id = self.get_source_record_id(source_record)
- leader = xml.find("leader", string=True)
- if leader is None:
- message = f"Record ID {source_record_id} is missing MARC leader"
- logger.error(message)
- return None
-
- # alternate_titles
- alternate_title_marc_fields = [
- {
- "tag": "130",
- "subfields": "adfghklmnoprst",
- "kind": "Preferred Title",
- },
- {
- "tag": "240",
- "subfields": "adfghklmnoprs",
- "kind": "Preferred Title",
- },
- {
- "tag": "246",
- "subfields": "abfghinp",
- "kind": "Varying Form of Title",
- },
- {
- "tag": "730",
- "subfields": "adfghiklmnoprst",
- "kind": "Preferred Title",
- },
- {
- "tag": "740",
- "subfields": "anp",
- "kind": "Uncontrolled Related/Analytical Title",
- },
- ]
- for alternate_title_marc_field in alternate_title_marc_fields:
- for datafield in xml.find_all(
- "datafield", tag=alternate_title_marc_field["tag"]
- ):
- if alternate_title_value := (
- self.create_subfield_value_string_from_datafield(
- datafield,
- alternate_title_marc_field["subfields"],
- " ",
- )
- ):
- fields.setdefault("alternate_titles", []).append(
- timdex.AlternateTitle(
- value=alternate_title_value.rstrip(" .,/"),
- kind=alternate_title_marc_field["kind"],
- )
- )
+ # alternate titles
+ fields["alternate_titles"] = self.get_alternate_titles(source_record)
# call_numbers
- call_number_marc_fields = [
- {
- "tag": "050",
- "subfields": "a",
- },
- {
- "tag": "082",
- "subfields": "a",
- },
- ]
- for call_number_marc_field in call_number_marc_fields:
- for datafield in xml.find_all("datafield", tag=call_number_marc_field["tag"]):
- for call_number_value in self.create_subfield_value_list_from_datafield(
- datafield,
- call_number_marc_field["subfields"],
- ):
- fields.setdefault("call_numbers", []).append(call_number_value)
+ fields["call_numbers"] = self.get_call_numbers(source_record)
# citation not used in MARC
# content_type
if content_type := Marc.json_crosswalk_code_to_name(
- str(leader.string)[6:7],
- marc_content_type_crosswalk,
+ self._get_leader_field(source_record)[6:7],
+ self.marc_content_type_crosswalk,
source_record_id,
"Leader/06",
):
fields["content_type"] = [content_type]
# contents
- for datafield in xml.find_all("datafield", tag="505"):
+ for datafield in source_record.find_all("datafield", tag="505"):
for contents_value in self.create_subfield_value_list_from_datafield(
datafield,
"agrt",
@@ -171,7 +97,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
]
contributor_values = []
for contributor_marc_field in contributor_marc_fields:
- for datafield in xml.find_all("datafield", tag=contributor_marc_field["tag"]):
+ for datafield in source_record.find_all(
+ "datafield", tag=contributor_marc_field["tag"]
+ ):
if contributor_value := (
self.create_subfield_value_string_from_datafield(
datafield,
@@ -203,7 +131,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
fields["contributors"] = contributor_values or None
# dates
- publication_year = str(fixed_length_data.string)[7:11].strip()
+ publication_year = self._get_control_field(source_record)[7:11].strip()
if validate_date(publication_year, source_record_id):
fields["dates"] = [
timdex.Date(kind="Publication date", value=publication_year)
@@ -211,7 +139,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# edition
edition_values = []
- for datafield in xml.find_all("datafield", tag="250"):
+ for datafield in source_record.find_all("datafield", tag="250"):
if edition_value := self.create_subfield_value_string_from_datafield(
datafield, "ab", " "
):
@@ -226,25 +154,25 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# holdings
# physical items
- for datafield in xml.find_all("datafield", tag="985"):
+ for datafield in source_record.find_all("datafield", tag="985"):
holding_call_number_value = self.create_subfield_value_string_from_datafield(
datafield, ["bb"]
)
holding_collection_value = Marc.json_crosswalk_code_to_name(
self.create_subfield_value_string_from_datafield(datafield, ["aa"]),
- holdings_collection_crosswalk,
+ self.holdings_collection_crosswalk,
source_record_id,
"985 $aa",
)
holding_format_value = Marc.json_crosswalk_code_to_name(
self.create_subfield_value_string_from_datafield(datafield, "t"),
- holdings_format_crosswalk,
+ self.holdings_format_crosswalk,
source_record_id,
"985 $t",
)
holding_location_value = Marc.json_crosswalk_code_to_name(
self.create_subfield_value_string_from_datafield(datafield, "i"),
- holdings_location_crosswalk,
+ self.holdings_location_crosswalk,
source_record_id,
"985 $i",
)
@@ -268,7 +196,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)
)
# electronic portfolio items
- for field_986 in xml.find_all("datafield", tag="986"):
+ for field_986 in source_record.find_all("datafield", tag="986"):
electronic_item_collection = self.get_single_subfield_string(field_986, "j")
electronic_item_location = (
self.get_single_subfield_string(field_986, "f")
@@ -330,7 +258,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
},
]
for identifier_marc_field in identifier_marc_fields:
- for datafield in xml.find_all("datafield", tag=identifier_marc_field["tag"]):
+ for datafield in source_record.find_all(
+ "datafield", tag=identifier_marc_field["tag"]
+ ):
if identifier_value := (
self.create_subfield_value_string_from_datafield(
datafield,
@@ -350,9 +280,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# Get language codes
language_codes = []
- if fixed_language_value := str(fixed_length_data.string)[35:38]:
+ if fixed_language_value := self._get_control_field(source_record)[35:38]:
language_codes.append(fixed_language_value)
- for field_041 in xml.find_all("datafield", tag="041"):
+ for field_041 in source_record.find_all("datafield", tag="041"):
language_codes.extend(
self.create_subfield_value_list_from_datafield(field_041, "abdefghjkmn")
)
@@ -360,12 +290,12 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# Crosswalk codes to names
for language_code in list(dict.fromkeys(language_codes)):
if language_name := Marc.loc_crosswalk_code_to_name(
- language_code, language_code_crosswalk, source_record_id, "language"
+ language_code, self.language_code_crosswalk, source_record_id, "language"
):
languages.append(language_name) # noqa: PERF401
# Add language notes
- for field_546 in xml.find_all("datafield", tag="546"):
+ for field_546 in source_record.find_all("datafield", tag="546"):
if language_note := field_546.find("subfield", code="a", string=True):
languages.append(str(language_note.string).rstrip(" .")) # noqa: PERF401
@@ -375,7 +305,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# If indicator 1 is 4 and indicator 2 is 0 or 1, take the URL from subfield u,
# the kind from subfield 3, link text from subfield y, and restrictions from
# subfield z."
- for datafield in xml.find_all("datafield", tag="856", ind1="4", ind2=["0", "1"]):
+ for datafield in source_record.find_all(
+ "datafield", tag="856", ind1="4", ind2=["0", "1"]
+ ):
url_value = self.create_subfield_value_list_from_datafield(datafield, "u")
text_value = self.create_subfield_value_list_from_datafield(datafield, "y")
restrictions_value = self.create_subfield_value_list_from_datafield(
@@ -398,22 +330,29 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# by leader "Type of Record" position = "Language Material" or "Manuscript
# language material" and "Bibliographic level" position =
# "Monographic component part," "Collection," "Subunit," or "Monograph/Item."
- if leader.string[6:7] in "at" and leader.string[7:8] in "acdm":
- if fixed_length_data.string[33:34] in "0se":
+ if (
+ self._get_leader_field(source_record)[6:7] in "at"
+ and self._get_leader_field(source_record)[7:8] in "acdm"
+ ):
+ if self._get_control_field(source_record)[33:34] in "0se":
fields["literary_form"] = "Nonfiction"
- elif fixed_length_data.string[33:34]:
+ elif self._get_control_field(source_record)[33:34]:
fields["literary_form"] = "Fiction"
# locations
# Get place of publication from 008 field code
- if fixed_location_code := str(fixed_length_data.string)[15:17]: # noqa: SIM102
- if location_name := Marc.loc_crosswalk_code_to_name(
- fixed_location_code, country_code_crosswalk, source_record_id, "country"
- ):
- fields.setdefault("locations", []).append(
- timdex.Location(value=location_name, kind="Place of Publication")
- )
+ if (fixed_location_code := self._get_control_field(source_record)[15:17]) and (
+ location_name := Marc.loc_crosswalk_code_to_name(
+ fixed_location_code,
+ self.country_code_crosswalk,
+ source_record_id,
+ "country",
+ )
+ ):
+ fields.setdefault("locations", []).append(
+ timdex.Location(value=location_name, kind="Place of Publication")
+ )
# Get other locations
location_marc_fields = [
@@ -429,7 +368,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
},
]
for location_marc_field in location_marc_fields:
- for datafield in xml.find_all("datafield", tag=location_marc_field["tag"]):
+ for datafield in source_record.find_all(
+ "datafield", tag=location_marc_field["tag"]
+ ):
if location_value := (
self.create_subfield_value_string_from_datafield(
datafield,
@@ -508,7 +449,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
},
]
for note_marc_field in note_marc_fields:
- for datafield in xml.find_all("datafield", tag=note_marc_field["tag"]):
+ for datafield in source_record.find_all(
+ "datafield", tag=note_marc_field["tag"]
+ ):
if note_value := (
self.create_subfield_value_string_from_datafield(
datafield,
@@ -527,19 +470,19 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
if numbering_values := [
self.create_subfield_value_string_from_datafield(datafield, "a", " ")
- for datafield in xml.find_all("datafield", tag="362")
+ for datafield in source_record.find_all("datafield", tag="362")
]:
fields["numbering"] = " ".join(numbering_values) or None
# physical_description
if physical_description_values := [
self.create_subfield_value_string_from_datafield(datafield, "abcefg", " ")
- for datafield in xml.find_all("datafield", tag="300")
+ for datafield in source_record.find_all("datafield", tag="300")
]:
fields["physical_description"] = " ".join(physical_description_values) or None
# publication_frequency
- for datafield in xml.find_all("datafield", tag="310"):
+ for datafield in source_record.find_all("datafield", tag="310"):
if publication_frequency_value := (
self.create_subfield_value_string_from_datafield(datafield, "a", " ")
):
@@ -549,7 +492,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# publishers
for publisher_marc_field in ["260", "264"]:
- for datafield in xml.find_all("datafield", tag=publisher_marc_field):
+ for datafield in source_record.find_all(
+ "datafield", tag=publisher_marc_field
+ ):
publisher_name = self.get_single_subfield_string(datafield, "b")
publisher_date = self.get_single_subfield_string(datafield, "c")
publisher_location = self.get_single_subfield_string(datafield, "a")
@@ -610,7 +555,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
},
]
for related_item_marc_field in related_item_marc_fields:
- for datafield in xml.find_all(
+ for datafield in source_record.find_all(
"datafield", tag=related_item_marc_field["tag"]
):
if related_item_value := (
@@ -653,7 +598,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
},
]
for subject_marc_field in subject_marc_fields:
- for datafield in xml.find_all("datafield", tag=subject_marc_field["tag"]):
+ for datafield in source_record.find_all(
+ "datafield", tag=subject_marc_field["tag"]
+ ):
if subject_value := (
self.create_subfield_value_string_from_datafield(
datafield,
@@ -669,7 +616,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)
# summary
- for datafield in xml.find_all("datafield", tag="520"):
+ for datafield in source_record.find_all("datafield", tag="520"):
if summary_value := self.create_subfield_value_string_from_datafield(
datafield, "a", " "
):
@@ -783,6 +730,103 @@ def loc_crosswalk_code_to_name(
)
return str(code_element.parent.find("name").string)
+ @classmethod
+ def _get_leader_field(cls, source_record: Tag) -> str:
+ if leader := source_record.find("leader", string=True):
+ return str(leader.string)
+ message = "Record skipped because key information is missing: ."
+ raise SkippedRecordEvent(message)
+
+ @classmethod
+ def _get_control_field(cls, source_record: Tag) -> str:
+ if control_field := source_record.find("controlfield", tag="008", string=True):
+ return str(control_field.string)
+ message = (
+ "Record skipped because key information is missing: "
+ '.'
+ )
+ raise SkippedRecordEvent(message)
+
+ @classmethod
+ def get_alternate_titles(
+ cls, source_record: Tag
+ ) -> list[timdex.AlternateTitle] | None:
+ alternate_titles = []
+ alternate_title_marc_fields = [
+ {
+ "tag": "130",
+ "subfields": "adfghklmnoprst",
+ "kind": "Preferred Title",
+ },
+ {
+ "tag": "240",
+ "subfields": "adfghklmnoprs",
+ "kind": "Preferred Title",
+ },
+ {
+ "tag": "246",
+ "subfields": "abfghinp",
+ "kind": "Varying Form of Title",
+ },
+ {
+ "tag": "730",
+ "subfields": "adfghiklmnoprst",
+ "kind": "Preferred Title",
+ },
+ {
+ "tag": "740",
+ "subfields": "anp",
+ "kind": "Uncontrolled Related/Analytical Title",
+ },
+ ]
+ for alternate_title_marc_field in alternate_title_marc_fields:
+ alternate_titles.extend(
+ [
+ timdex.AlternateTitle(
+ value=alternate_title_value.rstrip(" .,/"),
+ kind=alternate_title_marc_field["kind"],
+ )
+ for datafield in source_record.find_all(
+ "datafield", tag=alternate_title_marc_field["tag"]
+ )
+ if (
+ alternate_title_value := (
+ cls.create_subfield_value_string_from_datafield(
+ datafield,
+ alternate_title_marc_field["subfields"],
+ " ",
+ )
+ )
+ )
+ ]
+ )
+ return alternate_titles or None
+
+ @classmethod
+ def get_call_numbers(cls, source_record: Tag) -> list[str] | None:
+ call_numbers: list = []
+ call_number_marc_fields = [
+ {
+ "tag": "050",
+ "subfields": "a",
+ },
+ {
+ "tag": "082",
+ "subfields": "a",
+ },
+ ]
+ for call_number_marc_field in call_number_marc_fields:
+ for datafield in source_record.find_all(
+ "datafield", tag=call_number_marc_field["tag"]
+ ):
+ call_numbers.extend(
+ call_number
+ for call_number in cls.create_subfield_value_list_from_datafield(
+ datafield, call_number_marc_field["subfields"]
+ )
+ )
+ return call_numbers or None
+
@staticmethod
def get_main_titles(xml: Tag) -> list[str]:
"""