From 61db0f7c9bbf0573040dcdfc2821c8923f6bae31 Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Tue, 9 Jul 2024 09:11:30 -0400 Subject: [PATCH] Field method refactor for Marc transform Why these changes are being introduced: * These updates are required to implement the architecture described in the following ADR: https://github.com/MITLibraries/transmogrifier/blob/main/docs/adrs/0005-field-methods.md How this addresses that need: * Add field methods and corresponding unit tests: alternate_titles, call_numbers * Add private methods for key MARC elements: leader and control field '008' * Rename 'xml' -> 'source_record' * Update dependencies Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-288 --- Pipfile.lock | 146 ++++++++------ tests/sources/xml/test_marc.py | 229 +++++++++++++++++++-- transmogrifier/sources/xml/marc.py | 308 ++++++++++++++++------------- 3 files changed, 476 insertions(+), 207 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index afd4e68..f8cf30a 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -691,41 +691,36 @@ }, "cryptography": { "hashes": [ - "sha256:013629ae70b40af70c9a7a5db40abe5d9054e6f4380e50ce769947b73bf3caad", - "sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583", - "sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b", - "sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c", - "sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1", - "sha256:343728aac38decfdeecf55ecab3264b015be68fc2816ca800db649607aeee648", - "sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949", - "sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba", - "sha256:5a94eccb2a81a309806027e1670a358b99b8fe8bfe9f8d329f27d72c094dde8c", - "sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9", - "sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d", - "sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c", - "sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e", - "sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2", - "sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d", - "sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7", - "sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70", - "sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2", - "sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7", - "sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14", - "sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe", - "sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e", - "sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71", - "sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961", - "sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7", - "sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c", - "sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28", - "sha256:dec9b018df185f08483f294cae6ccac29e7a6e0678996587363dc352dc65c842", - "sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902", - "sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801", - "sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a", - "sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e" + "sha256:0663585d02f76929792470451a5ba64424acc3cd5227b03921dab0e2f27b1709", + "sha256:08a24a7070b2b6804c1940ff0f910ff728932a9d0e80e7814234269f9d46d069", + "sha256:232ce02943a579095a339ac4b390fbbe97f5b5d5d107f8a08260ea2768be8cc2", + "sha256:2905ccf93a8a2a416f3ec01b1a7911c3fe4073ef35640e7ee5296754e30b762b", + "sha256:299d3da8e00b7e2b54bb02ef58d73cd5f55fb31f33ebbf33bd00d9aa6807df7e", + "sha256:2c6d112bf61c5ef44042c253e4859b3cbbb50df2f78fa8fae6747a7814484a70", + "sha256:31e44a986ceccec3d0498e16f3d27b2ee5fdf69ce2ab89b52eaad1d2f33d8778", + "sha256:3d9a1eca329405219b605fac09ecfc09ac09e595d6def650a437523fcd08dd22", + "sha256:3dcdedae5c7710b9f97ac6bba7e1052b95c7083c9d0e9df96e02a1932e777895", + "sha256:47ca71115e545954e6c1d207dd13461ab81f4eccfcb1345eac874828b5e3eaaf", + "sha256:4a997df8c1c2aae1e1e5ac49c2e4f610ad037fc5a3aadc7b64e39dea42249431", + "sha256:51956cf8730665e2bdf8ddb8da0056f699c1a5715648c1b0144670c1ba00b48f", + "sha256:5bcb8a5620008a8034d39bce21dc3e23735dfdb6a33a06974739bfa04f853947", + "sha256:64c3f16e2a4fc51c0d06af28441881f98c5d91009b8caaff40cf3548089e9c74", + "sha256:6e2b11c55d260d03a8cf29ac9b5e0608d35f08077d8c087be96287f43af3ccdc", + "sha256:7b3f5fe74a5ca32d4d0f302ffe6680fcc5c28f8ef0dc0ae8f40c0f3a1b4fca66", + "sha256:844b6d608374e7d08f4f6e6f9f7b951f9256db41421917dfb2d003dde4cd6b66", + "sha256:9a8d6802e0825767476f62aafed40532bd435e8a5f7d23bd8b4f5fd04cc80ecf", + "sha256:aae4d918f6b180a8ab8bf6511a419473d107df4dbb4225c7b48c5c9602c38c7f", + "sha256:ac1955ce000cb29ab40def14fd1bbfa7af2017cca696ee696925615cafd0dce5", + "sha256:b88075ada2d51aa9f18283532c9f60e72170041bba88d7f37e49cbb10275299e", + "sha256:cb013933d4c127349b3948aa8aaf2f12c0353ad0eccd715ca789c8a0f671646f", + "sha256:cc70b4b581f28d0a254d006f26949245e3657d40d8857066c2ae22a61222ef55", + "sha256:e9c5266c432a1e23738d178e51c2c7a5e2ddf790f248be939448c0ba2021f9d1", + "sha256:ea9e57f8ea880eeea38ab5abf9fbe39f923544d7884228ec67d666abd60f5a47", + "sha256:ee0c405832ade84d4de74b9029bedb7b31200600fa524d218fc29bfa371e97f5", + "sha256:fdcb265de28585de5b859ae13e3846a8e805268a823a12a4da2597f1f5afc9f0" ], "markers": "python_version >= '3.7'", - "version": "==42.0.8" + "version": "==43.0.0" }, "decorator": { "hashes": [ @@ -921,8 +916,33 @@ }, "mypy": { "hashes": [ + "sha256:0bea2a0e71c2a375c9fa0ede3d98324214d67b3cbbfcbd55ac8f750f85a414e3", + "sha256:104e9c1620c2675420abd1f6c44bab7dd33cc85aea751c985006e83dcd001095", + "sha256:14f9294528b5f5cf96c721f231c9f5b2733164e02c1c018ed1a0eff8a18005ac", + "sha256:1a5d8d8dd8613a3e2be3eae829ee891b6b2de6302f24766ff06cb2875f5be9c6", + "sha256:1d44c1e44a8be986b54b09f15f2c1a66368eb43861b4e82573026e04c48a9e20", "sha256:25bcfa75b9b5a5f8d67147a54ea97ed63a653995a82798221cca2a315c0238c1", - "sha256:56913ec8c7638b0091ef4da6fcc9136896914a9d60d54670a75880c3e5b99ace" + "sha256:35ce88b8ed3a759634cb4eb646d002c4cef0a38f20565ee82b5023558eb90c00", + "sha256:56913ec8c7638b0091ef4da6fcc9136896914a9d60d54670a75880c3e5b99ace", + "sha256:65f190a6349dec29c8d1a1cd4aa71284177aee5949e0502e6379b42873eddbe7", + "sha256:6801319fe76c3f3a3833f2b5af7bd2c17bb93c00026a2a1b924e6762f5b19e13", + "sha256:72596a79bbfb195fd41405cffa18210af3811beb91ff946dbcb7368240eed6be", + "sha256:93743608c7348772fdc717af4aeee1997293a1ad04bc0ea6efa15bf65385c538", + "sha256:940bfff7283c267ae6522ef926a7887305945f716a7704d3344d6d07f02df850", + "sha256:96f8dbc2c85046c81bcddc246232d500ad729cb720da4e20fce3b542cab91287", + "sha256:98790025861cb2c3db8c2f5ad10fc8c336ed2a55f4daf1b8b3f877826b6ff2eb", + "sha256:a3824187c99b893f90c845bab405a585d1ced4ff55421fdf5c84cb7710995229", + "sha256:a83ec98ae12d51c252be61521aa5731f5512231d0b738b4cb2498344f0b840cd", + "sha256:becc9111ca572b04e7e77131bc708480cc88a911adf3d0239f974c034b78085c", + "sha256:c1a184c64521dc549324ec6ef7cbaa6b351912be9cb5edb803c2808a0d7e85ac", + "sha256:c7b73a856522417beb78e0fb6d33ef89474e7a622db2653bc1285af36e2e3e3d", + "sha256:cea3d0fb69637944dd321f41bc896e11d0fb0b0aa531d887a6da70f6e7473aba", + "sha256:d2b3d36baac48e40e3064d2901f2fbd2a2d6880ec6ce6358825c85031d7c0d4d", + "sha256:d7b54c27783991399046837df5c7c9d325d921394757d09dbcbf96aee4649fe9", + "sha256:d8e2e43977f0e09f149ea69fd0556623919f816764e26d74da0c8a7b48f3e18a", + "sha256:dbe286303241fea8c2ea5466f6e0e6a046a135a7e7609167b07fd4e7baf151bf", + "sha256:f006e955718ecd8d159cee9932b64fba8f86ee6f7728ca3ac66c3a54b0062abe", + "sha256:f2268d9fcd9686b61ab64f077be7ffbc6fbcdfb4103e5dd0cc5eaab53a8886c2" ], "index": "pypi", "markers": "python_version >= '3.8'", @@ -1018,10 +1038,10 @@ }, "pure-eval": { "hashes": [ - "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350", - "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3" + "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", + "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42" ], - "version": "==0.2.2" + "version": "==0.2.3" }, "pycparser": { "hashes": [ @@ -1144,12 +1164,12 @@ }, "pytest": { "hashes": [ - "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343", - "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977" + "sha256:7e8e5c5abd6e93cb1cc151f23e57adc31fcf8cfd2a3ff2da63e23f732de35db6", + "sha256:e9600ccf4f563976e2c99fa02c7624ab938296551f280835ee6516df8bc4ae8c" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==8.2.2" + "version": "==8.3.1" }, "pyyaml": { "hashes": [ @@ -1290,28 +1310,28 @@ }, "ruff": { "hashes": [ - "sha256:03bfe9ab5bdc0b08470c3b261643ad54ea86edc32b64d1e080892d7953add3ad", - "sha256:05fbd2cb404775d6cd7f2ff49504e2d20e13ef95fa203bd1ab22413af70d420b", - "sha256:08058d077e21b856d32ebf483443390e29dc44d927608dc8f092ff6776519da9", - "sha256:2a3eb4f1841771fa5b67a56be9c2d16fd3cc88e378bd86aaeaec2f7e6bcdd0a2", - "sha256:642cbff6cbfa38d2566d8db086508d6f472edb136cbfcc4ea65997745368c29e", - "sha256:76bb5a87fd397520b91a83eae8a2f7985236d42dd9459f09eef58e7f5c1d8316", - "sha256:7704582a026fa02cca83efd76671a98ee6eb412c4230209efe5e2a006c06db62", - "sha256:77d49484429ed7c7e6e2e75a753f153b7b58f875bdb4158ad85af166a1ec1822", - "sha256:96066c4328a49fce2dd40e80f7117987369feec30ab771516cf95f1cc2db923c", - "sha256:a8cfc7a26422c78e94f1ec78ec02501bbad2df5834907e75afe474cc6b83a8c1", - "sha256:b12424d9db7347fa63c5ed9af010003338c63c629fb9c9c6adb2aa4f5699729b", - "sha256:b8d72c5684bbd4ed304a9a955ee2e67f57b35f6193222ade910cca8a805490e3", - "sha256:bc697ec874fdd7c7ba0a85ec76ab38f8595224868d67f097c5ffc21136e72fcd", - "sha256:cbaec2ddf4f78e5e9ecf5456ea0f496991358a1d883862ed0b9e947e2b6aea93", - "sha256:cf4bc751240b2fab5d19254571bcacb315c7b0b00bf3c912d52226a82bbec073", - "sha256:d2fc2cdb85ccac1e816cc9d5d8cedefd93661bd957756d902543af32a6b04a71", - "sha256:e791d34d3557a3819b3704bc1f087293c821083fa206812842fa363f6018a192", - "sha256:eafc45dd8bdc37a00b28e68cc038daf3ca8c233d73fea276dcd09defb1352841" + "sha256:029454e2824eafa25b9df46882f7f7844d36fd8ce51c1b7f6d97e2615a57bbcc", + "sha256:09c14ed6a72af9ccc8d2e313d7acf7037f0faff43cde4b507e66f14e812e37f7", + "sha256:0cf497a47751be8c883059c4613ba2f50dd06ec672692de2811f039432875278", + "sha256:2795726d5f71c4f4e70653273d1c23a8182f07dd8e48c12de5d867bfb7557eed", + "sha256:3520a00c0563d7a7a7c324ad7e2cde2355733dafa9592c671fb2e9e3cd8194c1", + "sha256:4c55efbecc3152d614cfe6c2247a3054cfe358cefbf794f8c79c8575456efe19", + "sha256:58b54459221fd3f661a7329f177f091eb35cf7a603f01d9eb3eb11cc348d38c4", + "sha256:628f6b8f97b8bad2490240aa84f3e68f390e13fabc9af5c0d3b96b485921cd60", + "sha256:768fa9208df2bec4b2ce61dbc7c2ddd6b1be9fb48f1f8d3b78b3332c7d71c1ff", + "sha256:82acef724fc639699b4d3177ed5cc14c2a5aacd92edd578a9e846d5b5ec18ddf", + "sha256:93789f14ca2244fb91ed481456f6d0bb8af1f75a330e133b67d08f06ad85b516", + "sha256:9492320eed573a13a0bc09a2957f17aa733fff9ce5bf00e66e6d4a88ec33813f", + "sha256:a6e1f62a92c645e2919b65c02e79d1f61e78a58eddaebca6c23659e7c7cb4ac7", + "sha256:bd53da65f1085fb5b307c38fd3c0829e76acf7b2a912d8d79cadcdb4875c1eb7", + "sha256:da62e87637c8838b325e65beee485f71eb36202ce8e3cdbc24b9fcb8b99a37be", + "sha256:e1e7393e9c56128e870b233c82ceb42164966f25b30f68acbb24ed69ce9c3a4e", + "sha256:e98ad088edfe2f3b85a925ee96da652028f093d6b9b56b76fc242d8abb8e2059", + "sha256:f9b85eaa1f653abd0a70603b8b7008d9e00c9fa1bbd0bf40dad3f0c0bdd06793" ], "index": "pypi", "markers": "python_version >= '3.7'", - "version": "==0.5.3" + "version": "==0.5.4" }, "safety": { "hashes": [ @@ -1332,11 +1352,11 @@ }, "setuptools": { "hashes": [ - "sha256:3d8531791a27056f4a38cd3e54084d8b1c4228ff9cf3f2d7dd075ec99f9fd70d", - "sha256:f501b6e6db709818dc76882582d9c516bf3b67b948864c5fa1d1624c09a49207" + "sha256:032d42ee9fb536e33087fb66cac5f840eb9391ed05637b3f2a76a7c8fb477936", + "sha256:33874fdc59b3188304b2e7c80d9029097ea31627180896fb549c578ceb8a0855" ], "markers": "python_version >= '3.8'", - "version": "==71.0.3" + "version": "==71.1.0" }, "shellingham": { "hashes": [ @@ -1409,4 +1429,4 @@ "version": "==0.2.13" } } -} +} \ No newline at end of file diff --git a/tests/sources/xml/test_marc.py b/tests/sources/xml/test_marc.py index a6913a9..6651903 100644 --- a/tests/sources/xml/test_marc.py +++ b/tests/sources/xml/test_marc.py @@ -1,11 +1,56 @@ +# ruff: noqa: E501, SLF001 import logging +import pytest from bs4 import BeautifulSoup # type: ignore[import-untyped] import transmogrifier.models as timdex +from transmogrifier.exceptions import SkippedRecordEvent from transmogrifier.sources.xml.marc import Marc +def create_marc_source_record_stub( + leader_field_insert: str = "03282nam 2200721Ki 4500", + control_field_insert: str = ( + '170906s2016 fr mun| o e zxx d' + ), + datafield_insert: str = "", +): + """ + Create source record for unit tests. + + Args: + leader_field_insert (str): A string representing a MARC fixed length 'leader' + XML element. Defaults to a dummy value. + control_field_insert (str): A string representing a MARC fixed length + 'general info control field' (i.e., code 008) XML element. + Defaults to a dummy value. + datafield_insert (str): A string representing a MARC 'datafield' XML element. + + Note: A source record for "missing" field method tests can be created by + setting datafield_insert = "" (the default). + """ + xml_string = """ + + + {leader_field_insert} + {control_field_insert} + 990027185640106761 + {datafield_insert} + + + """ + + return BeautifulSoup( + xml_string.format( + leader_field_insert=leader_field_insert, + control_field_insert=control_field_insert, + datafield_insert=datafield_insert, + ), + "xml", + ) + + def test_marc_record_all_fields_transform_correctly(): marc_xml_records = Marc.parse_source_file( "tests/fixtures/marc/marc_record_all_fields.xml" @@ -748,32 +793,192 @@ def test_marc_record_with_missing_optional_fields_transforms_correctly(): ) -def test_marc_record_missing_leader_logs_error(caplog): +def test_get_leader_field_success(): + source_record = create_marc_source_record_stub() + assert Marc._get_leader_field(source_record) == "03282nam 2200721Ki 4500" + + +def test_get_leader_field_raises_skipped_record_event_if_field_blank(): + source_record = create_marc_source_record_stub( + leader_field_insert="" + ) + with pytest.raises( + SkippedRecordEvent, + match=("Record skipped because key information is missing: ."), + ): + Marc._get_leader_field(source_record) + + +def test_get_leader_field_raises_skipped_record_event_if_field_missing(): + source_record = create_marc_source_record_stub(leader_field_insert="") + with pytest.raises( + SkippedRecordEvent, + match=("Record skipped because key information is missing: ."), + ): + Marc._get_leader_field(source_record) + + +def test_get_control_field_success(): + source_record = create_marc_source_record_stub() + assert Marc._get_control_field(source_record) == ( + "170906s2016 fr mun| o e zxx d" + ) + + +def test_get_control_field_raises_skipped_record_event_if_field_blank(): + source_record = create_marc_source_record_stub( + control_field_insert='' + ) + with pytest.raises( + SkippedRecordEvent, + match=( + 'Record skipped because key information is missing: .' + ), + ): + Marc._get_control_field(source_record) + + +def test_get_control_field_raises_skipped_record_event_if_field_missing(): + source_record = create_marc_source_record_stub(control_field_insert="") + with pytest.raises( + SkippedRecordEvent, + match=( + 'Record skipped because key information is missing: .' + ), + ): + Marc._get_control_field(source_record) + + +def test_get_alternate_titles_success(): + source_record = create_marc_source_record_stub( + datafield_insert=( + """ + + Main Entry + Date 1 + Date 2 + + + Uniform + Date 1 + Date 2 + + + Varying Form + Of Title 1. + + + Added Entry 2 + Part 1 + Part 2 + + + Added Entry 1 + Part 1 + Part 2 + + """ + ) + ) + assert Marc.get_alternate_titles(source_record) == [ + timdex.AlternateTitle(value="Main Entry Date 1 Date 2", kind="Preferred Title"), + timdex.AlternateTitle(value="Uniform Date 1 Date 2", kind="Preferred Title"), + timdex.AlternateTitle( + value="Varying Form Of Title 1", kind="Varying Form of Title" + ), + timdex.AlternateTitle( + value="Added Entry 2 Part 1 Part 2", kind="Preferred Title" + ), + timdex.AlternateTitle( + value="Added Entry 1 Part 1 Part 2", + kind="Uncontrolled Related/Analytical Title", + ), + ] + + +def test_get_alternate_titles_transforms_correctly_if_fields_blank(): + source_record = create_marc_source_record_stub( + datafield_insert=( + """ + + + + + + """ + ) + ) + assert Marc.get_alternate_titles(source_record) is None + + +def test_get_alternate_titles_transforms_correctly_if_fields_missing(): + source_record = create_marc_source_record_stub() + assert Marc.get_alternate_titles(source_record) is None + + +def test_get_call_numbers_success(): + source_record = create_marc_source_record_stub( + datafield_insert=( + """ + + MA123.4 + LC Call Number 2 + + + 123.45 + Dewey Call Number 2 + + + Dewey Call Number 3 + + """ + ) + ) + assert Marc.get_call_numbers(source_record) == [ + "MA123.4", + "LC Call Number 2", + "123.45", + "Dewey Call Number 2", + "Dewey Call Number 3", + ] + + +def test_get_call_numbers_transforms_correctly_if_fields_blank(): + source_record = create_marc_source_record_stub( + datafield_insert=( + """ + + + + """ + ) + ) + assert Marc.get_call_numbers(source_record) is None + + +def test_get_call_numbers_transforms_correctly_if_fields_missing(): + source_record = create_marc_source_record_stub() + assert Marc.get_call_numbers(source_record) is None + + +def test_marc_record_missing_leader_skips_record(caplog): marc_xml_records = Marc.parse_source_file( "tests/fixtures/marc/marc_record_missing_leader.xml" ) output_records = Marc("alma", marc_xml_records) assert len(list(output_records)) == 0 assert output_records.processed_record_count == 1 - assert ( - "transmogrifier.sources.xml.marc", - logging.ERROR, - "Record ID 990027185640106761 is missing MARC leader", - ) in caplog.record_tuples + assert output_records.skipped_record_count == 1 -def test_marc_record_missing_008_logs_error(caplog): +def test_marc_record_missing_008_skips_record(caplog): marc_xml_records = Marc.parse_source_file( "tests/fixtures/marc/marc_record_missing_008.xml" ) output_records = Marc("alma", marc_xml_records) assert len(list(output_records)) == 0 assert output_records.processed_record_count == 1 - assert ( - "transmogrifier.sources.xml.marc", - logging.ERROR, - "Record ID 990027185640106761 is missing MARC 008 field", - ) in caplog.record_tuples + assert output_records.skipped_record_count == 1 def test_create_subfield_value_list_from_datafield_with_values(): diff --git a/transmogrifier/sources/xml/marc.py b/transmogrifier/sources/xml/marc.py index 098fffc..4b22fae 100644 --- a/transmogrifier/sources/xml/marc.py +++ b/transmogrifier/sources/xml/marc.py @@ -4,137 +4,63 @@ import transmogrifier.models as timdex from transmogrifier.config import load_external_config +from transmogrifier.exceptions import SkippedRecordEvent from transmogrifier.helpers import validate_date from transmogrifier.sources.xmltransformer import XMLTransformer logger = logging.getLogger(__name__) -country_code_crosswalk = load_external_config("config/loc-countries.xml", "xml") - -holdings_collection_crosswalk = load_external_config( - "config/holdings_collection_crosswalk.json", "json" -) -holdings_format_crosswalk = load_external_config( - "config/holdings_format_crosswalk.json", "json" -) -holdings_location_crosswalk = load_external_config( - "config/holdings_location_crosswalk.json", "json" -) - -language_code_crosswalk = load_external_config("config/loc-languages.xml", "xml") - -marc_content_type_crosswalk = load_external_config( - "config/marc_content_type_crosswalk.json", "json" -) - - class Marc(XMLTransformer): """Marc transformer.""" - def get_optional_fields(self, xml: Tag) -> dict | None: + country_code_crosswalk = load_external_config("config/loc-countries.xml", "xml") + holdings_collection_crosswalk = load_external_config( + "config/holdings_collection_crosswalk.json", "json" + ) + holdings_format_crosswalk = load_external_config( + "config/holdings_format_crosswalk.json", "json" + ) + holdings_location_crosswalk = load_external_config( + "config/holdings_location_crosswalk.json", "json" + ) + language_code_crosswalk = load_external_config("config/loc-languages.xml", "xml") + marc_content_type_crosswalk = load_external_config( + "config/marc_content_type_crosswalk.json", "json" + ) + + def get_optional_fields(self, source_record: Tag) -> dict | None: """ Retrieve optional TIMDEX fields from a MARC XML record. Overrides metaclass get_optional_fields() method. Args: - xml: A BeautifulSoup Tag representing a single MARC XML record. + source_record: A BeautifulSoup Tag representing a single MARC XML record. """ fields: dict = {} - source_record_id = Marc.get_source_record_id(xml) - - fixed_length_data = xml.find("controlfield", tag="008", string=True) - if fixed_length_data is None: - message = f"Record ID {source_record_id} is missing MARC 008 field" - logger.error(message) - return None + source_record_id = self.get_source_record_id(source_record) - leader = xml.find("leader", string=True) - if leader is None: - message = f"Record ID {source_record_id} is missing MARC leader" - logger.error(message) - return None - - # alternate_titles - alternate_title_marc_fields = [ - { - "tag": "130", - "subfields": "adfghklmnoprst", - "kind": "Preferred Title", - }, - { - "tag": "240", - "subfields": "adfghklmnoprs", - "kind": "Preferred Title", - }, - { - "tag": "246", - "subfields": "abfghinp", - "kind": "Varying Form of Title", - }, - { - "tag": "730", - "subfields": "adfghiklmnoprst", - "kind": "Preferred Title", - }, - { - "tag": "740", - "subfields": "anp", - "kind": "Uncontrolled Related/Analytical Title", - }, - ] - for alternate_title_marc_field in alternate_title_marc_fields: - for datafield in xml.find_all( - "datafield", tag=alternate_title_marc_field["tag"] - ): - if alternate_title_value := ( - self.create_subfield_value_string_from_datafield( - datafield, - alternate_title_marc_field["subfields"], - " ", - ) - ): - fields.setdefault("alternate_titles", []).append( - timdex.AlternateTitle( - value=alternate_title_value.rstrip(" .,/"), - kind=alternate_title_marc_field["kind"], - ) - ) + # alternate titles + fields["alternate_titles"] = self.get_alternate_titles(source_record) # call_numbers - call_number_marc_fields = [ - { - "tag": "050", - "subfields": "a", - }, - { - "tag": "082", - "subfields": "a", - }, - ] - for call_number_marc_field in call_number_marc_fields: - for datafield in xml.find_all("datafield", tag=call_number_marc_field["tag"]): - for call_number_value in self.create_subfield_value_list_from_datafield( - datafield, - call_number_marc_field["subfields"], - ): - fields.setdefault("call_numbers", []).append(call_number_value) + fields["call_numbers"] = self.get_call_numbers(source_record) # citation not used in MARC # content_type if content_type := Marc.json_crosswalk_code_to_name( - str(leader.string)[6:7], - marc_content_type_crosswalk, + self._get_leader_field(source_record)[6:7], + self.marc_content_type_crosswalk, source_record_id, "Leader/06", ): fields["content_type"] = [content_type] # contents - for datafield in xml.find_all("datafield", tag="505"): + for datafield in source_record.find_all("datafield", tag="505"): for contents_value in self.create_subfield_value_list_from_datafield( datafield, "agrt", @@ -171,7 +97,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ] contributor_values = [] for contributor_marc_field in contributor_marc_fields: - for datafield in xml.find_all("datafield", tag=contributor_marc_field["tag"]): + for datafield in source_record.find_all( + "datafield", tag=contributor_marc_field["tag"] + ): if contributor_value := ( self.create_subfield_value_string_from_datafield( datafield, @@ -203,7 +131,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: fields["contributors"] = contributor_values or None # dates - publication_year = str(fixed_length_data.string)[7:11].strip() + publication_year = self._get_control_field(source_record)[7:11].strip() if validate_date(publication_year, source_record_id): fields["dates"] = [ timdex.Date(kind="Publication date", value=publication_year) @@ -211,7 +139,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # edition edition_values = [] - for datafield in xml.find_all("datafield", tag="250"): + for datafield in source_record.find_all("datafield", tag="250"): if edition_value := self.create_subfield_value_string_from_datafield( datafield, "ab", " " ): @@ -226,25 +154,25 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # holdings # physical items - for datafield in xml.find_all("datafield", tag="985"): + for datafield in source_record.find_all("datafield", tag="985"): holding_call_number_value = self.create_subfield_value_string_from_datafield( datafield, ["bb"] ) holding_collection_value = Marc.json_crosswalk_code_to_name( self.create_subfield_value_string_from_datafield(datafield, ["aa"]), - holdings_collection_crosswalk, + self.holdings_collection_crosswalk, source_record_id, "985 $aa", ) holding_format_value = Marc.json_crosswalk_code_to_name( self.create_subfield_value_string_from_datafield(datafield, "t"), - holdings_format_crosswalk, + self.holdings_format_crosswalk, source_record_id, "985 $t", ) holding_location_value = Marc.json_crosswalk_code_to_name( self.create_subfield_value_string_from_datafield(datafield, "i"), - holdings_location_crosswalk, + self.holdings_location_crosswalk, source_record_id, "985 $i", ) @@ -268,7 +196,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) ) # electronic portfolio items - for field_986 in xml.find_all("datafield", tag="986"): + for field_986 in source_record.find_all("datafield", tag="986"): electronic_item_collection = self.get_single_subfield_string(field_986, "j") electronic_item_location = ( self.get_single_subfield_string(field_986, "f") @@ -330,7 +258,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: }, ] for identifier_marc_field in identifier_marc_fields: - for datafield in xml.find_all("datafield", tag=identifier_marc_field["tag"]): + for datafield in source_record.find_all( + "datafield", tag=identifier_marc_field["tag"] + ): if identifier_value := ( self.create_subfield_value_string_from_datafield( datafield, @@ -350,9 +280,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # Get language codes language_codes = [] - if fixed_language_value := str(fixed_length_data.string)[35:38]: + if fixed_language_value := self._get_control_field(source_record)[35:38]: language_codes.append(fixed_language_value) - for field_041 in xml.find_all("datafield", tag="041"): + for field_041 in source_record.find_all("datafield", tag="041"): language_codes.extend( self.create_subfield_value_list_from_datafield(field_041, "abdefghjkmn") ) @@ -360,12 +290,12 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # Crosswalk codes to names for language_code in list(dict.fromkeys(language_codes)): if language_name := Marc.loc_crosswalk_code_to_name( - language_code, language_code_crosswalk, source_record_id, "language" + language_code, self.language_code_crosswalk, source_record_id, "language" ): languages.append(language_name) # noqa: PERF401 # Add language notes - for field_546 in xml.find_all("datafield", tag="546"): + for field_546 in source_record.find_all("datafield", tag="546"): if language_note := field_546.find("subfield", code="a", string=True): languages.append(str(language_note.string).rstrip(" .")) # noqa: PERF401 @@ -375,7 +305,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # If indicator 1 is 4 and indicator 2 is 0 or 1, take the URL from subfield u, # the kind from subfield 3, link text from subfield y, and restrictions from # subfield z." - for datafield in xml.find_all("datafield", tag="856", ind1="4", ind2=["0", "1"]): + for datafield in source_record.find_all( + "datafield", tag="856", ind1="4", ind2=["0", "1"] + ): url_value = self.create_subfield_value_list_from_datafield(datafield, "u") text_value = self.create_subfield_value_list_from_datafield(datafield, "y") restrictions_value = self.create_subfield_value_list_from_datafield( @@ -398,22 +330,29 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # by leader "Type of Record" position = "Language Material" or "Manuscript # language material" and "Bibliographic level" position = # "Monographic component part," "Collection," "Subunit," or "Monograph/Item." - if leader.string[6:7] in "at" and leader.string[7:8] in "acdm": - if fixed_length_data.string[33:34] in "0se": + if ( + self._get_leader_field(source_record)[6:7] in "at" + and self._get_leader_field(source_record)[7:8] in "acdm" + ): + if self._get_control_field(source_record)[33:34] in "0se": fields["literary_form"] = "Nonfiction" - elif fixed_length_data.string[33:34]: + elif self._get_control_field(source_record)[33:34]: fields["literary_form"] = "Fiction" # locations # Get place of publication from 008 field code - if fixed_location_code := str(fixed_length_data.string)[15:17]: # noqa: SIM102 - if location_name := Marc.loc_crosswalk_code_to_name( - fixed_location_code, country_code_crosswalk, source_record_id, "country" - ): - fields.setdefault("locations", []).append( - timdex.Location(value=location_name, kind="Place of Publication") - ) + if (fixed_location_code := self._get_control_field(source_record)[15:17]) and ( + location_name := Marc.loc_crosswalk_code_to_name( + fixed_location_code, + self.country_code_crosswalk, + source_record_id, + "country", + ) + ): + fields.setdefault("locations", []).append( + timdex.Location(value=location_name, kind="Place of Publication") + ) # Get other locations location_marc_fields = [ @@ -429,7 +368,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: }, ] for location_marc_field in location_marc_fields: - for datafield in xml.find_all("datafield", tag=location_marc_field["tag"]): + for datafield in source_record.find_all( + "datafield", tag=location_marc_field["tag"] + ): if location_value := ( self.create_subfield_value_string_from_datafield( datafield, @@ -508,7 +449,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: }, ] for note_marc_field in note_marc_fields: - for datafield in xml.find_all("datafield", tag=note_marc_field["tag"]): + for datafield in source_record.find_all( + "datafield", tag=note_marc_field["tag"] + ): if note_value := ( self.create_subfield_value_string_from_datafield( datafield, @@ -527,19 +470,19 @@ def get_optional_fields(self, xml: Tag) -> dict | None: if numbering_values := [ self.create_subfield_value_string_from_datafield(datafield, "a", " ") - for datafield in xml.find_all("datafield", tag="362") + for datafield in source_record.find_all("datafield", tag="362") ]: fields["numbering"] = " ".join(numbering_values) or None # physical_description if physical_description_values := [ self.create_subfield_value_string_from_datafield(datafield, "abcefg", " ") - for datafield in xml.find_all("datafield", tag="300") + for datafield in source_record.find_all("datafield", tag="300") ]: fields["physical_description"] = " ".join(physical_description_values) or None # publication_frequency - for datafield in xml.find_all("datafield", tag="310"): + for datafield in source_record.find_all("datafield", tag="310"): if publication_frequency_value := ( self.create_subfield_value_string_from_datafield(datafield, "a", " ") ): @@ -549,7 +492,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # publishers for publisher_marc_field in ["260", "264"]: - for datafield in xml.find_all("datafield", tag=publisher_marc_field): + for datafield in source_record.find_all( + "datafield", tag=publisher_marc_field + ): publisher_name = self.get_single_subfield_string(datafield, "b") publisher_date = self.get_single_subfield_string(datafield, "c") publisher_location = self.get_single_subfield_string(datafield, "a") @@ -610,7 +555,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: }, ] for related_item_marc_field in related_item_marc_fields: - for datafield in xml.find_all( + for datafield in source_record.find_all( "datafield", tag=related_item_marc_field["tag"] ): if related_item_value := ( @@ -653,7 +598,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: }, ] for subject_marc_field in subject_marc_fields: - for datafield in xml.find_all("datafield", tag=subject_marc_field["tag"]): + for datafield in source_record.find_all( + "datafield", tag=subject_marc_field["tag"] + ): if subject_value := ( self.create_subfield_value_string_from_datafield( datafield, @@ -669,7 +616,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) # summary - for datafield in xml.find_all("datafield", tag="520"): + for datafield in source_record.find_all("datafield", tag="520"): if summary_value := self.create_subfield_value_string_from_datafield( datafield, "a", " " ): @@ -783,6 +730,103 @@ def loc_crosswalk_code_to_name( ) return str(code_element.parent.find("name").string) + @classmethod + def _get_leader_field(cls, source_record: Tag) -> str: + if leader := source_record.find("leader", string=True): + return str(leader.string) + message = "Record skipped because key information is missing: ." + raise SkippedRecordEvent(message) + + @classmethod + def _get_control_field(cls, source_record: Tag) -> str: + if control_field := source_record.find("controlfield", tag="008", string=True): + return str(control_field.string) + message = ( + "Record skipped because key information is missing: " + '.' + ) + raise SkippedRecordEvent(message) + + @classmethod + def get_alternate_titles( + cls, source_record: Tag + ) -> list[timdex.AlternateTitle] | None: + alternate_titles = [] + alternate_title_marc_fields = [ + { + "tag": "130", + "subfields": "adfghklmnoprst", + "kind": "Preferred Title", + }, + { + "tag": "240", + "subfields": "adfghklmnoprs", + "kind": "Preferred Title", + }, + { + "tag": "246", + "subfields": "abfghinp", + "kind": "Varying Form of Title", + }, + { + "tag": "730", + "subfields": "adfghiklmnoprst", + "kind": "Preferred Title", + }, + { + "tag": "740", + "subfields": "anp", + "kind": "Uncontrolled Related/Analytical Title", + }, + ] + for alternate_title_marc_field in alternate_title_marc_fields: + alternate_titles.extend( + [ + timdex.AlternateTitle( + value=alternate_title_value.rstrip(" .,/"), + kind=alternate_title_marc_field["kind"], + ) + for datafield in source_record.find_all( + "datafield", tag=alternate_title_marc_field["tag"] + ) + if ( + alternate_title_value := ( + cls.create_subfield_value_string_from_datafield( + datafield, + alternate_title_marc_field["subfields"], + " ", + ) + ) + ) + ] + ) + return alternate_titles or None + + @classmethod + def get_call_numbers(cls, source_record: Tag) -> list[str] | None: + call_numbers: list = [] + call_number_marc_fields = [ + { + "tag": "050", + "subfields": "a", + }, + { + "tag": "082", + "subfields": "a", + }, + ] + for call_number_marc_field in call_number_marc_fields: + for datafield in source_record.find_all( + "datafield", tag=call_number_marc_field["tag"] + ): + call_numbers.extend( + call_number + for call_number in cls.create_subfield_value_list_from_datafield( + datafield, call_number_marc_field["subfields"] + ) + ) + return call_numbers or None + @staticmethod def get_main_titles(xml: Tag) -> list[str]: """