From c3e3429b2f723459674f1db3b28a9798ef2892d3 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Tue, 3 Sep 2024 09:22:58 -0400 Subject: [PATCH] SPIKE: benchmark lxml parsing --- transmogrifier/config.py | 3 +- transmogrifier/sources/transformer.py | 8 ++ transmogrifier/sources/xml/marc_v2.py | 130 +++++++++++++++++++++++ transmogrifier/sources/xmltransformer.py | 22 +++- 4 files changed, 159 insertions(+), 4 deletions(-) create mode 100644 transmogrifier/sources/xml/marc_v2.py diff --git a/transmogrifier/config.py b/transmogrifier/config.py index deeab5a..2c9e341 100644 --- a/transmogrifier/config.py +++ b/transmogrifier/config.py @@ -82,7 +82,8 @@ "https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?" "vid=01MIT_INST:MIT&docid=alma" ), - "transform-class": "transmogrifier.sources.xml.marc.Marc", + # "transform-class": "transmogrifier.sources.xml.marc.Marc", + "transform-class": "transmogrifier.sources.xml.marc_v2.MarcV2", # DEBUG: V2 }, "aspace": { "name": "MIT ArchivesSpace", diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py index d0da1a7..5820e0b 100644 --- a/transmogrifier/sources/transformer.py +++ b/transmogrifier/sources/transformer.py @@ -26,6 +26,9 @@ JSON: TypeAlias = dict[str, "JSON"] | list["JSON"] | str | int | float | bool | None +# DEBUG: shim to use env var to get parsing approach +PARSER = os.getenv("PARSER", "bs4") + class Transformer(ABC): """Base transformer class.""" @@ -162,12 +165,15 @@ def _write_timdex_records_to_json_file(self, output_file: str) -> int: Args: output_file: The JSON file used for writing TIMDEX records. """ + import time + count = 0 try: record: timdex.TimdexRecord = next(self) except StopIteration: return count with smart_open.open(output_file, "w") as file: + t0 = time.time() file.write("[\n") while record: file.write( @@ -185,6 +191,8 @@ def _write_timdex_records_to_json_file(self, output_file: str) -> int: "Status update: %s records written to output file so far!", count, ) + logger.info(f"Batch elapsed: {time.time()-t0}, parser: {PARSER}") + t0 = time.time() try: record: timdex.TimdexRecord = next(self) # type: ignore[no-redef] except StopIteration: diff --git a/transmogrifier/sources/xml/marc_v2.py b/transmogrifier/sources/xml/marc_v2.py new file mode 100644 index 0000000..f6fa852 --- /dev/null +++ b/transmogrifier/sources/xml/marc_v2.py @@ -0,0 +1,130 @@ +import logging +import os + +from bs4 import Tag # type: ignore[import-untyped] + +from transmogrifier.sources.xmltransformer import XMLTransformer + +logger = logging.getLogger(__name__) + +# DEBUG: shim to use env var to get parsing approach +PARSER = os.getenv("PARSER", "bs4") + +SIMULATE_NUM = 50 + + +class MarcV2(XMLTransformer): + """Marc transformer.""" + + # DEBUG: REQUIRED + @classmethod + def get_main_titles(cls, source_record: Tag) -> list[str]: + """ + Arbitrary field method to simulate lots of realistic data parsing via BS4 or lxml. + + For each, a `for x in range(0, 50)` is added to simulate other field methods + parsing data from the record. This is probably a high number, maybe a typical + record only has 20-30 calls for data, but it exposes the difference between BS4 + and lxml. + + The final result is a technically valid record with a title pulled. + """ + if PARSER == "bs4": + subfield = None + for x in range(0, SIMULATE_NUM): + try: + element = source_record.find("datafield", tag="245") + for subfield in element.find_all(name=True, string=True): + if subfield.get("code", "") == "a": + break + except AttributeError: + logger.exception( + "Record ID %s is missing a 245 field", + cls.get_source_record_id(source_record), + ) + return [] + + if subfield is not None: + return [str(subfield.string)] + else: + return [] + + elif PARSER == "lxml": + + # DEBUG ############################## + # DEBUG: XPath: slow + # NOTE: this demonstrates using lxml.element.xpath, and is 7-8x times slower + # than using element.iter() + # DEBUG ############################## + # for x in range(0, SIMULATE_NUM): + # e = source_record.xpath("//datafield[@tag=245]/subfield[@code='a']")[0] + # return [e.text] + + # DEBUG ############################## + # DEBUG: element.iter: fast + # DEBUG ############################## + # e = None + # for x in range(0, SIMULATE_NUM): + # for element in source_record.iter("datafield"): + # if element.attrib.get("tag") == "245": + # for subfield in element.iter("subfield"): + # if subfield.attrib.get("code") == "a": + # e = subfield + # break + # if e is not None: + # return [e.text] + # else: + # return [] + + # DEBUG ############################## + # DEBUG: find: slow (uses XPath) + # NOTE: this demonstrates using lxml.element.find, which is quite fast + # NOTE: while it looks like XPath, it's not the full implementation, so can + # be somewhat tricky to use; unsure what exactly is supported + # https://docs.python.org/2/library/xml.etree.elementtree.html#elementtree-xpath + # DEBUG ############################## + e = None + for x in range(0, SIMULATE_NUM): + e = source_record.find(".//datafield[@tag='245']/subfield[@code='a']") + if e is not None: + return [e.text] + else: + return [] + + @classmethod + def get_source_record_id(cls, source_record: Tag) -> str: + """ + Get the source record ID from a MARC XML record. + + Overrides metaclass get_source_record_id() method. + + Args: + source_record: A BeautifulSoup Tag representing a single MARC XML record. + """ + + if PARSER == "bs4": + return str(source_record.find("controlfield", tag="001", string=True).string) + + elif PARSER == "lxml": + return source_record.xpath("//controlfield[@tag=001]")[0].text + + @classmethod + def record_is_deleted(cls, source_record: Tag) -> bool: + """ + Determine whether record has a status of deleted. + + Overrides metaclass record_is_deleted() method. + + Args: + source_record: A BeautifulSoup Tag representing a single MARC XML record + """ + if PARSER == "bs4": + if leader := source_record.find("leader", string=True): # noqa: SIM102 + if leader.string[5:6] == "d": + return True + return False + + elif PARSER == "lxml": + if source_record.xpath("//leader")[0].text[5:6] == "d": + return True + return False diff --git a/transmogrifier/sources/xmltransformer.py b/transmogrifier/sources/xmltransformer.py index dc37ed7..e5b5e4c 100644 --- a/transmogrifier/sources/xmltransformer.py +++ b/transmogrifier/sources/xmltransformer.py @@ -1,5 +1,5 @@ from __future__ import annotations - +import os from typing import TYPE_CHECKING, final import smart_open # type: ignore[import-untyped] @@ -11,6 +11,9 @@ if TYPE_CHECKING: from collections.abc import Iterator +# DEBUG: shim to use env var to get parsing approach +PARSER = os.getenv("PARSER", "bs4") + class XMLTransformer(Transformer): """XML transformer class.""" @@ -33,10 +36,22 @@ def parse_source_file(cls, source_file: str) -> Iterator[Tag]: encoding="utf-8", recover=True, ): - record_string = etree.tostring(element, encoding="utf-8") - record = BeautifulSoup(record_string, "xml") + if PARSER == "bs4": + record_string = etree.tostring(element, encoding="utf-8") + record = BeautifulSoup(record_string, "xml") + + elif PARSER == "lxml": + record = element + + else: + raise Exception("parser type not recognized") + yield record element.clear() + # DEBUG newly added ################################## + while element.getprevious() is not None: + del element.getparent()[0] + # DEBUG newly added ################################## @classmethod def get_main_titles(cls, _source_record: Tag) -> list[Tag]: @@ -92,6 +107,7 @@ def get_source_record_id(cls, source_record: Tag) -> str: Args: source_record: A BeautifulSoup Tag representing a single XML record. """ + return str(source_record.header.find("identifier").string) @classmethod