SPIKE: benchmark lxml parsing

MITLibraries · Sep 3, 2024 · c3e3429 · c3e3429
1 parent 395e612
commit c3e3429
Show file tree

Hide file tree

Showing 4 changed files with 159 additions and 4 deletions.
diff --git a/transmogrifier/config.py b/transmogrifier/config.py
@@ -82,7 +82,8 @@
             "https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?"
             "vid=01MIT_INST:MIT&docid=alma"
         ),
-        "transform-class": "transmogrifier.sources.xml.marc.Marc",
+        # "transform-class": "transmogrifier.sources.xml.marc.Marc",
+        "transform-class": "transmogrifier.sources.xml.marc_v2.MarcV2",  # DEBUG: V2
     },
     "aspace": {
         "name": "MIT ArchivesSpace",

diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py
@@ -26,6 +26,9 @@
 
 JSON: TypeAlias = dict[str, "JSON"] | list["JSON"] | str | int | float | bool | None
 
+# DEBUG: shim that reads the parsing approach from the PARSER env var (default: "bs4")
+PARSER = os.getenv("PARSER", "bs4")
+
 
 class Transformer(ABC):
     """Base transformer class."""
@@ -162,12 +165,15 @@ def _write_timdex_records_to_json_file(self, output_file: str) -> int:
         Args:
             output_file: The JSON file used for writing TIMDEX records.
         """
+        import time
+
         count = 0
         try:
             record: timdex.TimdexRecord = next(self)
         except StopIteration:
             return count
         with smart_open.open(output_file, "w") as file:
+            t0 = time.time()
             file.write("[\n")
             while record:
                 file.write(
@@ -185,6 +191,8 @@ def _write_timdex_records_to_json_file(self, output_file: str) -> int:
                         "Status update: %s records written to output file so far!",
                         count,
                     )
+                    logger.info(f"Batch elapsed: {time.time()-t0}, parser: {PARSER}")
+                    t0 = time.time()
                 try:
                     record: timdex.TimdexRecord = next(self)  # type: ignore[no-redef]
                 except StopIteration:

diff --git a/transmogrifier/sources/xml/marc_v2.py b/transmogrifier/sources/xml/marc_v2.py
@@ -0,0 +1,130 @@
+import logging
+import os
+
+from bs4 import Tag  # type: ignore[import-untyped]
+
+from transmogrifier.sources.xmltransformer import XMLTransformer
+
+logger = logging.getLogger(__name__)
+
+# DEBUG: shim that reads the parsing approach from the PARSER env var (default: "bs4")
+PARSER = os.getenv("PARSER", "bs4")
+
+SIMULATE_NUM = 50
+
+
+class MarcV2(XMLTransformer):
+    """Marc transformer."""
+
+    # DEBUG: REQUIRED
+    @classmethod
+    def get_main_titles(cls, source_record: Tag) -> list[str]:
+        """
+        Arbitrary field method to simulate lots of realistic data parsing via BS4 or lxml.
+
+        For each parser, a `for x in range(0, SIMULATE_NUM)` loop is added to simulate
+        other field methods parsing data from the record.  This is probably a high
+        number (a typical record may only make 20-30 calls for data), but it exposes
+        the difference between BS4 and lxml.
+
+        The final result is a technically valid record with a title pulled.
+        """
+        if PARSER == "bs4":
+            subfield = None
+            for x in range(0, SIMULATE_NUM):
+                try:
+                    element = source_record.find("datafield", tag="245")
+                    for subfield in element.find_all(name=True, string=True):
+                        if subfield.get("code", "") == "a":
+                            break
+                except AttributeError:
+                    logger.exception(
+                        "Record ID %s is missing a 245 field",
+                        cls.get_source_record_id(source_record),
+                    )
+                    return []
+
+            if subfield is not None:
+                return [str(subfield.string)]
+            else:
+                return []
+
+        elif PARSER == "lxml":
+
+            # DEBUG ##############################
+            # DEBUG: XPath: slow
+            # NOTE: this demonstrates using lxml.element.xpath, and is 7-8x slower
+            #  than using element.iter()
+            # DEBUG ##############################
+            # for x in range(0, SIMULATE_NUM):
+            #     e = source_record.xpath("//datafield[@tag=245]/subfield[@code='a']")[0]
+            # return [e.text]
+
+            # DEBUG ##############################
+            # DEBUG: element.iter: fast
+            # DEBUG ##############################
+            # e = None
+            # for x in range(0, SIMULATE_NUM):
+            #     for element in source_record.iter("datafield"):
+            #         if element.attrib.get("tag") == "245":
+            #             for subfield in element.iter("subfield"):
+            #                 if subfield.attrib.get("code") == "a":
+            #                     e = subfield
+            #                     break
+            # if e is not None:
+            #     return [e.text]
+            # else:
+            #     return []
+
+            # DEBUG ##############################
+            # DEBUG: find: fast (ElementPath, a limited XPath subset)
+            # NOTE: this demonstrates using lxml.element.find, which is quite fast
+            # NOTE: while it looks like XPath, it's not the full implementation, so it
+            #  can be somewhat tricky to use; unsure what exactly is supported, but see
+            #  https://docs.python.org/3/library/xml.etree.elementtree.html#elementtree-xpath
+            # DEBUG ##############################
+            e = None
+            for x in range(0, SIMULATE_NUM):
+                e = source_record.find(".//datafield[@tag='245']/subfield[@code='a']")
+            if e is not None:
+                return [e.text]
+            else:
+                return []
+
+    @classmethod
+    def get_source_record_id(cls, source_record: Tag) -> str:
+        """
+        Get the source record ID from a MARC XML record.
+
+        Overrides the base class get_source_record_id() method.
+
+        Args:
+            source_record: A single MARC XML record (bs4 Tag or lxml element, per PARSER).
+        """
+
+        if PARSER == "bs4":
+            return str(source_record.find("controlfield", tag="001", string=True).string)
+
+        elif PARSER == "lxml":
+            return source_record.xpath("//controlfield[@tag=001]")[0].text
+
+    @classmethod
+    def record_is_deleted(cls, source_record: Tag) -> bool:
+        """
+        Determine whether record has a status of deleted.
+
+        Overrides the base class record_is_deleted() method.
+
+        Args:
+            source_record: A single MARC XML record (bs4 Tag or lxml element, per PARSER).
+        """
+        if PARSER == "bs4":
+            if leader := source_record.find("leader", string=True):  # noqa: SIM102
+                if leader.string[5:6] == "d":
+                    return True
+            return False
+
+        elif PARSER == "lxml":
+            if source_record.xpath("//leader")[0].text[5:6] == "d":
+                return True
+            return False
diff --git a/transmogrifier/sources/xmltransformer.py b/transmogrifier/sources/xmltransformer.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-
+import os
 from typing import TYPE_CHECKING, final
 
 import smart_open  # type: ignore[import-untyped]
@@ -11,6 +11,9 @@
 if TYPE_CHECKING:
     from collections.abc import Iterator
 
+# DEBUG: shim that reads the parsing approach from the PARSER env var (default: "bs4")
+PARSER = os.getenv("PARSER", "bs4")
+
 
 class XMLTransformer(Transformer):
     """XML transformer class."""
@@ -33,10 +36,22 @@ def parse_source_file(cls, source_file: str) -> Iterator[Tag]:
                 encoding="utf-8",
                 recover=True,
             ):
-                record_string = etree.tostring(element, encoding="utf-8")
-                record = BeautifulSoup(record_string, "xml")
+                if PARSER == "bs4":
+                    record_string = etree.tostring(element, encoding="utf-8")
+                    record = BeautifulSoup(record_string, "xml")
+
+                elif PARSER == "lxml":
+                    record = element
+
+                else:
+                    raise Exception("parser type not recognized")
+
                 yield record
                 element.clear()
+                # DEBUG newly added ##################################
+                while element.getprevious() is not None:
+                    del element.getparent()[0]
+                # DEBUG newly added ##################################
 
     @classmethod
     def get_main_titles(cls, _source_record: Tag) -> list[Tag]:
@@ -92,6 +107,7 @@ def get_source_record_id(cls, source_record: Tag) -> str:
         Args:
             source_record: A BeautifulSoup Tag representing a single XML record.
         """
+
         return str(source_record.header.find("identifier").string)
 
     @classmethod