Skip to content

Commit

Permalink
SPIKE: benchmark lxml parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
ghukill committed Sep 3, 2024
1 parent 395e612 commit c3e3429
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 4 deletions.
3 changes: 2 additions & 1 deletion transmogrifier/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@
"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?"
"vid=01MIT_INST:MIT&docid=alma"
),
"transform-class": "transmogrifier.sources.xml.marc.Marc",
# "transform-class": "transmogrifier.sources.xml.marc.Marc",
"transform-class": "transmogrifier.sources.xml.marc_v2.MarcV2", # DEBUG: V2
},
"aspace": {
"name": "MIT ArchivesSpace",
Expand Down
8 changes: 8 additions & 0 deletions transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@

JSON: TypeAlias = dict[str, "JSON"] | list["JSON"] | str | int | float | bool | None

# DEBUG: shim to use env var to get parsing approach
PARSER = os.getenv("PARSER", "bs4")


class Transformer(ABC):
"""Base transformer class."""
Expand Down Expand Up @@ -162,12 +165,15 @@ def _write_timdex_records_to_json_file(self, output_file: str) -> int:
Args:
output_file: The JSON file used for writing TIMDEX records.
"""
import time

count = 0
try:
record: timdex.TimdexRecord = next(self)
except StopIteration:
return count
with smart_open.open(output_file, "w") as file:
t0 = time.time()
file.write("[\n")
while record:
file.write(
Expand All @@ -185,6 +191,8 @@ def _write_timdex_records_to_json_file(self, output_file: str) -> int:
"Status update: %s records written to output file so far!",
count,
)
logger.info(f"Batch elapsed: {time.time()-t0}, parser: {PARSER}")
t0 = time.time()
try:
record: timdex.TimdexRecord = next(self) # type: ignore[no-redef]
except StopIteration:
Expand Down
130 changes: 130 additions & 0 deletions transmogrifier/sources/xml/marc_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import logging
import os

from bs4 import Tag # type: ignore[import-untyped]

from transmogrifier.sources.xmltransformer import XMLTransformer

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# DEBUG: shim that selects the parsing approach from the PARSER environment
# variable; this module branches on the values "bs4" (the default) and "lxml".
PARSER = os.getenv("PARSER", "bs4")

# Number of times each benchmarked lookup in get_main_titles() is repeated, to
# simulate the many other field methods that would parse data from one record.
SIMULATE_NUM = 50


class MarcV2(XMLTransformer):
    """Benchmark MARC transformer that parses records via BeautifulSoup or lxml.

    Every method branches on the module-level ``PARSER`` value ("bs4" or "lxml")
    so the same transform run can be timed against either parsing approach.

    NOTE(review): the lxml lookups use un-namespaced element names; confirm the
    source MARC XML does not declare a default namespace, otherwise the lxml
    branch will not match elements that the bs4 branch does.
    """

    @staticmethod
    def _unsupported_parser_error() -> ValueError:
        """Build the error raised when PARSER is neither "bs4" nor "lxml"."""
        return ValueError(f"parser type not recognized: {PARSER!r}")

    # DEBUG: REQUIRED
    @classmethod
    def get_main_titles(cls, source_record: Tag) -> list[str]:
        """Return the 245 subfield "a" value of a record as a one-item list.

        Arbitrary field method to simulate lots of realistic data parsing via BS4
        or lxml. Each lookup is repeated SIMULATE_NUM times to simulate other
        field methods parsing data from the record. This is probably a high
        number, maybe a typical record only has 20-30 calls for data, but it
        exposes the difference between BS4 and lxml.

        The final result is a technically valid record with a title pulled.

        Args:
            source_record: A single MARC XML record; a BeautifulSoup Tag when
                PARSER is "bs4", an lxml element when PARSER is "lxml".

        Returns:
            A list holding the title string, or an empty list when the record has
            no 245 field or no non-empty subfield "a".

        Raises:
            ValueError: if PARSER is not a recognized parser type.
        """
        if PARSER == "bs4":
            return cls._get_main_titles_bs4(source_record)
        if PARSER == "lxml":
            return cls._get_main_titles_lxml(source_record)
        raise cls._unsupported_parser_error()

    @classmethod
    def _get_main_titles_bs4(cls, source_record: Tag) -> list[str]:
        """Look up the 245 subfield "a" title with BeautifulSoup."""
        title_subfield = None
        for _ in range(SIMULATE_NUM):
            datafield = source_record.find("datafield", tag="245")
            if datafield is None:
                logger.error(
                    "Record ID %s is missing a 245 field",
                    cls.get_source_record_id(source_record),
                )
                return []
            # Select only a subfield that really has code "a"; falling back to
            # the last subfield iterated would return the wrong value as title.
            title_subfield = next(
                (
                    subfield
                    for subfield in datafield.find_all(name=True, string=True)
                    if subfield.get("code", "") == "a"
                ),
                None,
            )

        if title_subfield is None:
            return []
        return [str(title_subfield.string)]

    @classmethod
    def _get_main_titles_lxml(cls, source_record: Tag) -> list[str]:
        """Look up the 245 subfield "a" title with lxml's ``find``."""
        # DEBUG ##############################
        # DEBUG: XPath: slow
        # NOTE: this demonstrates using lxml.element.xpath, and is 7-8x slower
        # than using element.iter(). The path must be relative (".//") so that it
        # is evaluated against this record and not against the document root.
        # DEBUG ##############################
        # for _ in range(SIMULATE_NUM):
        #     e = source_record.xpath(
        #         ".//datafield[@tag='245']/subfield[@code='a']"
        #     )[0]
        # return [e.text]

        # DEBUG ##############################
        # DEBUG: element.iter: fast
        # DEBUG ##############################
        # e = None
        # for _ in range(SIMULATE_NUM):
        #     for element in source_record.iter("datafield"):
        #         if element.attrib.get("tag") == "245":
        #             for subfield in element.iter("subfield"):
        #                 if subfield.attrib.get("code") == "a":
        #                     e = subfield
        #                     break
        # if e is not None:
        #     return [e.text]
        # else:
        #     return []

        # DEBUG ##############################
        # DEBUG: find: fast (ElementPath, not full XPath)
        # NOTE: this demonstrates using lxml.element.find, which is quite fast
        # NOTE: while it looks like XPath, it's not the full implementation, so
        # can be somewhat tricky to use; unsure what exactly is supported
        # https://docs.python.org/3/library/xml.etree.elementtree.html#elementtree-xpath
        # DEBUG ##############################
        title_subfield = None
        for _ in range(SIMULATE_NUM):
            title_subfield = source_record.find(
                ".//datafield[@tag='245']/subfield[@code='a']"
            )

        # Compare against None explicitly: an lxml element without children is
        # falsy. An empty subfield has text None and is skipped, matching the
        # bs4 branch, which only considers subfields that contain a string.
        if title_subfield is None or title_subfield.text is None:
            return []
        return [title_subfield.text]

    @classmethod
    def get_source_record_id(cls, source_record: Tag) -> str:
        """Get the source record ID from a MARC XML record.

        Overrides the base class get_source_record_id() method.

        Args:
            source_record: A single MARC XML record; a BeautifulSoup Tag when
                PARSER is "bs4", an lxml element when PARSER is "lxml".

        Returns:
            The text of the record's 001 controlfield.

        Raises:
            AttributeError: if the record has no 001 controlfield.
            ValueError: if PARSER is not a recognized parser type.
        """
        if PARSER == "bs4":
            return str(source_record.find("controlfield", tag="001", string=True).string)
        if PARSER == "lxml":
            # Relative path and quoted attribute value: "//controlfield" would be
            # evaluated from the document root instead of this record, and an
            # unquoted 001 is compared as the number 1 rather than the tag text.
            return source_record.find(".//controlfield[@tag='001']").text
        raise cls._unsupported_parser_error()

    @classmethod
    def record_is_deleted(cls, source_record: Tag) -> bool:
        """Determine whether record has a status of deleted.

        A record counts as deleted when the character at index 5 of its leader
        is "d". A missing or empty leader is treated as not deleted.

        Overrides the base class record_is_deleted() method.

        Args:
            source_record: A single MARC XML record; a BeautifulSoup Tag when
                PARSER is "bs4", an lxml element when PARSER is "lxml".

        Raises:
            ValueError: if PARSER is not a recognized parser type.
        """
        if PARSER == "bs4":
            leader = source_record.find("leader", string=True)
            return leader is not None and leader.string[5:6] == "d"
        if PARSER == "lxml":
            # Relative lookup scoped to this record; tolerate a missing leader or
            # one without text, as the bs4 branch does, instead of raising.
            leader = source_record.find(".//leader")
            return leader is not None and (leader.text or "")[5:6] == "d"
        raise cls._unsupported_parser_error()
22 changes: 19 additions & 3 deletions transmogrifier/sources/xmltransformer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from __future__ import annotations

import os
from typing import TYPE_CHECKING, final

import smart_open # type: ignore[import-untyped]
Expand All @@ -11,6 +11,9 @@
if TYPE_CHECKING:
from collections.abc import Iterator

# DEBUG: shim to use env var to get parsing approach
PARSER = os.getenv("PARSER", "bs4")


class XMLTransformer(Transformer):
"""XML transformer class."""
Expand All @@ -33,10 +36,22 @@ def parse_source_file(cls, source_file: str) -> Iterator[Tag]:
encoding="utf-8",
recover=True,
):
record_string = etree.tostring(element, encoding="utf-8")
record = BeautifulSoup(record_string, "xml")
if PARSER == "bs4":
record_string = etree.tostring(element, encoding="utf-8")
record = BeautifulSoup(record_string, "xml")

elif PARSER == "lxml":
record = element

else:
raise Exception("parser type not recognized")

yield record
element.clear()
# DEBUG newly added ##################################
while element.getprevious() is not None:
del element.getparent()[0]
# DEBUG newly added ##################################

@classmethod
def get_main_titles(cls, _source_record: Tag) -> list[Tag]:
Expand Down Expand Up @@ -92,6 +107,7 @@ def get_source_record_id(cls, source_record: Tag) -> str:
Args:
source_record: A BeautifulSoup Tag representing a single XML record.
"""

return str(source_record.header.find("identifier").string)

@classmethod
Expand Down

0 comments on commit c3e3429

Please sign in to comment.