RDFLib · ashleysommer · Oct 1, 2024 · Sep 25, 2024 · Sep 26, 2024 · Sep 26, 2024
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -21,7 +21,7 @@ build:
     # the readthedocs environment.
     - pip install -r devtools/requirements-poetry.in
     post_install:
-    - poetry export --only=main --only=docs --extras=html -o requirements.txt
+    - poetry export --only=main --only=docs -o requirements.txt
     - pip install --no-cache-dir -r requirements.txt
     - pip install .
     - python -c "from rdflib import Graph; print(Graph)"

diff --git a/docker/latest/requirements.txt b/docker/latest/requirements.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile --config=pyproject.toml docker/latest/requirements.in
 #
-html5lib==1.1
+html5lib-modern==1.2
     # via -r docker/latest/requirements.in
 isodate==0.6.1
     # via rdflib
@@ -14,7 +14,4 @@ rdflib==7.0.0
     # via -r docker/latest/requirements.in
 six==1.16.0
     # via
-    #   html5lib
     #   isodate
-webencodings==0.5.1
-    # via html5lib
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -43,7 +43,7 @@ isodate = "^0.6.0"
 pyparsing = ">=2.1.0,<4"
 berkeleydb = {version = "^18.1.0", optional = true}
 networkx = {version = ">=2,<4", optional = true}
-html5lib = {version = "^1.0", optional = true}
+html5lib-modern = "^1.2"
 lxml = {version = ">=4.3,<6.0", optional = true}
 orjson = {version = ">=3.9.14,<4", optional = true}
 
@@ -73,7 +73,6 @@ ruff = ">=0.0.286,<0.7.0"
 [tool.poetry.extras]
 berkeleydb = ["berkeleydb"]
 networkx = ["networkx"]
-html = ["html5lib"]
 lxml = ["lxml"]
 orjson = ["orjson"]
 

diff --git a/rdflib/term.py b/rdflib/term.py
@@ -66,6 +66,7 @@
 from urllib.parse import urldefrag, urljoin, urlparse
 from uuid import uuid4
 
+import html5lib
 from isodate import (
     Duration,
     duration_isoformat,
@@ -83,14 +84,6 @@
     from .namespace import NamespaceManager
     from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath
 
-_HAS_HTML5LIB = False
-
-try:
-    import html5lib
-
-    _HAS_HTML5LIB = True
-except ImportError:
-    html5lib = None
 
 _SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"
 
@@ -1677,7 +1670,11 @@ def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
     parser = html5lib.HTMLParser(
         tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
     )
-    result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
+    try:
+        result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
+    except html5lib.html5parser.ParseError as e:
+        logger.info(f"Failed to parse HTML: {e}")
+        raise e
     result.normalize()
     return result
 
@@ -2007,20 +2004,13 @@ def _castPythonToLiteral(  # noqa: N802
     (Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
     (timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
     (xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
-    (Fraction, (None, _OWL_RATIONAL)),
-]
-
-if html5lib is not None:
     # This is a bit dirty, by accident the html5lib parser produces
     # DocumentFragments, and the xml parser Documents, letting this
     # decide what datatype to use makes roundtripping easier, but it a
     # bit random.
-    #
-    # This must happen before _GenericPythonToXSDRules is assigned to
-    # _OriginalGenericPythonToXSDRules.
-    _GenericPythonToXSDRules.append(
-        (xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
-    )
+    (xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)),
+    (Fraction, (None, _OWL_RATIONAL)),
+]
 
 _OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)
 
@@ -2071,14 +2061,10 @@ def _castPythonToLiteral(  # noqa: N802
     URIRef(_XSD_PFX + "double"): float,
     URIRef(_XSD_PFX + "base64Binary"): b64decode,
     URIRef(_XSD_PFX + "anyURI"): None,
+    _RDF_HTMLLITERAL: _parse_html,
     _RDF_XMLLITERAL: _parseXML,
 }
 
-if html5lib is not None:
-    # It is probably best to keep this close to the definition of
-    # _GenericPythonToXSDRules so nobody misses it.
-    XSDToPython[_RDF_HTMLLITERAL] = _parse_html
-
 _check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
     URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
     URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,

diff --git a/test/test_literal/test_literal_html5lib.py b/test/test_literal/test_literal_html5lib.py
@@ -1,6 +1,7 @@
 import xml.dom.minidom
 from typing import Callable
 
+import html5lib  # noqa: F401
 import pytest
 
 import rdflib.term
@@ -9,14 +10,8 @@
 from test.utils.literal import LiteralChecker
 from test.utils.outcome import OutcomeChecker, OutcomePrimitives
 
-try:
-    import html5lib as _  # noqa: F401
-except ImportError:
-    pytest.skip("html5lib not installed", allow_module_level=True)
-
 
 def test_has_html5lib() -> None:
-    assert rdflib.term._HAS_HTML5LIB is True
     assert RDF.HTML in rdflib.term.XSDToPython
     rule = next(
         (

diff --git a/tox.ini b/tox.ini
@@ -15,7 +15,7 @@ setenv =
     COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}}
     MYPY_CACHE_DIR = {envdir}/.mypy_cache
     docs: POETRY_ARGS_docs = --only=docs
-    extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html --extras=orjson
+    extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=orjson
     lxml: POETRY_ARGS_lxml = --extras=lxml
 commands_pre =
     py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))'
@@ -59,7 +59,7 @@ setenv =
     PYTHONHASHSEED = 0
 commands_pre =
     poetry lock --check
-    poetry install --only=main --only=docs --extras=html
+    poetry install --only=main --only=docs
     poetry env info
 commands =
     poetry run sphinx-build -T -W -b html -d {envdir}/doctree docs docs/_build/html