Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace html5lib with html5lib-modern #2911

Merged
merged 3 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ build:
# the readthedocs environment.
- pip install -r devtools/requirements-poetry.in
post_install:
- poetry export --only=main --only=docs --extras=html -o requirements.txt
- poetry export --only=main --only=docs -o requirements.txt
- pip install --no-cache-dir -r requirements.txt
- pip install .
- python -c "from rdflib import Graph; print(Graph)"
Expand Down
5 changes: 1 addition & 4 deletions docker/latest/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --config=pyproject.toml docker/latest/requirements.in
#
html5lib==1.1
html5lib-modern==1.2
# via -r docker/latest/requirements.in
isodate==0.6.1
# via rdflib
Expand All @@ -14,7 +14,4 @@ rdflib==7.0.0
# via -r docker/latest/requirements.in
six==1.16.0
# via
# html5lib
# isodate
webencodings==0.5.1
# via html5lib
39 changes: 11 additions & 28 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ isodate = "^0.6.0"
pyparsing = ">=2.1.0,<4"
berkeleydb = {version = "^18.1.0", optional = true}
networkx = {version = ">=2,<4", optional = true}
html5lib = {version = "^1.0", optional = true}
html5lib-modern = "^1.2"
lxml = {version = ">=4.3,<6.0", optional = true}
orjson = {version = ">=3.9.14,<4", optional = true}

Expand Down Expand Up @@ -73,7 +73,6 @@ ruff = ">=0.0.286,<0.7.0"
[tool.poetry.extras]
berkeleydb = ["berkeleydb"]
networkx = ["networkx"]
html = ["html5lib"]
lxml = ["lxml"]
orjson = ["orjson"]

Expand Down
34 changes: 10 additions & 24 deletions rdflib/term.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
from urllib.parse import urldefrag, urljoin, urlparse
from uuid import uuid4

import html5lib
from isodate import (
Duration,
duration_isoformat,
Expand All @@ -83,14 +84,6 @@
from .namespace import NamespaceManager
from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath

_HAS_HTML5LIB = False

try:
import html5lib

_HAS_HTML5LIB = True
except ImportError:
html5lib = None

_SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"

Expand Down Expand Up @@ -1677,7 +1670,11 @@ def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
parser = html5lib.HTMLParser(
tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
)
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
try:
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
except html5lib.html5parser.ParseError as e:
logger.info(f"Failed to parse HTML: {e}")
raise e
result.normalize()
return result

Expand Down Expand Up @@ -2007,20 +2004,13 @@ def _castPythonToLiteral( # noqa: N802
(Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
(timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
(xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
(Fraction, (None, _OWL_RATIONAL)),
]

if html5lib is not None:
# This is a bit dirty: by accident, the html5lib parser produces
# DocumentFragments and the xml parser produces Documents. Letting this
# decide what datatype to use makes roundtripping easier, but it is a
# bit random.
#
# This must happen before _GenericPythonToXSDRules is assigned to
# _OriginalGenericPythonToXSDRules.
_GenericPythonToXSDRules.append(
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
)
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)),
(Fraction, (None, _OWL_RATIONAL)),
]

_OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)

Expand Down Expand Up @@ -2071,14 +2061,10 @@ def _castPythonToLiteral( # noqa: N802
URIRef(_XSD_PFX + "double"): float,
URIRef(_XSD_PFX + "base64Binary"): b64decode,
URIRef(_XSD_PFX + "anyURI"): None,
_RDF_HTMLLITERAL: _parse_html,
_RDF_XMLLITERAL: _parseXML,
}

if html5lib is not None:
# It is probably best to keep this close to the definition of
# _GenericPythonToXSDRules so nobody misses it.
XSDToPython[_RDF_HTMLLITERAL] = _parse_html

_check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,
Expand Down
7 changes: 1 addition & 6 deletions test/test_literal/test_literal_html5lib.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import xml.dom.minidom
from typing import Callable

import html5lib # noqa: F401
import pytest

import rdflib.term
Expand All @@ -9,14 +10,8 @@
from test.utils.literal import LiteralChecker
from test.utils.outcome import OutcomeChecker, OutcomePrimitives

try:
import html5lib as _ # noqa: F401
except ImportError:
pytest.skip("html5lib not installed", allow_module_level=True)


def test_has_html5lib() -> None:
assert rdflib.term._HAS_HTML5LIB is True
assert RDF.HTML in rdflib.term.XSDToPython
rule = next(
(
Expand Down
4 changes: 2 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ setenv =
COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}}
MYPY_CACHE_DIR = {envdir}/.mypy_cache
docs: POETRY_ARGS_docs = --only=docs
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html --extras=orjson
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=orjson
lxml: POETRY_ARGS_lxml = --extras=lxml
commands_pre =
py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))'
Expand Down Expand Up @@ -59,7 +59,7 @@ setenv =
PYTHONHASHSEED = 0
commands_pre =
poetry lock --check
poetry install --only=main --only=docs --extras=html
poetry install --only=main --only=docs
poetry env info
commands =
poetry run sphinx-build -T -W -b html -d {envdir}/doctree docs docs/_build/html
Expand Down
Loading