From d95f332828444792b91118492aa9f3f6b5dfed60 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 18 Aug 2024 21:57:25 +0100 Subject: [PATCH 01/92] perf: Replace `_use_referencing_library()` with a constant Every call was identical as it was based on an existing constant `jsonschema_version_str` --- tools/schemapi/schemapi.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index b6907ec8f..9d81ccd78 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -81,7 +81,9 @@ # class-level _class_is_valid_at_instantiation attribute to False DEBUG_MODE: bool = True -jsonschema_version_str = importlib_version("jsonschema") + +_USING_REFERENCING: Final[bool] = Version(importlib_version("jsonschema")) >= Version("4.18") # fmt: off +"""In version 4.18.0, the ``jsonschema`` package deprecated RefResolver in favor of the ``referencing`` library.""" def enable_debug_mode() -> None: @@ -191,7 +193,7 @@ def _get_errors_from_spec( if hasattr(validator_cls, "FORMAT_CHECKER"): validator_kwargs["format_checker"] = validator_cls.FORMAT_CHECKER - if _use_referencing_library(): + if _USING_REFERENCING: schema = _prepare_references_in_schema(schema) validator_kwargs["registry"] = _get_referencing_registry( rootschema or schema, json_schema_draft_url @@ -538,7 +540,7 @@ def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> dict[str, Any]: """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" - if _use_referencing_library(): + if _USING_REFERENCING: registry = _get_referencing_registry(rootschema or schema) # Using a different variable name to show that this is not the # jsonschema.RefResolver but instead a Resolver from the referencing From f4a4e0e4b0be55ade5e1602cfb65010767739ddf Mon Sep 17 00:00:00 2001 From: dangotbanned 
<125183946+dangotbanned@users.noreply.github.com> Date: Sun, 18 Aug 2024 22:00:54 +0100 Subject: [PATCH 02/92] docs: Move `_get_errors_from_spec` comment into docstring I'm going to do this a lot. Docstrings can be collapsed in all editors and can benefit from markdown. Everything here is already private, so using long comments has no benefit --- tools/schemapi/schemapi.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 9d81ccd78..bbef37d99 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -171,20 +171,25 @@ def _get_errors_from_spec( rootschema: dict[str, Any] | None = None, ) -> ValidationErrorList: """ - Uses the relevant jsonschema validator to validate the passed in spec against the schema using the rootschema to resolve references. + Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. - The schema and rootschema themselves are not validated but instead considered as valid. - """ - # We don't use jsonschema.validate as this would validate the schema itself. - # Instead, we pass the schema directly to the validator class. This is done for - # two reasons: The schema comes from Vega-Lite and is not based on the user - # input, therefore there is no need to validate it in the first place. Furthermore, - # the "uri-reference" format checker fails for some of the references as URIs in - # "$ref" are not encoded, - # e.g. '#/definitions/ValueDefWithCondition' would be a valid $ref in a Vega-Lite schema but - # it is not a valid URI reference due to the characters such as '<'. + ``schema`` and ``rootschema`` are not validated but instead considered as valid. + + We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. + Instead, we pass the ``schema`` directly to the validator class. + + This is done for two reasons: + 1. 
The schema comes from Vega-Lite and is not based on the user + input, therefore there is no need to validate it in the first place. + 2. The "uri-reference" format checker fails for some of the + references as URIs in "$ref" are not encoded, e.g.: + + '#/definitions/ValueDefWithCondition' + + would be a valid $ref in a Vega-Lite schema but it is not a valid + URI reference due to the characters such as '<'. + """ json_schema_draft_url = _get_json_schema_draft_url(rootschema or schema) validator_cls = jsonschema.validators.validator_for( {"$schema": json_schema_draft_url} From 6dfe61bf8bc6404963813230a01b05f950cbfa13 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 18 Aug 2024 22:02:51 +0100 Subject: [PATCH 03/92] fix(typing): Resolve `jsonschema` incomplete stubs issue `typeshed` disagrees with `jsonschema`, this is just enforcing what `jsonschema` says is true --- tools/schemapi/schemapi.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index bbef37d99..1d734ddc1 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -42,6 +42,7 @@ if TYPE_CHECKING: from typing import ClassVar + from jsonschema.protocols import Validator from referencing import Registry from altair.typing import ChartType @@ -191,8 +192,9 @@ def _get_errors_from_spec( URI reference due to the characters such as '<'. 
""" json_schema_draft_url = _get_json_schema_draft_url(rootschema or schema) - validator_cls = jsonschema.validators.validator_for( - {"$schema": json_schema_draft_url} + validator_cls: type[Validator] = cast( + "type[Validator]", + jsonschema.validators.validator_for({"$schema": json_schema_draft_url}), ) validator_kwargs: dict[str, Any] = {} if hasattr(validator_cls, "FORMAT_CHECKER"): From e9a4beb02034c0730e6737db37aa1758b2233f9e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 18 Aug 2024 22:03:58 +0100 Subject: [PATCH 04/92] refactor: Reuse `None` in ternary expression --- tools/schemapi/schemapi.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 1d734ddc1..b1f3192ab 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -205,13 +205,10 @@ def _get_errors_from_spec( validator_kwargs["registry"] = _get_referencing_registry( rootschema or schema, json_schema_draft_url ) - else: # No resolver is necessary if the schema is already the full schema validator_kwargs["resolver"] = ( - jsonschema.RefResolver.from_schema(rootschema) - if rootschema is not None - else None + jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema ) validator = validator_cls(schema, **validator_kwargs) From c53487645685e9e377fb53d99e94c58a9af37471 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 18 Aug 2024 22:07:46 +0100 Subject: [PATCH 05/92] perf: Replace `_prepare_references_in_schema` Produces the same result, but skips the upfront `deepcopy`. No longer modifying the copy inplace, new objects are created inside the iterator. 
--- tools/schemapi/schemapi.py | 60 ++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index b1f3192ab..e528dcacd 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1,7 +1,6 @@ from __future__ import annotations import contextlib -import copy import inspect import json import sys @@ -201,7 +200,7 @@ def _get_errors_from_spec( validator_kwargs["format_checker"] = validator_cls.FORMAT_CHECKER if _USING_REFERENCING: - schema = _prepare_references_in_schema(schema) + schema = _prepare_references(schema) validator_kwargs["registry"] = _get_referencing_registry( rootschema or schema, json_schema_draft_url ) @@ -220,44 +219,35 @@ def _get_json_schema_draft_url(schema: dict[str, Any]) -> str: return schema.get("$schema", _DEFAULT_JSON_SCHEMA_DRAFT_URL) -def _use_referencing_library() -> bool: - """In version 4.18.0, the jsonschema package deprecated RefResolver in favor of the referencing library.""" - return Version(jsonschema_version_str) >= Version("4.18") +def _prepare_references(schema: dict[str, Any], /) -> dict[str, Any]: + """ + Return a deep copy of ``schema`` w/ replaced uri(s). + All encountered ``dict | list``(s) will be reconstructed + w/ ``_VEGA_LITE_ROOT_URI`` in front of all nested``$ref`` values. -def _prepare_references_in_schema(schema: dict[str, Any]) -> dict[str, Any]: - # Create a copy so that $ref is not modified in the original schema in case - # that it would still reference a dictionary which might be attached to - # an Altair class _schema attribute - schema = copy.deepcopy(schema) + Notes + ----- + ``copy.deepcopy`` is not needed as the iterator yields new objects. + """ + return dict(_rec_refs(schema)) - def _prepare_refs(d: dict[str, Any]) -> dict[str, Any]: - """ - Add _VEGA_LITE_ROOT_URI in front of all $ref values. - This function recursively iterates through the whole dictionary. 
+def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: + """ + Recurse through a schema, yielding fresh copies of mutable containers. - $ref values can only be nested in dictionaries or lists - as the passed in `d` dictionary comes from the Vega-Lite json schema - and in json we only have arrays (-> lists in Python) and objects - (-> dictionaries in Python) which we need to iterate through. - """ - for key, value in d.items(): - if key == "$ref": - d[key] = _VEGA_LITE_ROOT_URI + d[key] - elif isinstance(value, dict): - d[key] = _prepare_refs(value) - elif isinstance(value, list): - prepared_values = [] - for v in value: - if isinstance(v, dict): - v = _prepare_refs(v) - prepared_values.append(v) - d[key] = prepared_values - return d - - schema = _prepare_refs(schema) - return schema + Adds ``_VEGA_LITE_ROOT_URI`` in front of all nested``$ref`` values. + """ + for k, v in m.items(): + if k == "$ref": + yield k, f"{_VEGA_LITE_ROOT_URI}{v}" + elif isinstance(v, dict): + yield k, dict(_rec_refs(v)) + elif isinstance(v, list): + yield k, [dict(_rec_refs(el)) if _is_dict(el) else el for el in v] + else: + yield k, v # We do not annotate the return value here as the referencing library is not always From e30cac6db326115a3c5d21d14882454e883c9eef Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 18 Aug 2024 22:10:18 +0100 Subject: [PATCH 06/92] chore: Add note on `_get_errors_from_spec` --- tools/schemapi/schemapi.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index e528dcacd..4ccea8d73 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -165,6 +165,9 @@ def validate_jsonschema( return None +# NOTE: Entry for creating a `list` of errors +# Everything else is skipped if this returns an empty `list` +# TODO: Refactor to peek at possible error w/ `next(validator.iter_errors(spec))` def _get_errors_from_spec( spec: dict[str, 
Any], schema: dict[str, Any], From e702b2648476b75056c3eb18da10d6ffed990203 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 18 Aug 2024 22:11:40 +0100 Subject: [PATCH 07/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 111 +++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 56 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index b91b90fbe..a0a2535f5 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -3,7 +3,6 @@ from __future__ import annotations import contextlib -import copy import inspect import json import sys @@ -44,6 +43,7 @@ if TYPE_CHECKING: from typing import ClassVar + from jsonschema.protocols import Validator from referencing import Registry from altair.typing import ChartType @@ -83,7 +83,9 @@ # class-level _class_is_valid_at_instantiation attribute to False DEBUG_MODE: bool = True -jsonschema_version_str = importlib_version("jsonschema") + +_USING_REFERENCING: Final[bool] = Version(importlib_version("jsonschema")) >= Version("4.18") # fmt: off +"""In version 4.18.0, the ``jsonschema`` package deprecated RefResolver in favor of the ``referencing`` library.""" def enable_debug_mode() -> None: @@ -165,46 +167,52 @@ def validate_jsonschema( return None +# NOTE: Entry for creating a `list` of errors +# Everything else is skipped if this returns an empty `list` +# TODO: Refactor to peek at possible error w/ `next(validator.iter_errors(spec))` def _get_errors_from_spec( spec: dict[str, Any], schema: dict[str, Any], rootschema: dict[str, Any] | None = None, ) -> ValidationErrorList: """ - Uses the relevant jsonschema validator to validate the passed in spec against the schema using the rootschema to resolve references. + Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. 
- The schema and rootschema themselves are not validated but instead considered as valid. - """ - # We don't use jsonschema.validate as this would validate the schema itself. - # Instead, we pass the schema directly to the validator class. This is done for - # two reasons: The schema comes from Vega-Lite and is not based on the user - # input, therefore there is no need to validate it in the first place. Furthermore, - # the "uri-reference" format checker fails for some of the references as URIs in - # "$ref" are not encoded, - # e.g. '#/definitions/ValueDefWithCondition' would be a valid $ref in a Vega-Lite schema but - # it is not a valid URI reference due to the characters such as '<'. + ``schema`` and ``rootschema`` are not validated but instead considered as valid. + + We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. + Instead, we pass the ``schema`` directly to the validator class. + + This is done for two reasons: + + 1. The schema comes from Vega-Lite and is not based on the user + input, therefore there is no need to validate it in the first place. + 2. The "uri-reference" format checker fails for some of the + references as URIs in "$ref" are not encoded, e.g.: + '#/definitions/ValueDefWithCondition' + + would be a valid $ref in a Vega-Lite schema but it is not a valid + URI reference due to the characters such as '<'. 
+ """ json_schema_draft_url = _get_json_schema_draft_url(rootschema or schema) - validator_cls = jsonschema.validators.validator_for( - {"$schema": json_schema_draft_url} + validator_cls: type[Validator] = cast( + "type[Validator]", + jsonschema.validators.validator_for({"$schema": json_schema_draft_url}), ) validator_kwargs: dict[str, Any] = {} if hasattr(validator_cls, "FORMAT_CHECKER"): validator_kwargs["format_checker"] = validator_cls.FORMAT_CHECKER - if _use_referencing_library(): - schema = _prepare_references_in_schema(schema) + if _USING_REFERENCING: + schema = _prepare_references(schema) validator_kwargs["registry"] = _get_referencing_registry( rootschema or schema, json_schema_draft_url ) - else: # No resolver is necessary if the schema is already the full schema validator_kwargs["resolver"] = ( - jsonschema.RefResolver.from_schema(rootschema) - if rootschema is not None - else None + jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema ) validator = validator_cls(schema, **validator_kwargs) @@ -216,44 +224,35 @@ def _get_json_schema_draft_url(schema: dict[str, Any]) -> str: return schema.get("$schema", _DEFAULT_JSON_SCHEMA_DRAFT_URL) -def _use_referencing_library() -> bool: - """In version 4.18.0, the jsonschema package deprecated RefResolver in favor of the referencing library.""" - return Version(jsonschema_version_str) >= Version("4.18") +def _prepare_references(schema: dict[str, Any], /) -> dict[str, Any]: + """ + Return a deep copy of ``schema`` w/ replaced uri(s). + All encountered ``dict | list``(s) will be reconstructed + w/ ``_VEGA_LITE_ROOT_URI`` in front of all nested``$ref`` values. 
-def _prepare_references_in_schema(schema: dict[str, Any]) -> dict[str, Any]: - # Create a copy so that $ref is not modified in the original schema in case - # that it would still reference a dictionary which might be attached to - # an Altair class _schema attribute - schema = copy.deepcopy(schema) + Notes + ----- + ``copy.deepcopy`` is not needed as the iterator yields new objects. + """ + return dict(_rec_refs(schema)) - def _prepare_refs(d: dict[str, Any]) -> dict[str, Any]: - """ - Add _VEGA_LITE_ROOT_URI in front of all $ref values. - This function recursively iterates through the whole dictionary. +def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: + """ + Recurse through a schema, yielding fresh copies of mutable containers. - $ref values can only be nested in dictionaries or lists - as the passed in `d` dictionary comes from the Vega-Lite json schema - and in json we only have arrays (-> lists in Python) and objects - (-> dictionaries in Python) which we need to iterate through. - """ - for key, value in d.items(): - if key == "$ref": - d[key] = _VEGA_LITE_ROOT_URI + d[key] - elif isinstance(value, dict): - d[key] = _prepare_refs(value) - elif isinstance(value, list): - prepared_values = [] - for v in value: - if isinstance(v, dict): - v = _prepare_refs(v) - prepared_values.append(v) - d[key] = prepared_values - return d - - schema = _prepare_refs(schema) - return schema + Adds ``_VEGA_LITE_ROOT_URI`` in front of all nested``$ref`` values. 
+ """ + for k, v in m.items(): + if k == "$ref": + yield k, f"{_VEGA_LITE_ROOT_URI}{v}" + elif isinstance(v, dict): + yield k, dict(_rec_refs(v)) + elif isinstance(v, list): + yield k, [dict(_rec_refs(el)) if _is_dict(el) else el for el in v] + else: + yield k, v # We do not annotate the return value here as the referencing library is not always @@ -540,7 +539,7 @@ def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> dict[str, Any]: """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" - if _use_referencing_library(): + if _USING_REFERENCING: registry = _get_referencing_registry(rootschema or schema) # Using a different variable name to show that this is not the # jsonschema.RefResolver but instead a Resolver from the referencing From af878d08e8458cbf4e299b7186d327eb291c9f1b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:29:36 +0100 Subject: [PATCH 08/92] refactor(typing): Use stubs type `_JsonParameter` https://github.com/python/typeshed/blob/937270df0c25dc56a02f7199f1943fdb7d47aa9d/stubs/jsonschema/jsonschema/protocols.pyi#L11 --- tools/schemapi/schemapi.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 4ccea8d73..2df9070d1 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -41,7 +41,7 @@ if TYPE_CHECKING: from typing import ClassVar - from jsonschema.protocols import Validator + from jsonschema.protocols import Validator, _JsonParameter from referencing import Registry from altair.typing import ChartType @@ -109,7 +109,7 @@ def debug_mode(arg: bool) -> Iterator[None]: @overload def validate_jsonschema( - spec: Any, + spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = ..., *, @@ -119,7 +119,7 @@ def validate_jsonschema( @overload def validate_jsonschema( - 
spec: Any, + spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = ..., *, @@ -128,7 +128,7 @@ def validate_jsonschema( def validate_jsonschema( - spec, + spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, *, @@ -169,7 +169,7 @@ def validate_jsonschema( # Everything else is skipped if this returns an empty `list` # TODO: Refactor to peek at possible error w/ `next(validator.iter_errors(spec))` def _get_errors_from_spec( - spec: dict[str, Any], + spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, ) -> ValidationErrorList: From 5ca44df69a970bc21f41303f758cc84e68ab9e72 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:32:34 +0100 Subject: [PATCH 09/92] refactor: Shorten some references to `ValidationError` --- tools/schemapi/schemapi.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 2df9070d1..271e7070d 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -41,6 +41,7 @@ if TYPE_CHECKING: from typing import ClassVar + from jsonschema import ValidationError from jsonschema.protocols import Validator, _JsonParameter from referencing import Registry @@ -56,7 +57,7 @@ else: from typing_extensions import Never, Self -ValidationErrorList: TypeAlias = List[jsonschema.exceptions.ValidationError] +ValidationErrorList: TypeAlias = List[jsonschema.ValidationError] GroupedValidationErrors: TypeAlias = Dict[str, ValidationErrorList] # This URI is arbitrary and could be anything else. It just cannot be an empty @@ -124,7 +125,7 @@ def validate_jsonschema( rootschema: dict[str, Any] | None = ..., *, raise_error: Literal[False], -) -> jsonschema.exceptions.ValidationError | None: ... +) -> ValidationError | None: ... 
def validate_jsonschema( @@ -133,7 +134,7 @@ def validate_jsonschema( rootschema: dict[str, Any] | None = None, *, raise_error: bool = True, -) -> jsonschema.exceptions.ValidationError | None: +) -> ValidationError | None: """ Validates the passed in spec against the schema in the context of the rootschema. @@ -398,7 +399,7 @@ def _deduplicate_errors( return grouped_errors_deduplicated -def _is_required_value_error(err: jsonschema.exceptions.ValidationError) -> bool: +def _is_required_value_error(err: ValidationError) -> bool: return err.validator == "required" and err.validator_value == ["value"] @@ -558,7 +559,7 @@ def _resolve_references( class SchemaValidationError(jsonschema.ValidationError): """A wrapper for jsonschema.ValidationError with friendlier traceback.""" - def __init__(self, obj: SchemaBase, err: jsonschema.ValidationError) -> None: + def __init__(self, obj: SchemaBase, err: ValidationError) -> None: super().__init__(**err._contents()) self.obj = obj self._errors: GroupedValidationErrors = getattr( @@ -614,7 +615,7 @@ def _get_message_for_errors_group( def _get_additional_properties_error_message( self, - error: jsonschema.exceptions.ValidationError, + error: ValidationError, ) -> str: """Output all existing parameters when an unknown parameter is specified.""" altair_cls = self._get_altair_class_for_error(error) @@ -633,9 +634,7 @@ def _get_additional_properties_error_message( See the help for `{altair_cls.__name__}` to read the full description of these parameters""" return message - def _get_altair_class_for_error( - self, error: jsonschema.exceptions.ValidationError - ) -> type[SchemaBase]: + def _get_altair_class_for_error(self, error: ValidationError) -> type[SchemaBase]: """ Try to get the lowest class possible in the chart hierarchy so it can be displayed in the error message. 
From de0701226515c42d13e4b0f46e24391110e99e37 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:51:45 +0100 Subject: [PATCH 10/92] perf: Redefine `_json_path` to be bound on `jsonschema` version Previously, using a version below `4.0.1` would still always check first if there was a property. This would not change between checks. Defining in this style removes the need for as much documentation, since the version guards are very clear when each branch is used. --- tools/schemapi/schemapi.py | 52 +++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 271e7070d..e7b37b25e 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -3,6 +3,7 @@ import contextlib import inspect import json +import operator import sys import textwrap from collections import defaultdict @@ -13,6 +14,7 @@ from typing import ( TYPE_CHECKING, Any, + Callable, Dict, Final, Iterable, @@ -28,7 +30,6 @@ from typing_extensions import TypeAlias import jsonschema -import jsonschema.exceptions import jsonschema.validators import narwhals.stable.v1 as nw from packaging.version import Version @@ -82,9 +83,31 @@ # class-level _class_is_valid_at_instantiation attribute to False DEBUG_MODE: bool = True +_JSONSCHEMA_VERSION = Version(importlib_version("jsonschema")) +_USING_REFERENCING: Final[bool] = _JSONSCHEMA_VERSION >= Version("4.18") # noqa: SIM300 +""" +``jsonschema`` deprecated ``RefResolver`` in favor of ``referencing``. 
+ +See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 +""" + +if _JSONSCHEMA_VERSION >= Version("4.0.1"): # noqa: SIM300 + _json_path: Callable[[ValidationError], str] = operator.attrgetter("json_path") +else: -_USING_REFERENCING: Final[bool] = Version(importlib_version("jsonschema")) >= Version("4.18") # fmt: off -"""In version 4.18.0, the ``jsonschema`` package deprecated RefResolver in favor of the ``referencing`` library.""" + def _json_path(err: ValidationError, /) -> str: + """ + Vendored backport for ``jsonschema.ValidationError.json_path`` property. + + See https://github.com/vega/altair/issues/3038. + """ + path = "$" + for elem in err.absolute_path: + if isinstance(elem, int): + path += "[" + str(elem) + "]" + else: + path += "." + elem + return path def enable_debug_mode() -> None: @@ -279,23 +302,7 @@ def _get_referencing_registry( ) -def _json_path(err: jsonschema.exceptions.ValidationError) -> str: - """ - Drop in replacement for the .json_path property of the jsonschema ValidationError class. - - This is not available as property for ValidationError with jsonschema<4.0.1. - - More info, see https://github.com/vega/altair/issues/3038. - """ - path = "$" - for elem in err.absolute_path: - if isinstance(elem, int): - path += "[" + str(elem) + "]" - else: - path += "." 
+ elem - return path - - +# NOTE: Review function (2) def _group_errors_by_json_path( errors: ValidationErrorList, ) -> GroupedValidationErrors: @@ -308,8 +315,7 @@ def _group_errors_by_json_path( """ errors_by_json_path = defaultdict(list) for err in errors: - err_key = getattr(err, "json_path", _json_path(err)) - errors_by_json_path[err_key].append(err) + errors_by_json_path[_json_path(err)].append(err) return dict(errors_by_json_path) @@ -563,7 +569,7 @@ def __init__(self, obj: SchemaBase, err: ValidationError) -> None: super().__init__(**err._contents()) self.obj = obj self._errors: GroupedValidationErrors = getattr( - err, "_all_errors", {getattr(err, "json_path", _json_path(err)): [err]} + err, "_all_errors", {_json_path(err): [err]} ) # This is the message from err self._original_message = self.message From 05976a6a03cb5ef4a9850083b8fb11bc46b7ff27 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:56:43 +0100 Subject: [PATCH 11/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 79 +++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index a0a2535f5..e8bf16f80 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -5,6 +5,7 @@ import contextlib import inspect import json +import operator import sys import textwrap from collections import defaultdict @@ -15,6 +16,7 @@ from typing import ( TYPE_CHECKING, Any, + Callable, Dict, Final, Iterable, @@ -30,7 +32,6 @@ from typing_extensions import TypeAlias import jsonschema -import jsonschema.exceptions import jsonschema.validators import narwhals.stable.v1 as nw from packaging.version import Version @@ -43,7 +44,8 @@ if TYPE_CHECKING: from typing import ClassVar - from jsonschema.protocols import Validator + from jsonschema import ValidationError + from jsonschema.protocols import Validator, _JsonParameter 
from referencing import Registry from altair.typing import ChartType @@ -58,7 +60,7 @@ else: from typing_extensions import Never, Self -ValidationErrorList: TypeAlias = List[jsonschema.exceptions.ValidationError] +ValidationErrorList: TypeAlias = List[jsonschema.ValidationError] GroupedValidationErrors: TypeAlias = Dict[str, ValidationErrorList] # This URI is arbitrary and could be anything else. It just cannot be an empty @@ -83,9 +85,31 @@ # class-level _class_is_valid_at_instantiation attribute to False DEBUG_MODE: bool = True +_JSONSCHEMA_VERSION = Version(importlib_version("jsonschema")) +_USING_REFERENCING: Final[bool] = _JSONSCHEMA_VERSION >= Version("4.18") # noqa: SIM300 +""" +``jsonschema`` deprecated ``RefResolver`` in favor of ``referencing``. + +See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 +""" + +if _JSONSCHEMA_VERSION >= Version("4.0.1"): # noqa: SIM300 + _json_path: Callable[[ValidationError], str] = operator.attrgetter("json_path") +else: -_USING_REFERENCING: Final[bool] = Version(importlib_version("jsonschema")) >= Version("4.18") # fmt: off -"""In version 4.18.0, the ``jsonschema`` package deprecated RefResolver in favor of the ``referencing`` library.""" + def _json_path(err: ValidationError, /) -> str: + """ + Vendored backport for ``jsonschema.ValidationError.json_path`` property. + + See https://github.com/vega/altair/issues/3038. + """ + path = "$" + for elem in err.absolute_path: + if isinstance(elem, int): + path += "[" + str(elem) + "]" + else: + path += "." 
+ elem + return path def enable_debug_mode() -> None: @@ -111,7 +135,7 @@ def debug_mode(arg: bool) -> Iterator[None]: @overload def validate_jsonschema( - spec: Any, + spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = ..., *, @@ -121,21 +145,21 @@ def validate_jsonschema( @overload def validate_jsonschema( - spec: Any, + spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = ..., *, raise_error: Literal[False], -) -> jsonschema.exceptions.ValidationError | None: ... +) -> ValidationError | None: ... def validate_jsonschema( - spec, + spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, *, raise_error: bool = True, -) -> jsonschema.exceptions.ValidationError | None: +) -> ValidationError | None: """ Validates the passed in spec against the schema in the context of the rootschema. @@ -171,7 +195,7 @@ def validate_jsonschema( # Everything else is skipped if this returns an empty `list` # TODO: Refactor to peek at possible error w/ `next(validator.iter_errors(spec))` def _get_errors_from_spec( - spec: dict[str, Any], + spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, ) -> ValidationErrorList: @@ -280,23 +304,7 @@ def _get_referencing_registry( ) -def _json_path(err: jsonschema.exceptions.ValidationError) -> str: - """ - Drop in replacement for the .json_path property of the jsonschema ValidationError class. - - This is not available as property for ValidationError with jsonschema<4.0.1. - - More info, see https://github.com/vega/altair/issues/3038. - """ - path = "$" - for elem in err.absolute_path: - if isinstance(elem, int): - path += "[" + str(elem) + "]" - else: - path += "." 
+ elem - return path - - +# NOTE: Review function (2) def _group_errors_by_json_path( errors: ValidationErrorList, ) -> GroupedValidationErrors: @@ -309,8 +317,7 @@ def _group_errors_by_json_path( """ errors_by_json_path = defaultdict(list) for err in errors: - err_key = getattr(err, "json_path", _json_path(err)) - errors_by_json_path[err_key].append(err) + errors_by_json_path[_json_path(err)].append(err) return dict(errors_by_json_path) @@ -400,7 +407,7 @@ def _deduplicate_errors( return grouped_errors_deduplicated -def _is_required_value_error(err: jsonschema.exceptions.ValidationError) -> bool: +def _is_required_value_error(err: ValidationError) -> bool: return err.validator == "required" and err.validator_value == ["value"] @@ -560,11 +567,11 @@ def _resolve_references( class SchemaValidationError(jsonschema.ValidationError): """A wrapper for jsonschema.ValidationError with friendlier traceback.""" - def __init__(self, obj: SchemaBase, err: jsonschema.ValidationError) -> None: + def __init__(self, obj: SchemaBase, err: ValidationError) -> None: super().__init__(**err._contents()) self.obj = obj self._errors: GroupedValidationErrors = getattr( - err, "_all_errors", {getattr(err, "json_path", _json_path(err)): [err]} + err, "_all_errors", {_json_path(err): [err]} ) # This is the message from err self._original_message = self.message @@ -616,7 +623,7 @@ def _get_message_for_errors_group( def _get_additional_properties_error_message( self, - error: jsonschema.exceptions.ValidationError, + error: ValidationError, ) -> str: """Output all existing parameters when an unknown parameter is specified.""" altair_cls = self._get_altair_class_for_error(error) @@ -635,9 +642,7 @@ def _get_additional_properties_error_message( See the help for `{altair_cls.__name__}` to read the full description of these parameters""" return message - def _get_altair_class_for_error( - self, error: jsonschema.exceptions.ValidationError - ) -> type[SchemaBase]: + def 
_get_altair_class_for_error(self, error: ValidationError) -> type[SchemaBase]: """ Try to get the lowest class possible in the chart hierarchy so it can be displayed in the error message. From af6e41ce0c8476b28f329528d1442f1ba44751e0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 19 Aug 2024 23:19:50 +0100 Subject: [PATCH 12/92] perf(DRAFT): Initial lazy validation First non-failing version. Have left most of the original code in. Planning to migrate & adapt the comments before removing. # --- altair/utils/schemapi.py | 229 ++++++++++++++++++++++++++++++------- tools/schemapi/schemapi.py | 229 ++++++++++++++++++++++++++++++------- 2 files changed, 380 insertions(+), 78 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index e8bf16f80..93cbb06c1 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -11,7 +11,7 @@ from collections import defaultdict from functools import partial from importlib.metadata import version as importlib_version -from itertools import chain, zip_longest +from itertools import chain, groupby, islice, zip_longest from math import ceil from typing import ( TYPE_CHECKING, @@ -34,6 +34,7 @@ import jsonschema import jsonschema.validators import narwhals.stable.v1 as nw +from jsonschema import ValidationError from packaging.version import Version # This leads to circular imports with the vegalite module. Currently, this works @@ -44,7 +45,6 @@ if TYPE_CHECKING: from typing import ClassVar - from jsonschema import ValidationError from jsonschema.protocols import Validator, _JsonParameter from referencing import Registry @@ -63,27 +63,40 @@ ValidationErrorList: TypeAlias = List[jsonschema.ValidationError] GroupedValidationErrors: TypeAlias = Dict[str, ValidationErrorList] -# This URI is arbitrary and could be anything else. It just cannot be an empty -# string as we need to reference the schema registered in -# the referencing.Registry. 
_VEGA_LITE_ROOT_URI: Final = "urn:vega-lite-schema" +""" +Prefix added to each ``"$ref"``. + +This URI is arbitrary and could be anything else. + +It just cannot be an empty string as we need to reference the schema registered in +the ``referencing.Registry``.""" -# Ideally, jsonschema specification would be parsed from the current Vega-Lite -# schema instead of being hardcoded here as a default value. -# However, due to circular imports between this module and the altair.vegalite -# modules, this information is not yet available at this point as altair.vegalite -# is only partially loaded. The draft version which is used is unlikely to -# change often so it's ok to keep this. There is also a test which validates -# that this value is always the same as in the Vega-Lite schema. _DEFAULT_JSON_SCHEMA_DRAFT_URL: Final = "http://json-schema.org/draft-07/schema#" +""" +Ideally, jsonschema specification would be parsed from the current Vega-Lite +schema instead of being hardcoded here as a default value. +However, due to circular imports between this module and the ``alt.vegalite`` +modules, this information is not yet available at this point as ``alt.vegalite`` +is only partially loaded. + +The draft version which is used is unlikely to change often so it's ok to keep this. +There is also a test which validates that this value is always the same as in the Vega-Lite schema. +""" -# If DEBUG_MODE is True, then schema objects are converted to dict and -# validated at creation time. This slows things down, particularly for -# larger specs, but leads to much more useful tracebacks for the user. -# Individual schema classes can override this by setting the -# class-level _class_is_valid_at_instantiation attribute to False DEBUG_MODE: bool = True +""" +If ``DEBUG_MODE``, then ``SchemaBase`` are converted to ``dict`` and validated at creation time. + +This slows things down, particularly for larger specs, but leads to much more +useful tracebacks for the user. 
+ +Individual schema classes can override with: + + class Derived(SchemaBase): + _class_is_valid_at_instantiation: ClassVar[bool] = False +""" _JSONSCHEMA_VERSION = Version(importlib_version("jsonschema")) _USING_REFERENCING: Final[bool] = _JSONSCHEMA_VERSION >= Version("4.18") # noqa: SIM300 @@ -141,8 +154,6 @@ def validate_jsonschema( *, raise_error: Literal[True] = ..., ) -> Never: ... - - @overload def validate_jsonschema( spec: _JsonParameter, @@ -151,8 +162,6 @@ def validate_jsonschema( *, raise_error: Literal[False], ) -> ValidationError | None: ... - - def validate_jsonschema( spec: _JsonParameter, schema: dict[str, Any], @@ -167,8 +176,9 @@ def validate_jsonschema( and only the most relevant errors are kept. Errors are then either raised or returned, depending on the value of `raise_error`. """ - errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) - if errors: + it_errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) + if first_error := next(it_errors, None): + errors = [first_error, *it_errors] leaf_errors = _get_leaves_of_error_tree(errors) grouped_errors = _group_errors_by_json_path(leaf_errors) grouped_errors = _subset_to_most_specific_json_paths(grouped_errors) @@ -182,7 +192,7 @@ def validate_jsonschema( # error message. Setting a new attribute like this is not ideal as # it then no longer matches the type ValidationError. It would be better # to refactor this function to never raise but only return errors. - main_error._all_errors = grouped_errors + main_error._errors = list(grouped_errors.values()) if raise_error: raise main_error else: @@ -191,14 +201,50 @@ def validate_jsonschema( return None -# NOTE: Entry for creating a `list` of errors -# Everything else is skipped if this returns an empty `list` -# TODO: Refactor to peek at possible error w/ `next(validator.iter_errors(spec))` +def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: + """ + Continue an iterator at the last popped ``element``. 
+ + Equivalent to:: + + elements = 1, 2, 3, 4, 5 + it = iter(elements) + element = next(it) + it_continue = chain([element], it) + + """ + yield element + yield from others + + +def lazy_validate_json_schema( + spec: _JsonParameter, + schema: dict[str, Any], + rootschema: dict[str, Any] | None = None, +) -> None: + """Lazy equivalent of `validate_jsonschema`.""" + it_errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) + if first_error := next(it_errors, None): + groups = _lazy_group_tree_leaves(_rechain(first_error, it_errors)) + most_specific = _lazy_subset_to_most_specific_json_paths(groups) + deduplicated = _lazy_deduplicate_errors(most_specific) + dummy_error: Any + if dummy_error := next(deduplicated, None): + dummy_error._errors = _rechain(dummy_error, deduplicated) # type: ignore[attr-defined] + raise dummy_error + else: + msg = ( + f"Expected to find at least one error, but first error was `None`.\n\n" + f"spec: {spec!r}" + ) + raise NotImplementedError(msg) + + def _get_errors_from_spec( spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, -) -> ValidationErrorList: +) -> Iterator[ValidationError]: """ Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. @@ -240,8 +286,7 @@ def _get_errors_from_spec( ) validator = validator_cls(schema, **validator_kwargs) - errors = list(validator.iter_errors(spec)) - return errors + return validator.iter_errors(spec) def _get_json_schema_draft_url(schema: dict[str, Any]) -> str: @@ -304,9 +349,8 @@ def _get_referencing_registry( ) -# NOTE: Review function (2) def _group_errors_by_json_path( - errors: ValidationErrorList, + errors: Iterable[ValidationError], ) -> GroupedValidationErrors: """ Groups errors by the `json_path` attribute of the jsonschema ValidationError class. 
@@ -342,6 +386,109 @@ def _get_leaves_of_error_tree( return leaves +def _lazy_group_tree_leaves( + errors: Iterable[ValidationError], / +) -> Iterator[tuple[str, ValidationError]]: + """ + Combines 3 previously distinct steps: + + - ``_get_leaves_of_error_tree`` + - (part of) ``_group_errors_by_json_path`` + - Doesnt actually group yet, can by calling `dict(result)`. + - ``_is_required_value_error`` + """ # noqa: D400 + for err in errors: + if err_context := err.context: + yield from _lazy_group_tree_leaves(err_context) + elif err.validator == "required" and err.validator_value == ["value"]: + continue + else: + yield _json_path(err), err + + +_fn_path = cast("Callable[[tuple[str, ValidationError]], str]", operator.itemgetter(0)) +"""Key function for ``(json_path, ValidationError)``.""" +_fn_validator = cast("Callable[[ValidationError], str]", operator.attrgetter("validator")) # fmt: off +"""Key function for ``ValidationError.validator``.""" + + +def _lazy_subset_to_most_specific_json_paths( + json_path_errors: Iterator[tuple[str, ValidationError]], / +) -> Iterator[Iterable[ValidationError]]: + """ + Currently using a `list`, but typing it more restrictive to see if it can be avoided. 
+ + - Needs to be sorted to work with groupby + - Reversing allows prioritising more specific groups, since they are seen first + - Then re-reversed, to keep seen order + + """ + rev_sort = sorted(json_path_errors, key=_fn_path, reverse=True) + keeping: dict[str, Iterable[ValidationError]] = {} + for unique_path, grouped_errors in groupby(rev_sort, key=_fn_path): + if any(seen.startswith(unique_path) for seen in keeping): + continue + else: + keeping[unique_path] = [err for _, err in grouped_errors] + yield from reversed(keeping.values()) + + +def _lazy_deduplicate_errors( + grouped_errors: Iterator[Iterable[ValidationError]], / +) -> Iterator[ValidationError]: + for element_errors in grouped_errors: + for validator, errors in groupby( + sorted(element_errors, key=_fn_validator), key=_fn_validator + ): + if validator == "additionalProperties": + errors = _lazy_additional_properties(errors) + elif validator == "enum": + errors = _lazy_enum(errors) + yield from _lazy_unique_message(errors) + + +def _lazy_unique_message( + iterable: Iterable[ValidationError], / +) -> Iterator[ValidationError]: + seen = set() + for el in iterable: + if el.message not in seen: + seen.add(el.message) + yield el + + +def _lazy_additional_properties( + iterable: Iterable[ValidationError], / +) -> Iterator[ValidationError]: + it = iter(iterable) + first = next(it) + if ( + parent := cast("ValidationError", first.parent) + ) and parent.validator == "anyOf": + yield min(_rechain(first, it), key=lambda x: len(x.message)) + else: + yield first + + +def _lazy_enum(iterable: Iterable[ValidationError], /) -> Iterator[ValidationError]: + """ + Temporary reusing the eager version to isolate issues. + + The 3 errors rule applies per group. 
+ """ + # FIXME: Too simple + # Need to do an eager pass, as this skips intersections of non-overlapping enums + # yield reduce(_enum_inner, iterable) + yield from _deduplicate_enum_errors(list(iterable)) + + +def _enum_inner(prev: ValidationError, current: ValidationError, /) -> ValidationError: + """**Disabled**.""" + longest = set(cast("list[str]", prev.validator_value)) + contender = set(cast("list[str]", current.validator_value)) + return current if contender.issuperset(longest) else prev + + def _subset_to_most_specific_json_paths( errors_by_json_path: GroupedValidationErrors, ) -> GroupedValidationErrors: @@ -570,9 +717,8 @@ class SchemaValidationError(jsonschema.ValidationError): def __init__(self, obj: SchemaBase, err: ValidationError) -> None: super().__init__(**err._contents()) self.obj = obj - self._errors: GroupedValidationErrors = getattr( - err, "_all_errors", {_json_path(err): [err]} - ) + err = cast("SchemaValidationError", err) + self._errors: Iterable[ValidationError] = err._errors # This is the message from err self._original_message = self.message self.message = self._get_message() @@ -592,7 +738,10 @@ def indent_second_line_onwards(message: str, indent: int = 4) -> str: error_messages: list[str] = [] # Only show a maximum of 3 errors as else the final message returned by this # method could get very long. 
- for errors in list(self._errors.values())[:3]: + # ^^^^^^^^^^ + # CORRECTION: Only show 3 **json_paths** + + for errors in islice(_group_errors_by_json_path(self._errors).values(), 3): error_messages.append(self._get_message_for_errors_group(errors)) message = "" @@ -1197,7 +1346,9 @@ def validate( schema = cls._schema # For the benefit of mypy assert schema is not None - validate_jsonschema(instance, schema, rootschema=cls._rootschema or cls._schema) + lazy_validate_json_schema( + instance, schema, rootschema=cls._rootschema or cls._schema + ) @classmethod def resolve_references(cls, schema: dict[str, Any] | None = None) -> dict[str, Any]: @@ -1223,7 +1374,7 @@ def validate_property( np_opt = sys.modules.get("numpy") value = _todict(value, context={}, np_opt=np_opt, pd_opt=pd_opt) props = cls.resolve_references(schema or cls._schema).get("properties", {}) - validate_jsonschema( + lazy_validate_json_schema( value, props.get(name, {}), rootschema=cls._rootschema or cls._schema ) @@ -1394,7 +1545,7 @@ def from_dict( schemas = resolved.get("anyOf", []) + resolved.get("oneOf", []) for possible in schemas: try: - validate_jsonschema(dct, possible, rootschema=root_schema) + lazy_validate_json_schema(dct, possible, rootschema=root_schema) except jsonschema.ValidationError: continue else: diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index e7b37b25e..13d75946c 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -9,7 +9,7 @@ from collections import defaultdict from functools import partial from importlib.metadata import version as importlib_version -from itertools import chain, zip_longest +from itertools import chain, groupby, islice, zip_longest from math import ceil from typing import ( TYPE_CHECKING, @@ -32,6 +32,7 @@ import jsonschema import jsonschema.validators import narwhals.stable.v1 as nw +from jsonschema import ValidationError from packaging.version import Version # This leads to circular imports with the vegalite 
module. Currently, this works @@ -42,7 +43,6 @@ if TYPE_CHECKING: from typing import ClassVar - from jsonschema import ValidationError from jsonschema.protocols import Validator, _JsonParameter from referencing import Registry @@ -61,27 +61,40 @@ ValidationErrorList: TypeAlias = List[jsonschema.ValidationError] GroupedValidationErrors: TypeAlias = Dict[str, ValidationErrorList] -# This URI is arbitrary and could be anything else. It just cannot be an empty -# string as we need to reference the schema registered in -# the referencing.Registry. _VEGA_LITE_ROOT_URI: Final = "urn:vega-lite-schema" +""" +Prefix added to each ``"$ref"``. + +This URI is arbitrary and could be anything else. + +It just cannot be an empty string as we need to reference the schema registered in +the ``referencing.Registry``.""" -# Ideally, jsonschema specification would be parsed from the current Vega-Lite -# schema instead of being hardcoded here as a default value. -# However, due to circular imports between this module and the altair.vegalite -# modules, this information is not yet available at this point as altair.vegalite -# is only partially loaded. The draft version which is used is unlikely to -# change often so it's ok to keep this. There is also a test which validates -# that this value is always the same as in the Vega-Lite schema. _DEFAULT_JSON_SCHEMA_DRAFT_URL: Final = "http://json-schema.org/draft-07/schema#" +""" +Ideally, jsonschema specification would be parsed from the current Vega-Lite +schema instead of being hardcoded here as a default value. +However, due to circular imports between this module and the ``alt.vegalite`` +modules, this information is not yet available at this point as ``alt.vegalite`` +is only partially loaded. + +The draft version which is used is unlikely to change often so it's ok to keep this. +There is also a test which validates that this value is always the same as in the Vega-Lite schema. 
+""" -# If DEBUG_MODE is True, then schema objects are converted to dict and -# validated at creation time. This slows things down, particularly for -# larger specs, but leads to much more useful tracebacks for the user. -# Individual schema classes can override this by setting the -# class-level _class_is_valid_at_instantiation attribute to False DEBUG_MODE: bool = True +""" +If ``DEBUG_MODE``, then ``SchemaBase`` are converted to ``dict`` and validated at creation time. + +This slows things down, particularly for larger specs, but leads to much more +useful tracebacks for the user. + +Individual schema classes can override with: + + class Derived(SchemaBase): + _class_is_valid_at_instantiation: ClassVar[bool] = False +""" _JSONSCHEMA_VERSION = Version(importlib_version("jsonschema")) _USING_REFERENCING: Final[bool] = _JSONSCHEMA_VERSION >= Version("4.18") # noqa: SIM300 @@ -139,8 +152,6 @@ def validate_jsonschema( *, raise_error: Literal[True] = ..., ) -> Never: ... - - @overload def validate_jsonschema( spec: _JsonParameter, @@ -149,8 +160,6 @@ def validate_jsonschema( *, raise_error: Literal[False], ) -> ValidationError | None: ... - - def validate_jsonschema( spec: _JsonParameter, schema: dict[str, Any], @@ -165,8 +174,9 @@ def validate_jsonschema( and only the most relevant errors are kept. Errors are then either raised or returned, depending on the value of `raise_error`. """ - errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) - if errors: + it_errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) + if first_error := next(it_errors, None): + errors = [first_error, *it_errors] leaf_errors = _get_leaves_of_error_tree(errors) grouped_errors = _group_errors_by_json_path(leaf_errors) grouped_errors = _subset_to_most_specific_json_paths(grouped_errors) @@ -180,7 +190,7 @@ def validate_jsonschema( # error message. Setting a new attribute like this is not ideal as # it then no longer matches the type ValidationError. 
It would be better # to refactor this function to never raise but only return errors. - main_error._all_errors = grouped_errors + main_error._errors = list(grouped_errors.values()) if raise_error: raise main_error else: @@ -189,14 +199,50 @@ def validate_jsonschema( return None -# NOTE: Entry for creating a `list` of errors -# Everything else is skipped if this returns an empty `list` -# TODO: Refactor to peek at possible error w/ `next(validator.iter_errors(spec))` +def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: + """ + Continue an iterator at the last popped ``element``. + + Equivalent to:: + + elements = 1, 2, 3, 4, 5 + it = iter(elements) + element = next(it) + it_continue = chain([element], it) + + """ + yield element + yield from others + + +def lazy_validate_json_schema( + spec: _JsonParameter, + schema: dict[str, Any], + rootschema: dict[str, Any] | None = None, +) -> None: + """Lazy equivalent of `validate_jsonschema`.""" + it_errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) + if first_error := next(it_errors, None): + groups = _lazy_group_tree_leaves(_rechain(first_error, it_errors)) + most_specific = _lazy_subset_to_most_specific_json_paths(groups) + deduplicated = _lazy_deduplicate_errors(most_specific) + dummy_error: Any + if dummy_error := next(deduplicated, None): + dummy_error._errors = _rechain(dummy_error, deduplicated) # type: ignore[attr-defined] + raise dummy_error + else: + msg = ( + f"Expected to find at least one error, but first error was `None`.\n\n" + f"spec: {spec!r}" + ) + raise NotImplementedError(msg) + + def _get_errors_from_spec( spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, -) -> ValidationErrorList: +) -> Iterator[ValidationError]: """ Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. 
@@ -238,8 +284,7 @@ def _get_errors_from_spec( ) validator = validator_cls(schema, **validator_kwargs) - errors = list(validator.iter_errors(spec)) - return errors + return validator.iter_errors(spec) def _get_json_schema_draft_url(schema: dict[str, Any]) -> str: @@ -302,9 +347,8 @@ def _get_referencing_registry( ) -# NOTE: Review function (2) def _group_errors_by_json_path( - errors: ValidationErrorList, + errors: Iterable[ValidationError], ) -> GroupedValidationErrors: """ Groups errors by the `json_path` attribute of the jsonschema ValidationError class. @@ -340,6 +384,109 @@ def _get_leaves_of_error_tree( return leaves +def _lazy_group_tree_leaves( + errors: Iterable[ValidationError], / +) -> Iterator[tuple[str, ValidationError]]: + """ + Combines 3 previously distinct steps: + + - ``_get_leaves_of_error_tree`` + - (part of) ``_group_errors_by_json_path`` + - Doesnt actually group yet, can by calling `dict(result)`. + - ``_is_required_value_error`` + """ # noqa: D400 + for err in errors: + if err_context := err.context: + yield from _lazy_group_tree_leaves(err_context) + elif err.validator == "required" and err.validator_value == ["value"]: + continue + else: + yield _json_path(err), err + + +_fn_path = cast("Callable[[tuple[str, ValidationError]], str]", operator.itemgetter(0)) +"""Key function for ``(json_path, ValidationError)``.""" +_fn_validator = cast("Callable[[ValidationError], str]", operator.attrgetter("validator")) # fmt: off +"""Key function for ``ValidationError.validator``.""" + + +def _lazy_subset_to_most_specific_json_paths( + json_path_errors: Iterator[tuple[str, ValidationError]], / +) -> Iterator[Iterable[ValidationError]]: + """ + Currently using a `list`, but typing it more restrictive to see if it can be avoided. 
+ + - Needs to be sorted to work with groupby + - Reversing allows prioritising more specific groups, since they are seen first + - Then re-reversed, to keep seen order + + """ + rev_sort = sorted(json_path_errors, key=_fn_path, reverse=True) + keeping: dict[str, Iterable[ValidationError]] = {} + for unique_path, grouped_errors in groupby(rev_sort, key=_fn_path): + if any(seen.startswith(unique_path) for seen in keeping): + continue + else: + keeping[unique_path] = [err for _, err in grouped_errors] + yield from reversed(keeping.values()) + + +def _lazy_deduplicate_errors( + grouped_errors: Iterator[Iterable[ValidationError]], / +) -> Iterator[ValidationError]: + for element_errors in grouped_errors: + for validator, errors in groupby( + sorted(element_errors, key=_fn_validator), key=_fn_validator + ): + if validator == "additionalProperties": + errors = _lazy_additional_properties(errors) + elif validator == "enum": + errors = _lazy_enum(errors) + yield from _lazy_unique_message(errors) + + +def _lazy_unique_message( + iterable: Iterable[ValidationError], / +) -> Iterator[ValidationError]: + seen = set() + for el in iterable: + if el.message not in seen: + seen.add(el.message) + yield el + + +def _lazy_additional_properties( + iterable: Iterable[ValidationError], / +) -> Iterator[ValidationError]: + it = iter(iterable) + first = next(it) + if ( + parent := cast("ValidationError", first.parent) + ) and parent.validator == "anyOf": + yield min(_rechain(first, it), key=lambda x: len(x.message)) + else: + yield first + + +def _lazy_enum(iterable: Iterable[ValidationError], /) -> Iterator[ValidationError]: + """ + Temporary reusing the eager version to isolate issues. + + The 3 errors rule applies per group. 
+ """ + # FIXME: Too simple + # Need to do an eager pass, as this skips intersections of non-overlapping enums + # yield reduce(_enum_inner, iterable) + yield from _deduplicate_enum_errors(list(iterable)) + + +def _enum_inner(prev: ValidationError, current: ValidationError, /) -> ValidationError: + """**Disabled**.""" + longest = set(cast("list[str]", prev.validator_value)) + contender = set(cast("list[str]", current.validator_value)) + return current if contender.issuperset(longest) else prev + + def _subset_to_most_specific_json_paths( errors_by_json_path: GroupedValidationErrors, ) -> GroupedValidationErrors: @@ -568,9 +715,8 @@ class SchemaValidationError(jsonschema.ValidationError): def __init__(self, obj: SchemaBase, err: ValidationError) -> None: super().__init__(**err._contents()) self.obj = obj - self._errors: GroupedValidationErrors = getattr( - err, "_all_errors", {_json_path(err): [err]} - ) + err = cast("SchemaValidationError", err) + self._errors: Iterable[ValidationError] = err._errors # This is the message from err self._original_message = self.message self.message = self._get_message() @@ -590,7 +736,10 @@ def indent_second_line_onwards(message: str, indent: int = 4) -> str: error_messages: list[str] = [] # Only show a maximum of 3 errors as else the final message returned by this # method could get very long. 
- for errors in list(self._errors.values())[:3]: + # ^^^^^^^^^^ + # CORRECTION: Only show 3 **json_paths** + + for errors in islice(_group_errors_by_json_path(self._errors).values(), 3): error_messages.append(self._get_message_for_errors_group(errors)) message = "" @@ -1195,7 +1344,9 @@ def validate( schema = cls._schema # For the benefit of mypy assert schema is not None - validate_jsonschema(instance, schema, rootschema=cls._rootschema or cls._schema) + lazy_validate_json_schema( + instance, schema, rootschema=cls._rootschema or cls._schema + ) @classmethod def resolve_references(cls, schema: dict[str, Any] | None = None) -> dict[str, Any]: @@ -1221,7 +1372,7 @@ def validate_property( np_opt = sys.modules.get("numpy") value = _todict(value, context={}, np_opt=np_opt, pd_opt=pd_opt) props = cls.resolve_references(schema or cls._schema).get("properties", {}) - validate_jsonschema( + lazy_validate_json_schema( value, props.get(name, {}), rootschema=cls._rootschema or cls._schema ) @@ -1392,7 +1543,7 @@ def from_dict( schemas = resolved.get("anyOf", []) + resolved.get("oneOf", []) for possible in schemas: try: - validate_jsonschema(dct, possible, rootschema=root_schema) + lazy_validate_json_schema(dct, possible, rootschema=root_schema) except jsonschema.ValidationError: continue else: From c261cb46510ce680d99930bfc9a9f1f4762d1d9b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:34:50 +0100 Subject: [PATCH 13/92] perf: Replace message length lambda https://github.com/vega/altair/pull/3547#discussion_r1722938370 --- altair/utils/schemapi.py | 7 ++++++- tools/schemapi/schemapi.py | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 93cbb06c1..b8dcc05a8 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -465,11 +465,16 @@ def _lazy_additional_properties( if ( parent := cast("ValidationError", 
first.parent) ) and parent.validator == "anyOf": - yield min(_rechain(first, it), key=lambda x: len(x.message)) + yield min(_rechain(first, it), key=_message_len) else: yield first +def _message_len(err: ValidationError, /) -> int: + """Return length of a ``ValidationError`` message.""" + return len(err.message) + + def _lazy_enum(iterable: Iterable[ValidationError], /) -> Iterator[ValidationError]: """ Temporary reusing the eager version to isolate issues. diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 13d75946c..2d63ecdf9 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -463,11 +463,16 @@ def _lazy_additional_properties( if ( parent := cast("ValidationError", first.parent) ) and parent.validator == "anyOf": - yield min(_rechain(first, it), key=lambda x: len(x.message)) + yield min(_rechain(first, it), key=_message_len) else: yield first +def _message_len(err: ValidationError, /) -> int: + """Return length of a ``ValidationError`` message.""" + return len(err.message) + + def _lazy_enum(iterable: Iterable[ValidationError], /) -> Iterator[ValidationError]: """ Temporary reusing the eager version to isolate issues. 
From bbf6f142ac6a4df0dc6d7e3ca326b9a8d32b413e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:01:51 +0100 Subject: [PATCH 14/92] perf: Use `islice` earlier to prune errors https://github.com/vega/altair/pull/3547#discussion_r1723026972 --- altair/utils/schemapi.py | 4 ++-- tools/schemapi/schemapi.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index b8dcc05a8..535a63cfe 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -430,7 +430,7 @@ def _lazy_subset_to_most_specific_json_paths( continue else: keeping[unique_path] = [err for _, err in grouped_errors] - yield from reversed(keeping.values()) + yield from islice(reversed(keeping.values()), 3) def _lazy_deduplicate_errors( @@ -746,7 +746,7 @@ def indent_second_line_onwards(message: str, indent: int = 4) -> str: # ^^^^^^^^^^ # CORRECTION: Only show 3 **json_paths** - for errors in islice(_group_errors_by_json_path(self._errors).values(), 3): + for errors in _group_errors_by_json_path(self._errors).values(): error_messages.append(self._get_message_for_errors_group(errors)) message = "" diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 2d63ecdf9..6cb154065 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -428,7 +428,7 @@ def _lazy_subset_to_most_specific_json_paths( continue else: keeping[unique_path] = [err for _, err in grouped_errors] - yield from reversed(keeping.values()) + yield from islice(reversed(keeping.values()), 3) def _lazy_deduplicate_errors( @@ -744,7 +744,7 @@ def indent_second_line_onwards(message: str, indent: int = 4) -> str: # ^^^^^^^^^^ # CORRECTION: Only show 3 **json_paths** - for errors in islice(_group_errors_by_json_path(self._errors).values(), 3): + for errors in _group_errors_by_json_path(self._errors).values(): 
error_messages.append(self._get_message_for_errors_group(errors)) message = "" From 2d20db7dab7715a541aa3e19c0e149dfaa686ace Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:17:07 +0100 Subject: [PATCH 15/92] ci: Add temporary script for `hatch run validation` **Remove before review**. Using for quicker feedback loop, where running mypy and all tests are not beneficial --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 4a0c3874c..d061df401 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -130,6 +130,13 @@ update-init-file = [ "ruff format .", ] +# Much more isolated, focused purely on a faster `schemapi.py` rebuild/test loop. +validation = [ + "mypy tools/schemapi/schemapi.py", + "python tools/generate_schema_wrapper.py", + "pytest -k test_schemapi tests {args}", +] + [tool.hatch.envs.hatch-test] # https://hatch.pypa.io/latest/tutorials/testing/overview/ features = ["all", "dev", "doc"] From ad06d080d561009d6ce0fddd228364f6e82c5171 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 20 Aug 2024 14:09:13 +0100 Subject: [PATCH 16/92] feat(perf): Adds `_lazy_deduplicate_enum` https://github.com/vega/altair/pull/3547#discussion_r1723029403 --- altair/utils/schemapi.py | 29 +++++++++++------------------ tools/schemapi/schemapi.py | 29 +++++++++++------------------ 2 files changed, 22 insertions(+), 36 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 535a63cfe..3e1d75e53 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -443,7 +443,7 @@ def _lazy_deduplicate_errors( if validator == "additionalProperties": errors = _lazy_additional_properties(errors) elif validator == "enum": - errors = _lazy_enum(errors) + errors = _lazy_deduplicate_enum(errors) yield from _lazy_unique_message(errors) @@ -475,23 +475,16 @@ def _message_len(err: 
ValidationError, /) -> int: return len(err.message) -def _lazy_enum(iterable: Iterable[ValidationError], /) -> Iterator[ValidationError]: - """ - Temporary reusing the eager version to isolate issues. - - The 3 errors rule applies per group. - """ - # FIXME: Too simple - # Need to do an eager pass, as this skips intersections of non-overlapping enums - # yield reduce(_enum_inner, iterable) - yield from _deduplicate_enum_errors(list(iterable)) - - -def _enum_inner(prev: ValidationError, current: ValidationError, /) -> ValidationError: - """**Disabled**.""" - longest = set(cast("list[str]", prev.validator_value)) - contender = set(cast("list[str]", current.validator_value)) - return current if contender.issuperset(longest) else prev +def _lazy_deduplicate_enum( + iterable: Iterable[ValidationError], / +) -> Iterator[ValidationError]: + """Skip any``"enum"`` errors that are a subset of another error.""" + enums: tuple[set[str], ...] + errors: tuple[ValidationError, ...] + enums, errors = zip(*((set(err.validator_value), err) for err in iterable)) # type: ignore[arg-type] + for cur_enum, err in zip(enums, errors): + if not any(cur_enum < e for e in enums if e != cur_enum): + yield err def _subset_to_most_specific_json_paths( diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 6cb154065..b4995c070 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -441,7 +441,7 @@ def _lazy_deduplicate_errors( if validator == "additionalProperties": errors = _lazy_additional_properties(errors) elif validator == "enum": - errors = _lazy_enum(errors) + errors = _lazy_deduplicate_enum(errors) yield from _lazy_unique_message(errors) @@ -473,23 +473,16 @@ def _message_len(err: ValidationError, /) -> int: return len(err.message) -def _lazy_enum(iterable: Iterable[ValidationError], /) -> Iterator[ValidationError]: - """ - Temporary reusing the eager version to isolate issues. - - The 3 errors rule applies per group. 
- """ - # FIXME: Too simple - # Need to do an eager pass, as this skips intersections of non-overlapping enums - # yield reduce(_enum_inner, iterable) - yield from _deduplicate_enum_errors(list(iterable)) - - -def _enum_inner(prev: ValidationError, current: ValidationError, /) -> ValidationError: - """**Disabled**.""" - longest = set(cast("list[str]", prev.validator_value)) - contender = set(cast("list[str]", current.validator_value)) - return current if contender.issuperset(longest) else prev +def _lazy_deduplicate_enum( + iterable: Iterable[ValidationError], / +) -> Iterator[ValidationError]: + """Skip any``"enum"`` errors that are a subset of another error.""" + enums: tuple[set[str], ...] + errors: tuple[ValidationError, ...] + enums, errors = zip(*((set(err.validator_value), err) for err in iterable)) # type: ignore[arg-type] + for cur_enum, err in zip(enums, errors): + if not any(cur_enum < e for e in enums if e != cur_enum): + yield err def _subset_to_most_specific_json_paths( From af17d441db83cabe7fcd2088eaa2df75bc00acaa Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 21 Aug 2024 00:18:58 +0100 Subject: [PATCH 17/92] refactor(perf): Tidying up, more lazy ops, renaming Note to self: Fill out comments re individual changes. 
--- altair/utils/schemapi.py | 637 ++++++++++++++----------------------- tools/schemapi/schemapi.py | 637 ++++++++++++++----------------------- 2 files changed, 464 insertions(+), 810 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 3e1d75e53..248193a85 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -21,15 +21,14 @@ Final, Iterable, Iterator, + KeysView, List, - Literal, Sequence, TypeVar, Union, cast, overload, ) -from typing_extensions import TypeAlias import jsonschema import jsonschema.validators @@ -37,11 +36,6 @@ from jsonschema import ValidationError from packaging.version import Version -# This leads to circular imports with the vegalite module. Currently, this works -# but be aware that when you access it in this script, the vegalite module might -# not yet be fully instantiated in case your code is being executed during import time -from altair import vegalite - if TYPE_CHECKING: from typing import ClassVar @@ -59,9 +53,15 @@ from typing import Never, Self else: from typing_extensions import Never, Self + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + _Errs: TypeAlias = Iterable[ValidationError] + _ErrsLazy: TypeAlias = Iterator[ValidationError] + _ErrsLazyGroup: TypeAlias = Iterator[_ErrsLazy] + _IntoLazyGroup: TypeAlias = Iterator["tuple[str, ValidationError]"] -ValidationErrorList: TypeAlias = List[jsonschema.ValidationError] -GroupedValidationErrors: TypeAlias = Dict[str, ValidationErrorList] _VEGA_LITE_ROOT_URI: Final = "urn:vega-lite-schema" """ @@ -146,61 +146,6 @@ def debug_mode(arg: bool) -> Iterator[None]: DEBUG_MODE = original -@overload -def validate_jsonschema( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = ..., - *, - raise_error: Literal[True] = ..., -) -> Never: ... 
-@overload -def validate_jsonschema( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = ..., - *, - raise_error: Literal[False], -) -> ValidationError | None: ... -def validate_jsonschema( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, - *, - raise_error: bool = True, -) -> ValidationError | None: - """ - Validates the passed in spec against the schema in the context of the rootschema. - - If any errors are found, they are deduplicated and prioritized - and only the most relevant errors are kept. Errors are then either raised - or returned, depending on the value of `raise_error`. - """ - it_errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) - if first_error := next(it_errors, None): - errors = [first_error, *it_errors] - leaf_errors = _get_leaves_of_error_tree(errors) - grouped_errors = _group_errors_by_json_path(leaf_errors) - grouped_errors = _subset_to_most_specific_json_paths(grouped_errors) - grouped_errors = _deduplicate_errors(grouped_errors) - - # Nothing special about this first error but we need to choose one - # which can be raised - main_error: Any = next(iter(grouped_errors.values()))[0] - # All errors are then attached as a new attribute to ValidationError so that - # they can be used in SchemaValidationError to craft a more helpful - # error message. Setting a new attribute like this is not ideal as - # it then no longer matches the type ValidationError. It would be better - # to refactor this function to never raise but only return errors. - main_error._errors = list(grouped_errors.values()) - if raise_error: - raise main_error - else: - return main_error - else: - return None - - def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: """ Continue an iterator at the last popped ``element``. 
@@ -217,20 +162,44 @@ def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: yield from others -def lazy_validate_json_schema( +def _regroup(errors: _Errs, /) -> _ErrsLazyGroup: + """ + Regroup error stream with the assumption they are already sorted. + + This holds **only after** all other stages. + """ + for _, grouped_it in groupby(errors, _json_path): + yield grouped_it + + +def validate_jsonschema( spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, ) -> None: - """Lazy equivalent of `validate_jsonschema`.""" - it_errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) + """ + Lazy equivalent of `validate_jsonschema`. + + Validates the passed in spec against the schema in the context of the rootschema. + + If any errors are found, they are deduplicated and prioritized + and only the most relevant errors are kept. + + Nothing special about this first error but we need to choose one + which can be raised + All errors are then attached as a new attribute to ValidationError so that + they can be used in SchemaValidationError to craft a more helpful + error message. Setting a new attribute like this is not ideal as + it then no longer matches the type ValidationError. 
+ """ + it_errors = _iter_errors_from_spec(spec, schema, rootschema=rootschema) if first_error := next(it_errors, None): - groups = _lazy_group_tree_leaves(_rechain(first_error, it_errors)) - most_specific = _lazy_subset_to_most_specific_json_paths(groups) - deduplicated = _lazy_deduplicate_errors(most_specific) + groups = _group_tree_leaves(_rechain(first_error, it_errors)) + most_specific = _prune_subset_paths(groups) + deduplicated = _deduplicate_errors(most_specific) dummy_error: Any if dummy_error := next(deduplicated, None): - dummy_error._errors = _rechain(dummy_error, deduplicated) # type: ignore[attr-defined] + dummy_error._errors = _regroup(_rechain(dummy_error, deduplicated)) # type: ignore[attr-defined] raise dummy_error else: msg = ( @@ -240,11 +209,27 @@ def lazy_validate_json_schema( raise NotImplementedError(msg) -def _get_errors_from_spec( +def validate_jsonschema_fail_fast( + spec: _JsonParameter, + schema: dict[str, Any], + rootschema: dict[str, Any] | None = None, +) -> None: + """ + Raise as quickly as possible. + + Use when any information about the error is not needed. + """ + if ( + err := next(_iter_errors_from_spec(spec, schema, rootschema=rootschema), None) + ) is not None: + raise err + + +def _iter_errors_from_spec( spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, -) -> Iterator[ValidationError]: +) -> _ErrsLazy: """ Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. @@ -324,82 +309,57 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v -# We do not annotate the return value here as the referencing library is not always -# available and this function is only executed in those cases. 
def _get_referencing_registry( rootschema: dict[str, Any], json_schema_draft_url: str | None = None ) -> Registry: - # Referencing is a dependency of newer jsonschema versions, starting with the - # version that is specified in _use_referencing_library and we therefore - # can expect that it is installed if the function returns True. - # We ignore 'import' mypy errors which happen when the referencing library - # is not installed. That's ok as in these cases this function is not called. - # We also have to ignore 'unused-ignore' errors as mypy raises those in case - # referencing is installed. - import referencing # type: ignore[import,unused-ignore] - import referencing.jsonschema # type: ignore[import,unused-ignore] - - if json_schema_draft_url is None: - json_schema_draft_url = _get_json_schema_draft_url(rootschema) - - specification = referencing.jsonschema.specification_with(json_schema_draft_url) - resource = specification.create_resource(rootschema) - return referencing.Registry().with_resource( - uri=_VEGA_LITE_ROOT_URI, resource=resource - ) + """ + Referencing is a dependency of newer jsonschema versions. + See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 -def _group_errors_by_json_path( - errors: Iterable[ValidationError], -) -> GroupedValidationErrors: - """ - Groups errors by the `json_path` attribute of the jsonschema ValidationError class. + We ignore 'import' ``mypy`` errors which happen when the ``referencing`` library + is not installed. + That's ok as in these cases this function is not called. - This attribute contains the path to the offending element within - a chart specification and can therefore be considered as an identifier of an - 'issue' in the chart that needs to be fixed. + We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case + ``referencing`` is installed. 
""" - errors_by_json_path = defaultdict(list) - for err in errors: - errors_by_json_path[_json_path(err)].append(err) - return dict(errors_by_json_path) + from referencing import Registry # type: ignore[import,unused-ignore] # noqa: I001 + from referencing.jsonschema import specification_with # type: ignore[import,unused-ignore] + + dialect_id = json_schema_draft_url or _get_json_schema_draft_url(rootschema) + specification = specification_with(dialect_id) + resource = specification.create_resource(rootschema) + return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) -def _get_leaves_of_error_tree( - errors: ValidationErrorList, -) -> ValidationErrorList: +def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: """ - For each error in `errors`, it traverses down the "error tree" that is generated by the jsonschema library to find and return all "leaf" errors. + Combines 3 previously distinct steps: + + 1. ``_get_leaves_of_error_tree`` These are errors which have no further errors that caused it and so they are the most specific errors with the most specific error messages. - """ - leaves: ValidationErrorList = [] - for err in errors: - if err.context: - # This means that the error `err` was caused by errors in subschemas. - # The list of errors from the subschemas are available in the property - # `context`. - leaves.extend(_get_leaves_of_error_tree(err.context)) - else: - leaves.append(err) - return leaves + 2. ``_group_errors_by_json_path`` (part of) -def _lazy_group_tree_leaves( - errors: Iterable[ValidationError], / -) -> Iterator[tuple[str, ValidationError]]: - """ - Combines 3 previously distinct steps: + Extracts the path for grouping. - - ``_get_leaves_of_error_tree`` - - (part of) ``_group_errors_by_json_path`` - - Doesnt actually group yet, can by calling `dict(result)`. - - ``_is_required_value_error`` + 3. 
Removes:: + + ValidationError: "'value' is a required property" + + as these errors are unlikely to be the relevant ones for the user. + They come from validation against a schema definition where the output of `alt.value` + would be valid. + However, if a user uses `alt.value`, the `value` keyword is included automatically + from that function and so it's unlikely that this was what the user intended + if the keyword is not present in the first place. """ # noqa: D400 for err in errors: if err_context := err.context: - yield from _lazy_group_tree_leaves(err_context) + yield from _group_tree_leaves(err_context) elif err.validator == "required" and err.validator_value == ["value"]: continue else: @@ -412,19 +372,28 @@ def _lazy_group_tree_leaves( """Key function for ``ValidationError.validator``.""" -def _lazy_subset_to_most_specific_json_paths( - json_path_errors: Iterator[tuple[str, ValidationError]], / -) -> Iterator[Iterable[ValidationError]]: +def _message_len(err: ValidationError, /) -> int: + """Return length of a ``ValidationError`` message.""" + return len(err.message) + + +def _prune_subset_paths(json_path_errors: _IntoLazyGroup, /) -> Iterator[_Errs]: """ + Removes key (json path), value (errors) pairs where the json path is fully contained in another json path. + + For example if `errors_by_json_path` has two keys, `$.encoding.X` and `$.encoding.X.tooltip`, + then the first one will be removed and only the second one is returned. + + This is done under the assumption that more specific json paths give more helpful error messages to the user. + Currently using a `list`, but typing it more restrictive to see if it can be avoided. 
- Needs to be sorted to work with groupby - Reversing allows prioritising more specific groups, since they are seen first - Then re-reversed, to keep seen order - """ rev_sort = sorted(json_path_errors, key=_fn_path, reverse=True) - keeping: dict[str, Iterable[ValidationError]] = {} + keeping: dict[str, _Errs] = {} for unique_path, grouped_errors in groupby(rev_sort, key=_fn_path): if any(seen.startswith(unique_path) for seen in keeping): continue @@ -433,23 +402,35 @@ def _lazy_subset_to_most_specific_json_paths( yield from islice(reversed(keeping.values()), 3) -def _lazy_deduplicate_errors( - grouped_errors: Iterator[Iterable[ValidationError]], / -) -> Iterator[ValidationError]: - for element_errors in grouped_errors: - for validator, errors in groupby( - sorted(element_errors, key=_fn_validator), key=_fn_validator - ): +def _groupby_validator(errors: _Errs, /) -> Iterator[tuple[str, _ErrsLazy]]: + """ + Groups the errors by the json schema "validator" that casued the error. + + For example if the error is that a value is not one of an enumeration in the json schema + then the "validator" is `"enum"`, if the error is due to an unknown property that + was set although no additional properties are allowed then "validator" is + `"additionalProperties`, etc. + """ + yield from groupby(sorted(errors, key=_fn_validator), key=_fn_validator) + + +def _deduplicate_errors(grouped_errors: Iterator[_Errs], /) -> _ErrsLazy: + """ + Some errors have very similar error messages or are just in general not helpful for a user. + + This function removes as many of these cases as possible and + can be extended over time to handle new cases that come up. 
+ """ + for by_path in grouped_errors: + for validator, errors in _groupby_validator(by_path): if validator == "additionalProperties": - errors = _lazy_additional_properties(errors) + errors = _shortest_any_of(errors) elif validator == "enum": - errors = _lazy_deduplicate_enum(errors) - yield from _lazy_unique_message(errors) + errors = _prune_subset_enum(errors) + yield from _distinct_messages(errors) -def _lazy_unique_message( - iterable: Iterable[ValidationError], / -) -> Iterator[ValidationError]: +def _distinct_messages(iterable: _Errs, /) -> _ErrsLazy: seen = set() for el in iterable: if el.message not in seen: @@ -457,9 +438,19 @@ def _lazy_unique_message( yield el -def _lazy_additional_properties( - iterable: Iterable[ValidationError], / -) -> Iterator[ValidationError]: +def _shortest_any_of(iterable: _Errs, /) -> _ErrsLazy: + """ + If there are multiple additional property errors it usually means that the offending element was validated against multiple schemas and its parent is a common anyOf validator. + + The error messages produced from these cases are usually + very similar and we just take the shortest one. + For example the following 3 errors are raised for:: + + alt.X("variety", unknown=2) + - "Additional properties are not allowed ('unknown' was unexpected)" + - "Additional properties are not allowed ('field', 'unknown' were unexpected)" + - "Additional properties are not allowed ('field', 'type', 'unknown' were unexpected)". + """ it = iter(iterable) first = next(it) if ( @@ -470,14 +461,7 @@ def _lazy_additional_properties( yield first -def _message_len(err: ValidationError, /) -> int: - """Return length of a ``ValidationError`` message.""" - return len(err.message) - - -def _lazy_deduplicate_enum( - iterable: Iterable[ValidationError], / -) -> Iterator[ValidationError]: +def _prune_subset_enum(iterable: _Errs, /) -> _ErrsLazy: """Skip any``"enum"`` errors that are a subset of another error.""" enums: tuple[set[str], ...] 
errors: tuple[ValidationError, ...] @@ -487,157 +471,14 @@ def _lazy_deduplicate_enum( yield err -def _subset_to_most_specific_json_paths( - errors_by_json_path: GroupedValidationErrors, -) -> GroupedValidationErrors: - """ - Removes key (json path), value (errors) pairs where the json path is fully contained in another json path. - - For example if `errors_by_json_path` has two keys, `$.encoding.X` and `$.encoding.X.tooltip`, - then the first one will be removed and only the second one is returned. - - This is done under the assumption that more specific json paths give more helpful error messages to the user. - """ - errors_by_json_path_specific: GroupedValidationErrors = {} - for json_path, errors in errors_by_json_path.items(): - if not _contained_at_start_of_one_of_other_values( - json_path, list(errors_by_json_path.keys()) - ): - errors_by_json_path_specific[json_path] = errors - return errors_by_json_path_specific - - -def _contained_at_start_of_one_of_other_values(x: str, values: Sequence[str]) -> bool: - # Does not count as "contained at start of other value" if the values are - # the same. These cases should be handled separately - return any(value.startswith(x) for value in values if x != value) - - -def _deduplicate_errors( - grouped_errors: GroupedValidationErrors, -) -> GroupedValidationErrors: - """ - Some errors have very similar error messages or are just in general not helpful for a user. - - This function removes as many of these cases as possible and - can be extended over time to handle new cases that come up. 
- """ - grouped_errors_deduplicated: GroupedValidationErrors = {} - for json_path, element_errors in grouped_errors.items(): - errors_by_validator = _group_errors_by_validator(element_errors) - - deduplication_functions = { - "enum": _deduplicate_enum_errors, - "additionalProperties": _deduplicate_additional_properties_errors, - } - deduplicated_errors: ValidationErrorList = [] - for validator, errors in errors_by_validator.items(): - deduplication_func = deduplication_functions.get(validator) - if deduplication_func is not None: - errors = deduplication_func(errors) - deduplicated_errors.extend(_deduplicate_by_message(errors)) - - # Removes any ValidationError "'value' is a required property" as these - # errors are unlikely to be the relevant ones for the user. They come from - # validation against a schema definition where the output of `alt.value` - # would be valid. However, if a user uses `alt.value`, the `value` keyword - # is included automatically from that function and so it's unlikely - # that this was what the user intended if the keyword is not present - # in the first place. - deduplicated_errors = [ - err for err in deduplicated_errors if not _is_required_value_error(err) - ] - - grouped_errors_deduplicated[json_path] = deduplicated_errors - return grouped_errors_deduplicated - - -def _is_required_value_error(err: ValidationError) -> bool: - return err.validator == "required" and err.validator_value == ["value"] - - -def _group_errors_by_validator(errors: ValidationErrorList) -> GroupedValidationErrors: - """ - Groups the errors by the json schema "validator" that casued the error. - - For example if the error is that a value is not one of an enumeration in the json schema - then the "validator" is `"enum"`, if the error is due to an unknown property that - was set although no additional properties are allowed then "validator" is - `"additionalProperties`, etc. 
- """ - errors_by_validator: defaultdict[str, ValidationErrorList] = defaultdict(list) - for err in errors: - # Ignore mypy error as err.validator as it wrongly sees err.validator - # as of type Optional[Validator] instead of str which it is according - # to the documentation and all tested cases - errors_by_validator[err.validator].append(err) # type: ignore[index] - return dict(errors_by_validator) - - -def _deduplicate_enum_errors(errors: ValidationErrorList) -> ValidationErrorList: - """ - Deduplicate enum errors by removing the errors where the allowed values are a subset of another error. - - For example, if `enum` contains two errors and one has `validator_value` (i.e. accepted values) ["A", "B"] and the - other one ["A", "B", "C"] then the first one is removed and the final - `enum` list only contains the error with ["A", "B", "C"]. - """ - if len(errors) > 1: - # Values (and therefore `validator_value`) of an enum are always arrays, - # see https://json-schema.org/understanding-json-schema/reference/generic.html#enumerated-values - # which is why we can use join below - value_strings = [",".join(err.validator_value) for err in errors] # type: ignore - longest_enums: ValidationErrorList = [] - for value_str, err in zip(value_strings, errors): - if not _contained_at_start_of_one_of_other_values(value_str, value_strings): - longest_enums.append(err) - errors = longest_enums - return errors - - -def _deduplicate_additional_properties_errors( - errors: ValidationErrorList, -) -> ValidationErrorList: - """ - If there are multiple additional property errors it usually means that the offending element was validated against multiple schemas and its parent is a common anyOf validator. - - The error messages produced from these cases are usually - very similar and we just take the shortest one. 
For example, - the following 3 errors are raised for the `unknown` channel option in - `alt.X("variety", unknown=2)`: - - "Additional properties are not allowed ('unknown' was unexpected)" - - "Additional properties are not allowed ('field', 'unknown' were unexpected)" - - "Additional properties are not allowed ('field', 'type', 'unknown' were unexpected)". - """ - if len(errors) > 1: - # Test if all parent errors are the same anyOf error and only do - # the prioritization in these cases. Can't think of a chart spec where this - # would not be the case but still allow for it below to not break anything. - parent = errors[0].parent - if ( - parent is not None - and parent.validator == "anyOf" - # Use [1:] as don't have to check for first error as it was used - # above to define `parent` - and all(err.parent is parent for err in errors[1:]) - ): - errors = [min(errors, key=lambda x: len(x.message))] - return errors - - -def _deduplicate_by_message(errors: ValidationErrorList) -> ValidationErrorList: - """Deduplicate errors by message. 
This keeps the original order in case it was chosen intentionally.""" - return list({e.message: e for e in errors}.values()) - - def _subclasses(cls: type[Any]) -> Iterator[type[Any]]: """Breadth-first sequence of all classes which inherit from cls.""" seen = set() - current_set = {cls} - while current_set: - seen |= current_set - current_set = set.union(*(set(cls.__subclasses__()) for cls in current_set)) - for cls in current_set - seen: + current: set[type[Any]] = {cls} + while current: + seen |= current + current = set(chain.from_iterable(cls.__subclasses__() for cls in current)) + for cls in current - seen: yield cls @@ -716,7 +557,7 @@ def __init__(self, obj: SchemaBase, err: ValidationError) -> None: super().__init__(**err._contents()) self.obj = obj err = cast("SchemaValidationError", err) - self._errors: Iterable[ValidationError] = err._errors + self._errors: _ErrsLazyGroup = err._errors # This is the message from err self._original_message = self.message self.message = self._get_message() @@ -724,49 +565,47 @@ def __init__(self, obj: SchemaBase, err: ValidationError) -> None: def __str__(self) -> str: return self.message + @staticmethod + def indent_from_second_line(msg: str, /, indent: int = 4) -> str: + return "\n".join( + " " * indent + s if idx > 0 and s else s + for idx, s in enumerate(msg.split("\n")) + ) + def _get_message(self) -> str: - def indent_second_line_onwards(message: str, indent: int = 4) -> str: - modified_lines: list[str] = [] - for idx, line in enumerate(message.split("\n")): - if idx > 0 and len(line) > 0: - line = " " * indent + line - modified_lines.append(line) - return "\n".join(modified_lines) - - error_messages: list[str] = [] - # Only show a maximum of 3 errors as else the final message returned by this - # method could get very long. 
- # ^^^^^^^^^^ - # CORRECTION: Only show 3 **json_paths** - - for errors in _group_errors_by_json_path(self._errors).values(): - error_messages.append(self._get_message_for_errors_group(errors)) - - message = "" - if len(error_messages) > 1: - error_messages = [ - indent_second_line_onwards(f"Error {error_id}: {m}") + it = self._errors + group_1 = list(next(it)) + if (group_2 := next(it, None)) is not None: + error_messages = [] + for group in group_1, list(group_2), next(it, None): + if group is not None: + error_messages.append(self._get_message_for_errors_group(group)) + message = "\n\n".join( + self.indent_from_second_line(f"Error {error_id}: {m}") for error_id, m in enumerate(error_messages, start=1) - ] - message += "Multiple errors were found.\n\n" - message += "\n\n".join(error_messages) - return message + ) + return f"Multiple errors were found.\n\n{message}" + else: + return self._get_message_for_errors_group(group_1) - def _get_message_for_errors_group( - self, - errors: ValidationErrorList, - ) -> str: + def _get_message_for_errors_group(self, errors: _Errs) -> str: + """ + Note. + + During development, we only found cases where an additionalProperties + error was raised if that was the only error for the offending instance + as identifiable by the json path. + + Therefore, we just check here the first error. + However, other constellations might exist in which case this should be adapted + so that other error messages are shown as well. + """ + if not isinstance(errors, Sequence): + errors = list(errors) if errors[0].validator == "additionalProperties": - # During development, we only found cases where an additionalProperties - # error was raised if that was the only error for the offending instance - # as identifiable by the json path. Therefore, we just check here the first - # error. However, other constellations might exist in which case - # this should be adapted so that other error messages are shown as well. 
- message = self._get_additional_properties_error_message(errors[0]) + return self._get_additional_properties_error_message(errors[0]) else: - message = self._get_default_error_message(errors=errors) - - return message.strip() + return self._get_default_error_message(errors=errors) def _get_additional_properties_error_message( self, @@ -781,13 +620,12 @@ def _get_additional_properties_error_message( # "Additional properties are not allowed ('unknown' was unexpected)" # Line below extracts "unknown" from this string parameter_name = error.message.split("('")[-1].split("'")[0] - message = f"""\ -`{altair_cls.__name__}` has no parameter named '{parameter_name}' - -Existing parameter names are: -{param_names_table} -See the help for `{altair_cls.__name__}` to read the full description of these parameters""" - return message + cls_name = altair_cls.__name__ + return ( + f"`{cls_name}` has no parameter named '{parameter_name}'\n\n" + f"Existing parameter names are:\n{param_names_table}\n" + f"See the help for `{cls_name}` to read the full description of these parameters" + ) def _get_altair_class_for_error(self, error: ValidationError) -> type[SchemaBase]: """ @@ -795,6 +633,8 @@ def _get_altair_class_for_error(self, error: ValidationError) -> type[SchemaBase This should lead to more informative error messages pointing the user closer to the source of the issue. """ + from altair import vegalite + for prop_name in reversed(error.absolute_path): # Check if str as e.g. 
first item can be a 0 if isinstance(prop_name, str): @@ -806,24 +646,17 @@ def _get_altair_class_for_error(self, error: ValidationError) -> type[SchemaBase # Did not find a suitable class based on traversing the path so we fall # back on the class of the top-level object which created # the SchemaValidationError - cls = self.obj.__class__ + cls = type(self.obj) return cls @staticmethod - def _format_params_as_table(param_dict_keys: Iterable[str]) -> str: + def _format_params_as_table(param_view: KeysView[str]) -> str: """Format param names into a table so that they are easier to read.""" - param_names: tuple[str, ...] - name_lengths: tuple[int, ...] - param_names, name_lengths = zip( - *[ - (name, len(name)) - for name in param_dict_keys - if name not in {"kwds", "self"} - ] - ) + param_names: list[str] = [nm for nm in param_view if nm not in {"kwds", "self"}] + # Worst case scenario with the same longest param name in the same # row for all columns - max_name_length = max(name_lengths) + max_name_length = len(max(param_view, key=len)) max_column_width = 80 # Output a square table if not too big (since it is easier to read) num_param_names = len(param_names) @@ -837,7 +670,7 @@ def split_into_equal_parts(n: int, p: int) -> list[int]: column_heights = split_into_equal_parts(num_param_names, columns) # Section the param names into columns and compute their widths - param_names_columns: list[tuple[str, ...]] = [] + param_names_columns: list[Sequence[str]] = [] column_max_widths: list[int] = [] last_end_idx: int = 0 for ch in column_heights: @@ -848,30 +681,29 @@ def split_into_equal_parts(n: int, p: int) -> list[int]: last_end_idx = ch + last_end_idx # Transpose the param name columns into rows to facilitate looping - param_names_rows: list[tuple[str, ...]] = [] - for li in zip_longest(*param_names_columns, fillvalue=""): - param_names_rows.append(li) # Build the table as a string by iterating over and formatting the rows param_names_table: str = "" - for 
param_names_row in param_names_rows: + column_pad = 3 + for param_names_row in zip_longest(*param_names_columns, fillvalue=""): + last_element = len(param_names_row) - 1 for num, param_name in enumerate(param_names_row): # Set column width based on the longest param in the column - max_name_length_column = column_max_widths[num] - column_pad = 3 - param_names_table += "{:<{}}".format( - param_name, max_name_length_column + column_pad - ) + width = column_pad + column_max_widths[num] + param_names_table += "{:<{}}".format(param_name, width) # Insert newlines and spacing after the last element in each row - if num == (len(param_names_row) - 1): + if num == last_element: param_names_table += "\n" return param_names_table def _get_default_error_message( self, - errors: ValidationErrorList, + errors: Sequence[ValidationError], ) -> str: bullet_points: list[str] = [] - errors_by_validator = _group_errors_by_validator(errors) + errors_by_validator: defaultdict[str, list[ValidationError]] = defaultdict(list) + for err in errors: + errors_by_validator[err.validator].append(err) # type: ignore[index] + if "enum" in errors_by_validator: for error in errors_by_validator["enum"]: bullet_points.append(f"one of {error.validator_value}") @@ -919,7 +751,7 @@ def _get_default_error_message( if validator not in {"enum", "type"} ) message += "".join(it) - return message + return message.strip() class UndefinedType: @@ -1215,7 +1047,7 @@ def to_dict( if validate: try: self.validate(result) - except jsonschema.ValidationError as err: + except ValidationError as err: # We do not raise `from err` as else the resulting # traceback is very long as it contains part # of the Vega-Lite schema. 
It would also first @@ -1340,12 +1172,8 @@ def validate( cls, instance: dict[str, Any], schema: dict[str, Any] | None = None ) -> None: """Validate the instance against the class schema in the context of the rootschema.""" - if schema is None: - schema = cls._schema - # For the benefit of mypy - assert schema is not None - lazy_validate_json_schema( - instance, schema, rootschema=cls._rootschema or cls._schema + validate_jsonschema( + instance, schema or cls._schema, cls._rootschema or cls._schema ) @classmethod @@ -1372,7 +1200,7 @@ def validate_property( np_opt = sys.modules.get("numpy") value = _todict(value, context={}, np_opt=np_opt, pd_opt=pd_opt) props = cls.resolve_references(schema or cls._schema).get("properties", {}) - lazy_validate_json_schema( + validate_jsonschema( value, props.get(name, {}), rootschema=cls._rootschema or cls._schema ) @@ -1408,6 +1236,17 @@ def _passthrough(*args: Any, **kwds: Any) -> Any | dict[str, Any]: return args[0] if args else kwds +def _freeze(val): + if isinstance(val, dict): + return frozenset((k, _freeze(v)) for k, v in val.items()) + elif isinstance(val, set): + return frozenset(_freeze(v) for v in val) + elif isinstance(val, (list, tuple)): + return tuple(_freeze(v) for v in val) + else: + return val + + class _FromDict: """ Class used to construct SchemaBase class hierarchies from a dict. 
@@ -1445,22 +1284,8 @@ def hash_schema(cls, schema: dict[str, Any], use_json: bool = True) -> int: for key, val in schema.items() if key not in cls._hash_exclude_keys } - if use_json: - s = json.dumps(schema, sort_keys=True) - return hash(s) - else: - - def _freeze(val): - if isinstance(val, dict): - return frozenset((k, _freeze(v)) for k, v in val.items()) - elif isinstance(val, set): - return frozenset(map(_freeze, val)) - elif isinstance(val, (list, tuple)): - return tuple(map(_freeze, val)) - else: - return val - - return hash(_freeze(schema)) + s: Any = json.dumps(schema, sort_keys=True) if use_json else _freeze(schema) + return hash(s) @overload def from_dict( @@ -1543,8 +1368,8 @@ def from_dict( schemas = resolved.get("anyOf", []) + resolved.get("oneOf", []) for possible in schemas: try: - lazy_validate_json_schema(dct, possible, rootschema=root_schema) - except jsonschema.ValidationError: + validate_jsonschema_fail_fast(dct, possible, rootschema=root_schema) + except ValidationError: continue else: return from_dict(dct, schema=possible, default_class=target_tp) @@ -1571,6 +1396,8 @@ def __init__(self, prop: str, schema: dict[str, Any]) -> None: self.schema = schema def __get__(self, obj, cls): + from altair import vegalite + self.obj = obj self.cls = cls # The docs from the encoding class parameter (e.g. `bin` in X, Color, diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index b4995c070..57bf63b23 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -19,15 +19,14 @@ Final, Iterable, Iterator, + KeysView, List, - Literal, Sequence, TypeVar, Union, cast, overload, ) -from typing_extensions import TypeAlias import jsonschema import jsonschema.validators @@ -35,11 +34,6 @@ from jsonschema import ValidationError from packaging.version import Version -# This leads to circular imports with the vegalite module. 
Currently, this works -# but be aware that when you access it in this script, the vegalite module might -# not yet be fully instantiated in case your code is being executed during import time -from altair import vegalite - if TYPE_CHECKING: from typing import ClassVar @@ -57,9 +51,15 @@ from typing import Never, Self else: from typing_extensions import Never, Self + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + _Errs: TypeAlias = Iterable[ValidationError] + _ErrsLazy: TypeAlias = Iterator[ValidationError] + _ErrsLazyGroup: TypeAlias = Iterator[_ErrsLazy] + _IntoLazyGroup: TypeAlias = Iterator["tuple[str, ValidationError]"] -ValidationErrorList: TypeAlias = List[jsonschema.ValidationError] -GroupedValidationErrors: TypeAlias = Dict[str, ValidationErrorList] _VEGA_LITE_ROOT_URI: Final = "urn:vega-lite-schema" """ @@ -144,61 +144,6 @@ def debug_mode(arg: bool) -> Iterator[None]: DEBUG_MODE = original -@overload -def validate_jsonschema( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = ..., - *, - raise_error: Literal[True] = ..., -) -> Never: ... -@overload -def validate_jsonschema( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = ..., - *, - raise_error: Literal[False], -) -> ValidationError | None: ... -def validate_jsonschema( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, - *, - raise_error: bool = True, -) -> ValidationError | None: - """ - Validates the passed in spec against the schema in the context of the rootschema. - - If any errors are found, they are deduplicated and prioritized - and only the most relevant errors are kept. Errors are then either raised - or returned, depending on the value of `raise_error`. 
- """ - it_errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) - if first_error := next(it_errors, None): - errors = [first_error, *it_errors] - leaf_errors = _get_leaves_of_error_tree(errors) - grouped_errors = _group_errors_by_json_path(leaf_errors) - grouped_errors = _subset_to_most_specific_json_paths(grouped_errors) - grouped_errors = _deduplicate_errors(grouped_errors) - - # Nothing special about this first error but we need to choose one - # which can be raised - main_error: Any = next(iter(grouped_errors.values()))[0] - # All errors are then attached as a new attribute to ValidationError so that - # they can be used in SchemaValidationError to craft a more helpful - # error message. Setting a new attribute like this is not ideal as - # it then no longer matches the type ValidationError. It would be better - # to refactor this function to never raise but only return errors. - main_error._errors = list(grouped_errors.values()) - if raise_error: - raise main_error - else: - return main_error - else: - return None - - def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: """ Continue an iterator at the last popped ``element``. @@ -215,20 +160,44 @@ def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: yield from others -def lazy_validate_json_schema( +def _regroup(errors: _Errs, /) -> _ErrsLazyGroup: + """ + Regroup error stream with the assumption they are already sorted. + + This holds **only after** all other stages. + """ + for _, grouped_it in groupby(errors, _json_path): + yield grouped_it + + +def validate_jsonschema( spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, ) -> None: - """Lazy equivalent of `validate_jsonschema`.""" - it_errors = _get_errors_from_spec(spec, schema, rootschema=rootschema) + """ + Lazy equivalent of `validate_jsonschema`. + + Validates the passed in spec against the schema in the context of the rootschema. 
+ + If any errors are found, they are deduplicated and prioritized + and only the most relevant errors are kept. + + Nothing special about this first error but we need to choose one + which can be raised + All errors are then attached as a new attribute to ValidationError so that + they can be used in SchemaValidationError to craft a more helpful + error message. Setting a new attribute like this is not ideal as + it then no longer matches the type ValidationError. + """ + it_errors = _iter_errors_from_spec(spec, schema, rootschema=rootschema) if first_error := next(it_errors, None): - groups = _lazy_group_tree_leaves(_rechain(first_error, it_errors)) - most_specific = _lazy_subset_to_most_specific_json_paths(groups) - deduplicated = _lazy_deduplicate_errors(most_specific) + groups = _group_tree_leaves(_rechain(first_error, it_errors)) + most_specific = _prune_subset_paths(groups) + deduplicated = _deduplicate_errors(most_specific) dummy_error: Any if dummy_error := next(deduplicated, None): - dummy_error._errors = _rechain(dummy_error, deduplicated) # type: ignore[attr-defined] + dummy_error._errors = _regroup(_rechain(dummy_error, deduplicated)) # type: ignore[attr-defined] raise dummy_error else: msg = ( @@ -238,11 +207,27 @@ def lazy_validate_json_schema( raise NotImplementedError(msg) -def _get_errors_from_spec( +def validate_jsonschema_fail_fast( + spec: _JsonParameter, + schema: dict[str, Any], + rootschema: dict[str, Any] | None = None, +) -> None: + """ + Raise as quickly as possible. + + Use when any information about the error is not needed. 
+ """ + if ( + err := next(_iter_errors_from_spec(spec, schema, rootschema=rootschema), None) + ) is not None: + raise err + + +def _iter_errors_from_spec( spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, -) -> Iterator[ValidationError]: +) -> _ErrsLazy: """ Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. @@ -322,82 +307,57 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v -# We do not annotate the return value here as the referencing library is not always -# available and this function is only executed in those cases. def _get_referencing_registry( rootschema: dict[str, Any], json_schema_draft_url: str | None = None ) -> Registry: - # Referencing is a dependency of newer jsonschema versions, starting with the - # version that is specified in _use_referencing_library and we therefore - # can expect that it is installed if the function returns True. - # We ignore 'import' mypy errors which happen when the referencing library - # is not installed. That's ok as in these cases this function is not called. - # We also have to ignore 'unused-ignore' errors as mypy raises those in case - # referencing is installed. - import referencing # type: ignore[import,unused-ignore] - import referencing.jsonschema # type: ignore[import,unused-ignore] - - if json_schema_draft_url is None: - json_schema_draft_url = _get_json_schema_draft_url(rootschema) - - specification = referencing.jsonschema.specification_with(json_schema_draft_url) - resource = specification.create_resource(rootschema) - return referencing.Registry().with_resource( - uri=_VEGA_LITE_ROOT_URI, resource=resource - ) + """ + Referencing is a dependency of newer jsonschema versions. 
+ See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 -def _group_errors_by_json_path( - errors: Iterable[ValidationError], -) -> GroupedValidationErrors: - """ - Groups errors by the `json_path` attribute of the jsonschema ValidationError class. + We ignore 'import' ``mypy`` errors which happen when the ``referencing`` library + is not installed. + That's ok as in these cases this function is not called. - This attribute contains the path to the offending element within - a chart specification and can therefore be considered as an identifier of an - 'issue' in the chart that needs to be fixed. + We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case + ``referencing`` is installed. """ - errors_by_json_path = defaultdict(list) - for err in errors: - errors_by_json_path[_json_path(err)].append(err) - return dict(errors_by_json_path) + from referencing import Registry # type: ignore[import,unused-ignore] # noqa: I001 + from referencing.jsonschema import specification_with # type: ignore[import,unused-ignore] + + dialect_id = json_schema_draft_url or _get_json_schema_draft_url(rootschema) + specification = specification_with(dialect_id) + resource = specification.create_resource(rootschema) + return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) -def _get_leaves_of_error_tree( - errors: ValidationErrorList, -) -> ValidationErrorList: +def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: """ - For each error in `errors`, it traverses down the "error tree" that is generated by the jsonschema library to find and return all "leaf" errors. + Combines 3 previously distinct steps: + + 1. ``_get_leaves_of_error_tree`` These are errors which have no further errors that caused it and so they are the most specific errors with the most specific error messages. 
- """ - leaves: ValidationErrorList = [] - for err in errors: - if err.context: - # This means that the error `err` was caused by errors in subschemas. - # The list of errors from the subschemas are available in the property - # `context`. - leaves.extend(_get_leaves_of_error_tree(err.context)) - else: - leaves.append(err) - return leaves + 2. ``_group_errors_by_json_path`` (part of) -def _lazy_group_tree_leaves( - errors: Iterable[ValidationError], / -) -> Iterator[tuple[str, ValidationError]]: - """ - Combines 3 previously distinct steps: + Extracts the path for grouping. - - ``_get_leaves_of_error_tree`` - - (part of) ``_group_errors_by_json_path`` - - Doesnt actually group yet, can by calling `dict(result)`. - - ``_is_required_value_error`` + 3. Removes:: + + ValidationError: "'value' is a required property" + + as these errors are unlikely to be the relevant ones for the user. + They come from validation against a schema definition where the output of `alt.value` + would be valid. + However, if a user uses `alt.value`, the `value` keyword is included automatically + from that function and so it's unlikely that this was what the user intended + if the keyword is not present in the first place. 
""" # noqa: D400 for err in errors: if err_context := err.context: - yield from _lazy_group_tree_leaves(err_context) + yield from _group_tree_leaves(err_context) elif err.validator == "required" and err.validator_value == ["value"]: continue else: @@ -410,19 +370,28 @@ def _lazy_group_tree_leaves( """Key function for ``ValidationError.validator``.""" -def _lazy_subset_to_most_specific_json_paths( - json_path_errors: Iterator[tuple[str, ValidationError]], / -) -> Iterator[Iterable[ValidationError]]: +def _message_len(err: ValidationError, /) -> int: + """Return length of a ``ValidationError`` message.""" + return len(err.message) + + +def _prune_subset_paths(json_path_errors: _IntoLazyGroup, /) -> Iterator[_Errs]: """ + Removes key (json path), value (errors) pairs where the json path is fully contained in another json path. + + For example if `errors_by_json_path` has two keys, `$.encoding.X` and `$.encoding.X.tooltip`, + then the first one will be removed and only the second one is returned. + + This is done under the assumption that more specific json paths give more helpful error messages to the user. + Currently using a `list`, but typing it more restrictive to see if it can be avoided. 
- Needs to be sorted to work with groupby - Reversing allows prioritising more specific groups, since they are seen first - Then re-reversed, to keep seen order - """ rev_sort = sorted(json_path_errors, key=_fn_path, reverse=True) - keeping: dict[str, Iterable[ValidationError]] = {} + keeping: dict[str, _Errs] = {} for unique_path, grouped_errors in groupby(rev_sort, key=_fn_path): if any(seen.startswith(unique_path) for seen in keeping): continue @@ -431,23 +400,35 @@ def _lazy_subset_to_most_specific_json_paths( yield from islice(reversed(keeping.values()), 3) -def _lazy_deduplicate_errors( - grouped_errors: Iterator[Iterable[ValidationError]], / -) -> Iterator[ValidationError]: - for element_errors in grouped_errors: - for validator, errors in groupby( - sorted(element_errors, key=_fn_validator), key=_fn_validator - ): +def _groupby_validator(errors: _Errs, /) -> Iterator[tuple[str, _ErrsLazy]]: + """ + Groups the errors by the json schema "validator" that casued the error. + + For example if the error is that a value is not one of an enumeration in the json schema + then the "validator" is `"enum"`, if the error is due to an unknown property that + was set although no additional properties are allowed then "validator" is + `"additionalProperties`, etc. + """ + yield from groupby(sorted(errors, key=_fn_validator), key=_fn_validator) + + +def _deduplicate_errors(grouped_errors: Iterator[_Errs], /) -> _ErrsLazy: + """ + Some errors have very similar error messages or are just in general not helpful for a user. + + This function removes as many of these cases as possible and + can be extended over time to handle new cases that come up. 
+ """ + for by_path in grouped_errors: + for validator, errors in _groupby_validator(by_path): if validator == "additionalProperties": - errors = _lazy_additional_properties(errors) + errors = _shortest_any_of(errors) elif validator == "enum": - errors = _lazy_deduplicate_enum(errors) - yield from _lazy_unique_message(errors) + errors = _prune_subset_enum(errors) + yield from _distinct_messages(errors) -def _lazy_unique_message( - iterable: Iterable[ValidationError], / -) -> Iterator[ValidationError]: +def _distinct_messages(iterable: _Errs, /) -> _ErrsLazy: seen = set() for el in iterable: if el.message not in seen: @@ -455,9 +436,19 @@ def _lazy_unique_message( yield el -def _lazy_additional_properties( - iterable: Iterable[ValidationError], / -) -> Iterator[ValidationError]: +def _shortest_any_of(iterable: _Errs, /) -> _ErrsLazy: + """ + If there are multiple additional property errors it usually means that the offending element was validated against multiple schemas and its parent is a common anyOf validator. + + The error messages produced from these cases are usually + very similar and we just take the shortest one. + For example the following 3 errors are raised for:: + + alt.X("variety", unknown=2) + - "Additional properties are not allowed ('unknown' was unexpected)" + - "Additional properties are not allowed ('field', 'unknown' were unexpected)" + - "Additional properties are not allowed ('field', 'type', 'unknown' were unexpected)". + """ it = iter(iterable) first = next(it) if ( @@ -468,14 +459,7 @@ def _lazy_additional_properties( yield first -def _message_len(err: ValidationError, /) -> int: - """Return length of a ``ValidationError`` message.""" - return len(err.message) - - -def _lazy_deduplicate_enum( - iterable: Iterable[ValidationError], / -) -> Iterator[ValidationError]: +def _prune_subset_enum(iterable: _Errs, /) -> _ErrsLazy: """Skip any``"enum"`` errors that are a subset of another error.""" enums: tuple[set[str], ...] 
errors: tuple[ValidationError, ...] @@ -485,157 +469,14 @@ def _lazy_deduplicate_enum( yield err -def _subset_to_most_specific_json_paths( - errors_by_json_path: GroupedValidationErrors, -) -> GroupedValidationErrors: - """ - Removes key (json path), value (errors) pairs where the json path is fully contained in another json path. - - For example if `errors_by_json_path` has two keys, `$.encoding.X` and `$.encoding.X.tooltip`, - then the first one will be removed and only the second one is returned. - - This is done under the assumption that more specific json paths give more helpful error messages to the user. - """ - errors_by_json_path_specific: GroupedValidationErrors = {} - for json_path, errors in errors_by_json_path.items(): - if not _contained_at_start_of_one_of_other_values( - json_path, list(errors_by_json_path.keys()) - ): - errors_by_json_path_specific[json_path] = errors - return errors_by_json_path_specific - - -def _contained_at_start_of_one_of_other_values(x: str, values: Sequence[str]) -> bool: - # Does not count as "contained at start of other value" if the values are - # the same. These cases should be handled separately - return any(value.startswith(x) for value in values if x != value) - - -def _deduplicate_errors( - grouped_errors: GroupedValidationErrors, -) -> GroupedValidationErrors: - """ - Some errors have very similar error messages or are just in general not helpful for a user. - - This function removes as many of these cases as possible and - can be extended over time to handle new cases that come up. 
- """ - grouped_errors_deduplicated: GroupedValidationErrors = {} - for json_path, element_errors in grouped_errors.items(): - errors_by_validator = _group_errors_by_validator(element_errors) - - deduplication_functions = { - "enum": _deduplicate_enum_errors, - "additionalProperties": _deduplicate_additional_properties_errors, - } - deduplicated_errors: ValidationErrorList = [] - for validator, errors in errors_by_validator.items(): - deduplication_func = deduplication_functions.get(validator) - if deduplication_func is not None: - errors = deduplication_func(errors) - deduplicated_errors.extend(_deduplicate_by_message(errors)) - - # Removes any ValidationError "'value' is a required property" as these - # errors are unlikely to be the relevant ones for the user. They come from - # validation against a schema definition where the output of `alt.value` - # would be valid. However, if a user uses `alt.value`, the `value` keyword - # is included automatically from that function and so it's unlikely - # that this was what the user intended if the keyword is not present - # in the first place. - deduplicated_errors = [ - err for err in deduplicated_errors if not _is_required_value_error(err) - ] - - grouped_errors_deduplicated[json_path] = deduplicated_errors - return grouped_errors_deduplicated - - -def _is_required_value_error(err: ValidationError) -> bool: - return err.validator == "required" and err.validator_value == ["value"] - - -def _group_errors_by_validator(errors: ValidationErrorList) -> GroupedValidationErrors: - """ - Groups the errors by the json schema "validator" that casued the error. - - For example if the error is that a value is not one of an enumeration in the json schema - then the "validator" is `"enum"`, if the error is due to an unknown property that - was set although no additional properties are allowed then "validator" is - `"additionalProperties`, etc. 
- """ - errors_by_validator: defaultdict[str, ValidationErrorList] = defaultdict(list) - for err in errors: - # Ignore mypy error as err.validator as it wrongly sees err.validator - # as of type Optional[Validator] instead of str which it is according - # to the documentation and all tested cases - errors_by_validator[err.validator].append(err) # type: ignore[index] - return dict(errors_by_validator) - - -def _deduplicate_enum_errors(errors: ValidationErrorList) -> ValidationErrorList: - """ - Deduplicate enum errors by removing the errors where the allowed values are a subset of another error. - - For example, if `enum` contains two errors and one has `validator_value` (i.e. accepted values) ["A", "B"] and the - other one ["A", "B", "C"] then the first one is removed and the final - `enum` list only contains the error with ["A", "B", "C"]. - """ - if len(errors) > 1: - # Values (and therefore `validator_value`) of an enum are always arrays, - # see https://json-schema.org/understanding-json-schema/reference/generic.html#enumerated-values - # which is why we can use join below - value_strings = [",".join(err.validator_value) for err in errors] # type: ignore - longest_enums: ValidationErrorList = [] - for value_str, err in zip(value_strings, errors): - if not _contained_at_start_of_one_of_other_values(value_str, value_strings): - longest_enums.append(err) - errors = longest_enums - return errors - - -def _deduplicate_additional_properties_errors( - errors: ValidationErrorList, -) -> ValidationErrorList: - """ - If there are multiple additional property errors it usually means that the offending element was validated against multiple schemas and its parent is a common anyOf validator. - - The error messages produced from these cases are usually - very similar and we just take the shortest one. 
For example, - the following 3 errors are raised for the `unknown` channel option in - `alt.X("variety", unknown=2)`: - - "Additional properties are not allowed ('unknown' was unexpected)" - - "Additional properties are not allowed ('field', 'unknown' were unexpected)" - - "Additional properties are not allowed ('field', 'type', 'unknown' were unexpected)". - """ - if len(errors) > 1: - # Test if all parent errors are the same anyOf error and only do - # the prioritization in these cases. Can't think of a chart spec where this - # would not be the case but still allow for it below to not break anything. - parent = errors[0].parent - if ( - parent is not None - and parent.validator == "anyOf" - # Use [1:] as don't have to check for first error as it was used - # above to define `parent` - and all(err.parent is parent for err in errors[1:]) - ): - errors = [min(errors, key=lambda x: len(x.message))] - return errors - - -def _deduplicate_by_message(errors: ValidationErrorList) -> ValidationErrorList: - """Deduplicate errors by message. 
This keeps the original order in case it was chosen intentionally.""" - return list({e.message: e for e in errors}.values()) - - def _subclasses(cls: type[Any]) -> Iterator[type[Any]]: """Breadth-first sequence of all classes which inherit from cls.""" seen = set() - current_set = {cls} - while current_set: - seen |= current_set - current_set = set.union(*(set(cls.__subclasses__()) for cls in current_set)) - for cls in current_set - seen: + current: set[type[Any]] = {cls} + while current: + seen |= current + current = set(chain.from_iterable(cls.__subclasses__() for cls in current)) + for cls in current - seen: yield cls @@ -714,7 +555,7 @@ def __init__(self, obj: SchemaBase, err: ValidationError) -> None: super().__init__(**err._contents()) self.obj = obj err = cast("SchemaValidationError", err) - self._errors: Iterable[ValidationError] = err._errors + self._errors: _ErrsLazyGroup = err._errors # This is the message from err self._original_message = self.message self.message = self._get_message() @@ -722,49 +563,47 @@ def __init__(self, obj: SchemaBase, err: ValidationError) -> None: def __str__(self) -> str: return self.message + @staticmethod + def indent_from_second_line(msg: str, /, indent: int = 4) -> str: + return "\n".join( + " " * indent + s if idx > 0 and s else s + for idx, s in enumerate(msg.split("\n")) + ) + def _get_message(self) -> str: - def indent_second_line_onwards(message: str, indent: int = 4) -> str: - modified_lines: list[str] = [] - for idx, line in enumerate(message.split("\n")): - if idx > 0 and len(line) > 0: - line = " " * indent + line - modified_lines.append(line) - return "\n".join(modified_lines) - - error_messages: list[str] = [] - # Only show a maximum of 3 errors as else the final message returned by this - # method could get very long. 
- # ^^^^^^^^^^ - # CORRECTION: Only show 3 **json_paths** - - for errors in _group_errors_by_json_path(self._errors).values(): - error_messages.append(self._get_message_for_errors_group(errors)) - - message = "" - if len(error_messages) > 1: - error_messages = [ - indent_second_line_onwards(f"Error {error_id}: {m}") + it = self._errors + group_1 = list(next(it)) + if (group_2 := next(it, None)) is not None: + error_messages = [] + for group in group_1, list(group_2), next(it, None): + if group is not None: + error_messages.append(self._get_message_for_errors_group(group)) + message = "\n\n".join( + self.indent_from_second_line(f"Error {error_id}: {m}") for error_id, m in enumerate(error_messages, start=1) - ] - message += "Multiple errors were found.\n\n" - message += "\n\n".join(error_messages) - return message + ) + return f"Multiple errors were found.\n\n{message}" + else: + return self._get_message_for_errors_group(group_1) - def _get_message_for_errors_group( - self, - errors: ValidationErrorList, - ) -> str: + def _get_message_for_errors_group(self, errors: _Errs) -> str: + """ + Note. + + During development, we only found cases where an additionalProperties + error was raised if that was the only error for the offending instance + as identifiable by the json path. + + Therefore, we just check here the first error. + However, other constellations might exist in which case this should be adapted + so that other error messages are shown as well. + """ + if not isinstance(errors, Sequence): + errors = list(errors) if errors[0].validator == "additionalProperties": - # During development, we only found cases where an additionalProperties - # error was raised if that was the only error for the offending instance - # as identifiable by the json path. Therefore, we just check here the first - # error. However, other constellations might exist in which case - # this should be adapted so that other error messages are shown as well. 
- message = self._get_additional_properties_error_message(errors[0]) + return self._get_additional_properties_error_message(errors[0]) else: - message = self._get_default_error_message(errors=errors) - - return message.strip() + return self._get_default_error_message(errors=errors) def _get_additional_properties_error_message( self, @@ -779,13 +618,12 @@ def _get_additional_properties_error_message( # "Additional properties are not allowed ('unknown' was unexpected)" # Line below extracts "unknown" from this string parameter_name = error.message.split("('")[-1].split("'")[0] - message = f"""\ -`{altair_cls.__name__}` has no parameter named '{parameter_name}' - -Existing parameter names are: -{param_names_table} -See the help for `{altair_cls.__name__}` to read the full description of these parameters""" - return message + cls_name = altair_cls.__name__ + return ( + f"`{cls_name}` has no parameter named '{parameter_name}'\n\n" + f"Existing parameter names are:\n{param_names_table}\n" + f"See the help for `{cls_name}` to read the full description of these parameters" + ) def _get_altair_class_for_error(self, error: ValidationError) -> type[SchemaBase]: """ @@ -793,6 +631,8 @@ def _get_altair_class_for_error(self, error: ValidationError) -> type[SchemaBase This should lead to more informative error messages pointing the user closer to the source of the issue. """ + from altair import vegalite + for prop_name in reversed(error.absolute_path): # Check if str as e.g. 
first item can be a 0 if isinstance(prop_name, str): @@ -804,24 +644,17 @@ def _get_altair_class_for_error(self, error: ValidationError) -> type[SchemaBase # Did not find a suitable class based on traversing the path so we fall # back on the class of the top-level object which created # the SchemaValidationError - cls = self.obj.__class__ + cls = type(self.obj) return cls @staticmethod - def _format_params_as_table(param_dict_keys: Iterable[str]) -> str: + def _format_params_as_table(param_view: KeysView[str]) -> str: """Format param names into a table so that they are easier to read.""" - param_names: tuple[str, ...] - name_lengths: tuple[int, ...] - param_names, name_lengths = zip( - *[ - (name, len(name)) - for name in param_dict_keys - if name not in {"kwds", "self"} - ] - ) + param_names: list[str] = [nm for nm in param_view if nm not in {"kwds", "self"}] + # Worst case scenario with the same longest param name in the same # row for all columns - max_name_length = max(name_lengths) + max_name_length = len(max(param_view, key=len)) max_column_width = 80 # Output a square table if not too big (since it is easier to read) num_param_names = len(param_names) @@ -835,7 +668,7 @@ def split_into_equal_parts(n: int, p: int) -> list[int]: column_heights = split_into_equal_parts(num_param_names, columns) # Section the param names into columns and compute their widths - param_names_columns: list[tuple[str, ...]] = [] + param_names_columns: list[Sequence[str]] = [] column_max_widths: list[int] = [] last_end_idx: int = 0 for ch in column_heights: @@ -846,30 +679,29 @@ def split_into_equal_parts(n: int, p: int) -> list[int]: last_end_idx = ch + last_end_idx # Transpose the param name columns into rows to facilitate looping - param_names_rows: list[tuple[str, ...]] = [] - for li in zip_longest(*param_names_columns, fillvalue=""): - param_names_rows.append(li) # Build the table as a string by iterating over and formatting the rows param_names_table: str = "" - for 
param_names_row in param_names_rows: + column_pad = 3 + for param_names_row in zip_longest(*param_names_columns, fillvalue=""): + last_element = len(param_names_row) - 1 for num, param_name in enumerate(param_names_row): # Set column width based on the longest param in the column - max_name_length_column = column_max_widths[num] - column_pad = 3 - param_names_table += "{:<{}}".format( - param_name, max_name_length_column + column_pad - ) + width = column_pad + column_max_widths[num] + param_names_table += "{:<{}}".format(param_name, width) # Insert newlines and spacing after the last element in each row - if num == (len(param_names_row) - 1): + if num == last_element: param_names_table += "\n" return param_names_table def _get_default_error_message( self, - errors: ValidationErrorList, + errors: Sequence[ValidationError], ) -> str: bullet_points: list[str] = [] - errors_by_validator = _group_errors_by_validator(errors) + errors_by_validator: defaultdict[str, list[ValidationError]] = defaultdict(list) + for err in errors: + errors_by_validator[err.validator].append(err) # type: ignore[index] + if "enum" in errors_by_validator: for error in errors_by_validator["enum"]: bullet_points.append(f"one of {error.validator_value}") @@ -917,7 +749,7 @@ def _get_default_error_message( if validator not in {"enum", "type"} ) message += "".join(it) - return message + return message.strip() class UndefinedType: @@ -1213,7 +1045,7 @@ def to_dict( if validate: try: self.validate(result) - except jsonschema.ValidationError as err: + except ValidationError as err: # We do not raise `from err` as else the resulting # traceback is very long as it contains part # of the Vega-Lite schema. 
It would also first @@ -1338,12 +1170,8 @@ def validate( cls, instance: dict[str, Any], schema: dict[str, Any] | None = None ) -> None: """Validate the instance against the class schema in the context of the rootschema.""" - if schema is None: - schema = cls._schema - # For the benefit of mypy - assert schema is not None - lazy_validate_json_schema( - instance, schema, rootschema=cls._rootschema or cls._schema + validate_jsonschema( + instance, schema or cls._schema, cls._rootschema or cls._schema ) @classmethod @@ -1370,7 +1198,7 @@ def validate_property( np_opt = sys.modules.get("numpy") value = _todict(value, context={}, np_opt=np_opt, pd_opt=pd_opt) props = cls.resolve_references(schema or cls._schema).get("properties", {}) - lazy_validate_json_schema( + validate_jsonschema( value, props.get(name, {}), rootschema=cls._rootschema or cls._schema ) @@ -1406,6 +1234,17 @@ def _passthrough(*args: Any, **kwds: Any) -> Any | dict[str, Any]: return args[0] if args else kwds +def _freeze(val): + if isinstance(val, dict): + return frozenset((k, _freeze(v)) for k, v in val.items()) + elif isinstance(val, set): + return frozenset(_freeze(v) for v in val) + elif isinstance(val, (list, tuple)): + return tuple(_freeze(v) for v in val) + else: + return val + + class _FromDict: """ Class used to construct SchemaBase class hierarchies from a dict. 
@@ -1443,22 +1282,8 @@ def hash_schema(cls, schema: dict[str, Any], use_json: bool = True) -> int: for key, val in schema.items() if key not in cls._hash_exclude_keys } - if use_json: - s = json.dumps(schema, sort_keys=True) - return hash(s) - else: - - def _freeze(val): - if isinstance(val, dict): - return frozenset((k, _freeze(v)) for k, v in val.items()) - elif isinstance(val, set): - return frozenset(map(_freeze, val)) - elif isinstance(val, (list, tuple)): - return tuple(map(_freeze, val)) - else: - return val - - return hash(_freeze(schema)) + s: Any = json.dumps(schema, sort_keys=True) if use_json else _freeze(schema) + return hash(s) @overload def from_dict( @@ -1541,8 +1366,8 @@ def from_dict( schemas = resolved.get("anyOf", []) + resolved.get("oneOf", []) for possible in schemas: try: - lazy_validate_json_schema(dct, possible, rootschema=root_schema) - except jsonschema.ValidationError: + validate_jsonschema_fail_fast(dct, possible, rootschema=root_schema) + except ValidationError: continue else: return from_dict(dct, schema=possible, default_class=target_tp) @@ -1569,6 +1394,8 @@ def __init__(self, prop: str, schema: dict[str, Any]) -> None: self.schema = schema def __get__(self, obj, cls): + from altair import vegalite + self.obj = obj self.cls = cls # The docs from the encoding class parameter (e.g. `bin` in X, Color, From 5483db8c8bc009f405f288988c40bb54bb7ea312 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:30:29 +0100 Subject: [PATCH 18/92] test(perf): Adds `test_chart_validation_benchmark` Temporary, will remove before review. 
Tried to isolate to a single function so that I can reproduce on main --- tests/utils/test_schemapi.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index 7ae96a864..a73994163 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -871,6 +871,39 @@ def test_chart_validation_errors(chart_func, expected_error_message): chart.to_dict() +_SKIP_SLOW_BENCHMARKS: bool = False + + +@pytest.mark.skipif( + _SKIP_SLOW_BENCHMARKS, + reason="Should only be run in isolation to test single threaded performance.", +) +def test_chart_validation_benchmark() -> None: + """ + Intended to isolate the `to_dict` call. + + Repeated ``1000`` times, non-parametric: + - in an attempt to limit the potential overhead of ``pytest`` + - but enforce ``1`` thread, like a user-code would be. + """ + if TYPE_CHECKING: + from typing import Iterator + + from altair.typing import ChartType + + def _iter_charts(*, times: int) -> Iterator[ChartType]: + from itertools import chain, repeat + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) + charts: list[ChartType] = [fn() for fn, _ in chart_funcs_error_message] + yield from chain.from_iterable(repeat(charts, times=times)) + + for chart in _iter_charts(times=1000): + with pytest.raises(SchemaValidationError): + chart.to_dict(validate=True) + + def test_multiple_field_strings_in_condition(): selection = alt.selection_point() expected_error_message = "A field cannot be used for both the `if_true` and `if_false` values of a condition. One of them has to specify a `value` or `datum` definition." 
From f208066c7037ec89c2e0bcd9e12e0cd661eaf0ff Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:40:11 +0100 Subject: [PATCH 19/92] fix: Invert default for `_SKIP_SLOW_BENCHMARKS` I renamed this from `_SLOW_BENCHMARKS` but forgot to invert the bool lol --- tests/utils/test_schemapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index 8526faf3c..231b60d74 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -875,7 +875,7 @@ def test_chart_validation_errors(chart_func, expected_error_message): chart.to_dict() -_SKIP_SLOW_BENCHMARKS: bool = False +_SKIP_SLOW_BENCHMARKS: bool = True @pytest.mark.skipif( From 996ea97134566d4ead1629460c0bd7acdb77d24a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:08:49 +0100 Subject: [PATCH 20/92] refactor: Parameterize `_regroup` and improve doc --- tools/schemapi/schemapi.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 57bf63b23..7acf76d82 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -160,13 +160,15 @@ def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: yield from others -def _regroup(errors: _Errs, /) -> _ErrsLazyGroup: +def _regroup( + errors: _Errs, /, *, key: Callable[[ValidationError], str] = _json_path +) -> _ErrsLazyGroup: """ - Regroup error stream with the assumption they are already sorted. + Regroup error stream by a ``key`` function. - This holds **only after** all other stages. + Assumes ``errors`` are already sorted, which holds **only** at the end of ``validate_jsonschema``. 
""" - for _, grouped_it in groupby(errors, _json_path): + for _, grouped_it in groupby(errors, key): yield grouped_it From 5751132a63af6b6b1f67be5a4d636bb345ae112e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:19:38 +0100 Subject: [PATCH 21/92] docs: Update `validate_jsonschema`/`_fail_fast` --- tools/schemapi/schemapi.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 7acf76d82..953717b11 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -178,19 +178,16 @@ def validate_jsonschema( rootschema: dict[str, Any] | None = None, ) -> None: """ - Lazy equivalent of `validate_jsonschema`. + Validates ``spec`` against ``schema`` in the context of ``rootschema``. - Validates the passed in spec against the schema in the context of the rootschema. + Any ``ValidationError``(s) are deduplicated and prioritized, with + the remaining errors deemed relevant to the user. - If any errors are found, they are deduplicated and prioritized - and only the most relevant errors are kept. - - Nothing special about this first error but we need to choose one - which can be raised - All errors are then attached as a new attribute to ValidationError so that - they can be used in SchemaValidationError to craft a more helpful - error message. Setting a new attribute like this is not ideal as - it then no longer matches the type ValidationError. + Notes + ----- + - The first error is monkeypatched with a grouped iterator of all remaining errors + - ``SchemaValidationError`` utilizes the patched attribute, to craft a more helpful error message. + - However this breaks typing """ it_errors = _iter_errors_from_spec(spec, schema, rootschema=rootschema) if first_error := next(it_errors, None): @@ -217,7 +214,7 @@ def validate_jsonschema_fail_fast( """ Raise as quickly as possible. 
- Use when any information about the error is not needed. + Use instead of ``validate_jsonschema`` when any information about the error(s) are not needed. """ if ( err := next(_iter_errors_from_spec(spec, schema, rootschema=rootschema), None) From cb1fa24e468309be8a68d580ec100585fc2acb10 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 21 Aug 2024 21:16:01 +0100 Subject: [PATCH 22/92] refactor: Use more constants, rename, reorder --- tools/schemapi/schemapi.py | 63 +++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 953717b11..4c14dab7f 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -35,7 +35,7 @@ from packaging.version import Version if TYPE_CHECKING: - from typing import ClassVar + from typing import ClassVar, Literal, Mapping from jsonschema.protocols import Validator, _JsonParameter from referencing import Registry @@ -59,6 +59,19 @@ _ErrsLazy: TypeAlias = Iterator[ValidationError] _ErrsLazyGroup: TypeAlias = Iterator[_ErrsLazy] _IntoLazyGroup: TypeAlias = Iterator["tuple[str, ValidationError]"] + _ValidatorKeyword: TypeAlias = Literal[ + "additionalProperties", + "enum", + "type", + "required", + "properties", + "anyOf", + "allOf", + "oneOf", + "ref", + "const", + ] + """Non-exhaustive listing of possible literals in ``ValidationError.validator``""" _VEGA_LITE_ROOT_URI: Final = "urn:vega-lite-schema" @@ -330,6 +343,17 @@ def _get_referencing_registry( return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) +_FN_PATH = cast("Callable[[tuple[str, ValidationError]], str]", operator.itemgetter(0)) +"""Key function for ``(json_path, ValidationError)``.""" +_FN_VALIDATOR = cast("Callable[[ValidationError], _ValidatorKeyword]", operator.attrgetter("validator")) # fmt: off +"""Key function for ``ValidationError.validator``.""" + + +def _message_len(err: 
ValidationError, /) -> int: + """Return length of a ``ValidationError`` message.""" + return len(err.message) + + def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: """ Combines 3 previously distinct steps: @@ -354,26 +378,17 @@ def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: from that function and so it's unlikely that this was what the user intended if the keyword is not present in the first place. """ # noqa: D400 + REQUIRED = "required" + VALUE = ["value"] for err in errors: if err_context := err.context: yield from _group_tree_leaves(err_context) - elif err.validator == "required" and err.validator_value == ["value"]: + elif err.validator == REQUIRED and err.validator_value == VALUE: continue else: yield _json_path(err), err -_fn_path = cast("Callable[[tuple[str, ValidationError]], str]", operator.itemgetter(0)) -"""Key function for ``(json_path, ValidationError)``.""" -_fn_validator = cast("Callable[[ValidationError], str]", operator.attrgetter("validator")) # fmt: off -"""Key function for ``ValidationError.validator``.""" - - -def _message_len(err: ValidationError, /) -> int: - """Return length of a ``ValidationError`` message.""" - return len(err.message) - - def _prune_subset_paths(json_path_errors: _IntoLazyGroup, /) -> Iterator[_Errs]: """ Removes key (json path), value (errors) pairs where the json path is fully contained in another json path. 
@@ -389,9 +404,9 @@ def _prune_subset_paths(json_path_errors: _IntoLazyGroup, /) -> Iterator[_Errs]:
     - Reversing allows prioritising more specific groups, since they are seen first
     - Then re-reversed, to keep seen order
     """
-    rev_sort = sorted(json_path_errors, key=_fn_path, reverse=True)
+    rev_sort = sorted(json_path_errors, key=_FN_PATH, reverse=True)
     keeping: dict[str, _Errs] = {}
-    for unique_path, grouped_errors in groupby(rev_sort, key=_fn_path):
+    for unique_path, grouped_errors in groupby(rev_sort, key=_FN_PATH):
         if any(seen.startswith(unique_path) for seen in keeping):
             continue
         else:
@@ -399,7 +414,9 @@ def _prune_subset_paths(json_path_errors: _IntoLazyGroup, /) -> Iterator[_Errs]:
     yield from islice(reversed(keeping.values()), 3)


-def _groupby_validator(errors: _Errs, /) -> Iterator[tuple[str, _ErrsLazy]]:
+def _groupby_validator(
+    errors: _Errs, /
+) -> Iterator[tuple[_ValidatorKeyword, _ErrsLazy]]:
     """
     Groups the errors by the json schema "validator" that caused the error.
@@ -408,7 +425,7 @@ def _groupby_validator(errors: _Errs, /) -> Iterator[tuple[str, _ErrsLazy]]:
     was set although no additional properties are allowed then "validator" is
     `"additionalProperties`, etc.
""" - yield from groupby(sorted(errors, key=_fn_validator), key=_fn_validator) + yield from groupby(sorted(errors, key=_FN_VALIDATOR), key=_FN_VALIDATOR) def _deduplicate_errors(grouped_errors: Iterator[_Errs], /) -> _ErrsLazy: @@ -420,10 +437,8 @@ def _deduplicate_errors(grouped_errors: Iterator[_Errs], /) -> _ErrsLazy: """ for by_path in grouped_errors: for validator, errors in _groupby_validator(by_path): - if validator == "additionalProperties": - errors = _shortest_any_of(errors) - elif validator == "enum": - errors = _prune_subset_enum(errors) + if fn := _FN_MAP_DEDUPLICATION.get(validator): + errors = fn(errors) yield from _distinct_messages(errors) @@ -468,6 +483,12 @@ def _prune_subset_enum(iterable: _Errs, /) -> _ErrsLazy: yield err +_FN_MAP_DEDUPLICATION: Mapping[_ValidatorKeyword, Callable[[_Errs], _ErrsLazy]] = { + "additionalProperties": _shortest_any_of, + "enum": _prune_subset_enum, +} + + def _subclasses(cls: type[Any]) -> Iterator[type[Any]]: """Breadth-first sequence of all classes which inherit from cls.""" seen = set() From 09c83ae216819003b996363289ae3322ef2d9960 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 21 Aug 2024 21:17:23 +0100 Subject: [PATCH 23/92] docs: Tweak `_group_tree_leaves` --- tools/schemapi/schemapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 4c14dab7f..6d2e161b4 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -365,7 +365,7 @@ def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: 2. ``_group_errors_by_json_path`` (part of) - Extracts the path for grouping. + Extracts the ``.json_path`` property for grouping. 3. 
Removes:: From a0a897833091c57f539a6be9ad443350926b8907 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 21 Aug 2024 21:22:39 +0100 Subject: [PATCH 24/92] refactor(perf): Conditionally define compatibility code Rather than checking a function/constant, a single set of behaviour is defined **once** - depending on `jsonschema` version. This makes the remaining functionality much easier to reason with. Also easier to avoid typing issues --- tools/schemapi/schemapi.py | 296 +++++++++++++++++++------------------ 1 file changed, 151 insertions(+), 145 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 6d2e161b4..465c5681b 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -109,32 +109,6 @@ class Derived(SchemaBase): _class_is_valid_at_instantiation: ClassVar[bool] = False """ -_JSONSCHEMA_VERSION = Version(importlib_version("jsonschema")) -_USING_REFERENCING: Final[bool] = _JSONSCHEMA_VERSION >= Version("4.18") # noqa: SIM300 -""" -``jsonschema`` deprecated ``RefResolver`` in favor of ``referencing``. - -See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 -""" - -if _JSONSCHEMA_VERSION >= Version("4.0.1"): # noqa: SIM300 - _json_path: Callable[[ValidationError], str] = operator.attrgetter("json_path") -else: - - def _json_path(err: ValidationError, /) -> str: - """ - Vendored backport for ``jsonschema.ValidationError.json_path`` property. - - See https://github.com/vega/altair/issues/3038. - """ - path = "$" - for elem in err.absolute_path: - if isinstance(elem, int): - path += "[" + str(elem) + "]" - else: - path += "." + elem - return path - def enable_debug_mode() -> None: global DEBUG_MODE @@ -157,34 +131,6 @@ def debug_mode(arg: bool) -> Iterator[None]: DEBUG_MODE = original -def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: - """ - Continue an iterator at the last popped ``element``. 
- - Equivalent to:: - - elements = 1, 2, 3, 4, 5 - it = iter(elements) - element = next(it) - it_continue = chain([element], it) - - """ - yield element - yield from others - - -def _regroup( - errors: _Errs, /, *, key: Callable[[ValidationError], str] = _json_path -) -> _ErrsLazyGroup: - """ - Regroup error stream by a ``key`` function. - - Assumes ``errors`` are already sorted, which holds **only** at the end of ``validate_jsonschema``. - """ - for _, grouped_it in groupby(errors, key): - yield grouped_it - - def validate_jsonschema( spec: _JsonParameter, schema: dict[str, Any], @@ -202,7 +148,7 @@ def validate_jsonschema( - ``SchemaValidationError`` utilizes the patched attribute, to craft a more helpful error message. - However this breaks typing """ - it_errors = _iter_errors_from_spec(spec, schema, rootschema=rootschema) + it_errors = _iter_validator_errors(spec, schema, rootschema=rootschema) if first_error := next(it_errors, None): groups = _group_tree_leaves(_rechain(first_error, it_errors)) most_specific = _prune_subset_paths(groups) @@ -230,60 +176,11 @@ def validate_jsonschema_fail_fast( Use instead of ``validate_jsonschema`` when any information about the error(s) are not needed. """ if ( - err := next(_iter_errors_from_spec(spec, schema, rootschema=rootschema), None) + err := next(_iter_validator_errors(spec, schema, rootschema=rootschema), None) ) is not None: raise err -def _iter_errors_from_spec( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, -) -> _ErrsLazy: - """ - Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. - - ``schema`` and ``rootschema`` are not validated but instead considered as valid. - - We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. - Instead, we pass the ``schema`` directly to the validator class. - - This is done for two reasons: - - 1. 
The schema comes from Vega-Lite and is not based on the user - input, therefore there is no need to validate it in the first place. - 2. The "uri-reference" format checker fails for some of the - references as URIs in "$ref" are not encoded, e.g.: - - '#/definitions/ValueDefWithCondition' - - would be a valid $ref in a Vega-Lite schema but it is not a valid - URI reference due to the characters such as '<'. - """ - json_schema_draft_url = _get_json_schema_draft_url(rootschema or schema) - validator_cls: type[Validator] = cast( - "type[Validator]", - jsonschema.validators.validator_for({"$schema": json_schema_draft_url}), - ) - validator_kwargs: dict[str, Any] = {} - if hasattr(validator_cls, "FORMAT_CHECKER"): - validator_kwargs["format_checker"] = validator_cls.FORMAT_CHECKER - - if _USING_REFERENCING: - schema = _prepare_references(schema) - validator_kwargs["registry"] = _get_referencing_registry( - rootschema or schema, json_schema_draft_url - ) - else: - # No resolver is necessary if the schema is already the full schema - validator_kwargs["resolver"] = ( - jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema - ) - - validator = validator_cls(schema, **validator_kwargs) - return validator.iter_errors(spec) - - def _get_json_schema_draft_url(schema: dict[str, Any]) -> str: return schema.get("$schema", _DEFAULT_JSON_SCHEMA_DRAFT_URL) @@ -319,28 +216,103 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v -def _get_referencing_registry( - rootschema: dict[str, Any], json_schema_draft_url: str | None = None -) -> Registry: - """ - Referencing is a dependency of newer jsonschema versions. 
+def _prepare_validator(url: str, /) -> Callable[..., Validator]: + tp = cast( + "Callable[..., Validator]", + jsonschema.validators.validator_for({"$schema": url}), + ) + if hasattr(tp, "FORMAT_CHECKER"): + return partial(tp, format_checker=tp.FORMAT_CHECKER) + else: + return tp + + +if Version(importlib_version("jsonschema")) >= Version("4.18"): + from referencing import Registry + from referencing.jsonschema import specification_with + + def _construct_validator( + schema: dict[str, Any], rootschema: dict[str, Any] | None = None + ) -> Validator: + url = _get_json_schema_draft_url(rootschema or schema) + tp = _prepare_validator(url) + registry = _get_referencing_registry(rootschema or schema, url) + return tp(_prepare_references(schema), registry=registry) + + def _get_referencing_registry( + rootschema: dict[str, Any], json_schema_draft_url: str | None = None + ) -> Registry[Any]: + """ + Referencing is a dependency of newer jsonschema versions. - See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 - We ignore 'import' ``mypy`` errors which happen when the ``referencing`` library - is not installed. - That's ok as in these cases this function is not called. + We ignore 'import' ``mypy`` errors which happen when the ``referencing`` library + is not installed. + That's ok as in these cases this function is not called. + + We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case + ``referencing`` is installed. 
+ """ + dialect_id = json_schema_draft_url or _get_json_schema_draft_url(rootschema) + specification = specification_with(dialect_id) + resource = specification.create_resource(rootschema) + return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) + + def _resolve_references( + schema: dict[str, Any], rootschema: dict[str, Any] | None = None + ) -> dict[str, Any]: + """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" + registry = _get_referencing_registry(rootschema or schema) + resolver = registry.resolver() + while "$ref" in schema: + schema = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]).contents + return schema +else: + + def _construct_validator( + schema: dict[str, Any], rootschema: dict[str, Any] | None = None + ) -> Validator: + tp = _prepare_validator(_get_json_schema_draft_url(rootschema or schema)) + resolver: Any = ( + jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema + ) + return tp(schema, resolver=resolver) + + def _resolve_references( + schema: dict[str, Any], rootschema: dict[str, Any] | None = None + ) -> dict[str, Any]: + """ + Resolve schema references until there is no $ref anymore in the top-level of the dictionary. + + ``jsonschema`` deprecated ``RefResolver`` in favor of ``referencing``. + + See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + """ + resolver = jsonschema.RefResolver.from_schema(rootschema or schema) + while "$ref" in schema: + with resolver.resolving(schema["$ref"]) as resolved: + schema = resolved + return schema - We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case - ``referencing`` is installed. 
- """ - from referencing import Registry # type: ignore[import,unused-ignore] # noqa: I001 - from referencing.jsonschema import specification_with # type: ignore[import,unused-ignore] - dialect_id = json_schema_draft_url or _get_json_schema_draft_url(rootschema) - specification = specification_with(dialect_id) - resource = specification.create_resource(rootschema) - return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) +if Version(importlib_version("jsonschema")) >= Version("4.0.1"): + _json_path: Callable[[ValidationError], str] = operator.attrgetter("json_path") +else: + + def _json_path(err: ValidationError, /) -> str: + """ + Vendored backport for ``jsonschema.ValidationError.json_path`` property. + + See https://github.com/vega/altair/issues/3038. + """ + path = "$" + for elem in err.absolute_path: + if isinstance(elem, int): + path += "[" + str(elem) + "]" + else: + path += "." + elem + return path _FN_PATH = cast("Callable[[tuple[str, ValidationError]], str]", operator.itemgetter(0)) @@ -354,6 +326,62 @@ def _message_len(err: ValidationError, /) -> int: return len(err.message) +def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: + """ + Continue an iterator at the last popped ``element``. + + Equivalent to:: + + elements = 1, 2, 3, 4, 5 + it = iter(elements) + element = next(it) + it_continue = chain([element], it) + + """ + yield element + yield from others + + +def _regroup( + errors: _Errs, /, *, key: Callable[[ValidationError], str] = _json_path +) -> _ErrsLazyGroup: + """ + Regroup error stream by a ``key`` function. + + Assumes ``errors`` are already sorted, which holds **only** at the end of ``validate_jsonschema``. 
+ """ + for _, grouped_it in groupby(errors, key): + yield grouped_it + + +def _iter_validator_errors( + spec: _JsonParameter, + schema: dict[str, Any], + rootschema: dict[str, Any] | None = None, +) -> _ErrsLazy: + """ + Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. + + ``schema`` and ``rootschema`` are not validated but instead considered as valid. + + We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. + Instead, we pass the ``schema`` directly to the validator class. + + This is done for two reasons: + + 1. The schema comes from Vega-Lite and is not based on the user + input, therefore there is no need to validate it in the first place. + 2. The "uri-reference" format checker fails for some of the + references as URIs in "$ref" are not encoded, e.g.: + + '#/definitions/ValueDefWithCondition' + + would be a valid $ref in a Vega-Lite schema but it is not a valid + URI reference due to the characters such as '<'. 
+ """ + return _construct_validator(schema, rootschema).iter_errors(spec) + + def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: """ Combines 3 previously distinct steps: @@ -546,28 +574,6 @@ def _todict(obj: Any, context: dict[str, Any] | None, np_opt: Any, pd_opt: Any) return obj -def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None -) -> dict[str, Any]: - """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" - if _USING_REFERENCING: - registry = _get_referencing_registry(rootschema or schema) - # Using a different variable name to show that this is not the - # jsonschema.RefResolver but instead a Resolver from the referencing - # library - referencing_resolver = registry.resolver() - while "$ref" in schema: - schema = referencing_resolver.lookup( - _VEGA_LITE_ROOT_URI + schema["$ref"] - ).contents - else: - resolver = jsonschema.RefResolver.from_schema(rootschema or schema) - while "$ref" in schema: - with resolver.resolving(schema["$ref"]) as resolved: - schema = resolved - return schema - - class SchemaValidationError(jsonschema.ValidationError): """A wrapper for jsonschema.ValidationError with friendlier traceback.""" From 2e8159261cfbe528c87d1f58c5641412c44bcd6a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 21 Aug 2024 21:24:36 +0100 Subject: [PATCH 25/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 376 +++++++++++++++++++++------------------ 1 file changed, 201 insertions(+), 175 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 248193a85..ef477e769 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -37,7 +37,7 @@ from packaging.version import Version if TYPE_CHECKING: - from typing import ClassVar + from typing import ClassVar, Literal, Mapping from jsonschema.protocols import Validator, _JsonParameter from referencing import 
Registry @@ -61,6 +61,19 @@ _ErrsLazy: TypeAlias = Iterator[ValidationError] _ErrsLazyGroup: TypeAlias = Iterator[_ErrsLazy] _IntoLazyGroup: TypeAlias = Iterator["tuple[str, ValidationError]"] + _ValidatorKeyword: TypeAlias = Literal[ + "additionalProperties", + "enum", + "type", + "required", + "properties", + "anyOf", + "allOf", + "oneOf", + "ref", + "const", + ] + """Non-exhaustive listing of possible literals in ``ValidationError.validator``""" _VEGA_LITE_ROOT_URI: Final = "urn:vega-lite-schema" @@ -98,32 +111,6 @@ class Derived(SchemaBase): _class_is_valid_at_instantiation: ClassVar[bool] = False """ -_JSONSCHEMA_VERSION = Version(importlib_version("jsonschema")) -_USING_REFERENCING: Final[bool] = _JSONSCHEMA_VERSION >= Version("4.18") # noqa: SIM300 -""" -``jsonschema`` deprecated ``RefResolver`` in favor of ``referencing``. - -See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 -""" - -if _JSONSCHEMA_VERSION >= Version("4.0.1"): # noqa: SIM300 - _json_path: Callable[[ValidationError], str] = operator.attrgetter("json_path") -else: - - def _json_path(err: ValidationError, /) -> str: - """ - Vendored backport for ``jsonschema.ValidationError.json_path`` property. - - See https://github.com/vega/altair/issues/3038. - """ - path = "$" - for elem in err.absolute_path: - if isinstance(elem, int): - path += "[" + str(elem) + "]" - else: - path += "." + elem - return path - def enable_debug_mode() -> None: global DEBUG_MODE @@ -146,53 +133,24 @@ def debug_mode(arg: bool) -> Iterator[None]: DEBUG_MODE = original -def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: - """ - Continue an iterator at the last popped ``element``. - - Equivalent to:: - - elements = 1, 2, 3, 4, 5 - it = iter(elements) - element = next(it) - it_continue = chain([element], it) - - """ - yield element - yield from others - - -def _regroup(errors: _Errs, /) -> _ErrsLazyGroup: - """ - Regroup error stream with the assumption they are already sorted. 
- - This holds **only after** all other stages. - """ - for _, grouped_it in groupby(errors, _json_path): - yield grouped_it - - def validate_jsonschema( spec: _JsonParameter, schema: dict[str, Any], rootschema: dict[str, Any] | None = None, ) -> None: """ - Lazy equivalent of `validate_jsonschema`. + Validates ``spec`` against ``schema`` in the context of ``rootschema``. - Validates the passed in spec against the schema in the context of the rootschema. + Any ``ValidationError``(s) are deduplicated and prioritized, with + the remaining errors deemed relevant to the user. - If any errors are found, they are deduplicated and prioritized - and only the most relevant errors are kept. - - Nothing special about this first error but we need to choose one - which can be raised - All errors are then attached as a new attribute to ValidationError so that - they can be used in SchemaValidationError to craft a more helpful - error message. Setting a new attribute like this is not ideal as - it then no longer matches the type ValidationError. + Notes + ----- + - The first error is monkeypatched with a grouped iterator of all remaining errors + - ``SchemaValidationError`` utilizes the patched attribute, to craft a more helpful error message. + - However this breaks typing """ - it_errors = _iter_errors_from_spec(spec, schema, rootschema=rootschema) + it_errors = _iter_validator_errors(spec, schema, rootschema=rootschema) if first_error := next(it_errors, None): groups = _group_tree_leaves(_rechain(first_error, it_errors)) most_specific = _prune_subset_paths(groups) @@ -217,63 +175,14 @@ def validate_jsonschema_fail_fast( """ Raise as quickly as possible. - Use when any information about the error is not needed. + Use instead of ``validate_jsonschema`` when any information about the error(s) are not needed. 
""" if ( - err := next(_iter_errors_from_spec(spec, schema, rootschema=rootschema), None) + err := next(_iter_validator_errors(spec, schema, rootschema=rootschema), None) ) is not None: raise err -def _iter_errors_from_spec( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, -) -> _ErrsLazy: - """ - Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. - - ``schema`` and ``rootschema`` are not validated but instead considered as valid. - - We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. - Instead, we pass the ``schema`` directly to the validator class. - - This is done for two reasons: - - 1. The schema comes from Vega-Lite and is not based on the user - input, therefore there is no need to validate it in the first place. - 2. The "uri-reference" format checker fails for some of the - references as URIs in "$ref" are not encoded, e.g.: - - '#/definitions/ValueDefWithCondition' - - would be a valid $ref in a Vega-Lite schema but it is not a valid - URI reference due to the characters such as '<'. 
- """ - json_schema_draft_url = _get_json_schema_draft_url(rootschema or schema) - validator_cls: type[Validator] = cast( - "type[Validator]", - jsonschema.validators.validator_for({"$schema": json_schema_draft_url}), - ) - validator_kwargs: dict[str, Any] = {} - if hasattr(validator_cls, "FORMAT_CHECKER"): - validator_kwargs["format_checker"] = validator_cls.FORMAT_CHECKER - - if _USING_REFERENCING: - schema = _prepare_references(schema) - validator_kwargs["registry"] = _get_referencing_registry( - rootschema or schema, json_schema_draft_url - ) - else: - # No resolver is necessary if the schema is already the full schema - validator_kwargs["resolver"] = ( - jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema - ) - - validator = validator_cls(schema, **validator_kwargs) - return validator.iter_errors(spec) - - def _get_json_schema_draft_url(schema: dict[str, Any]) -> str: return schema.get("$schema", _DEFAULT_JSON_SCHEMA_DRAFT_URL) @@ -309,28 +218,170 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v -def _get_referencing_registry( - rootschema: dict[str, Any], json_schema_draft_url: str | None = None -) -> Registry: +def _prepare_validator(url: str, /) -> Callable[..., Validator]: + tp = cast( + "Callable[..., Validator]", + jsonschema.validators.validator_for({"$schema": url}), + ) + if hasattr(tp, "FORMAT_CHECKER"): + return partial(tp, format_checker=tp.FORMAT_CHECKER) + else: + return tp + + +if Version(importlib_version("jsonschema")) >= Version("4.18"): + from referencing import Registry + from referencing.jsonschema import specification_with + + def _construct_validator( + schema: dict[str, Any], rootschema: dict[str, Any] | None = None + ) -> Validator: + url = _get_json_schema_draft_url(rootschema or schema) + tp = _prepare_validator(url) + registry = _get_referencing_registry(rootschema or schema, url) + return tp(_prepare_references(schema), registry=registry) + + def _get_referencing_registry( + 
rootschema: dict[str, Any], json_schema_draft_url: str | None = None + ) -> Registry[Any]: + """ + Referencing is a dependency of newer jsonschema versions. + + See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + + We ignore 'import' ``mypy`` errors which happen when the ``referencing`` library + is not installed. + That's ok as in these cases this function is not called. + + We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case + ``referencing`` is installed. + """ + dialect_id = json_schema_draft_url or _get_json_schema_draft_url(rootschema) + specification = specification_with(dialect_id) + resource = specification.create_resource(rootschema) + return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) + + def _resolve_references( + schema: dict[str, Any], rootschema: dict[str, Any] | None = None + ) -> dict[str, Any]: + """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" + registry = _get_referencing_registry(rootschema or schema) + resolver = registry.resolver() + while "$ref" in schema: + schema = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]).contents + return schema +else: + + def _construct_validator( + schema: dict[str, Any], rootschema: dict[str, Any] | None = None + ) -> Validator: + tp = _prepare_validator(_get_json_schema_draft_url(rootschema or schema)) + resolver: Any = ( + jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema + ) + return tp(schema, resolver=resolver) + + def _resolve_references( + schema: dict[str, Any], rootschema: dict[str, Any] | None = None + ) -> dict[str, Any]: + """ + Resolve schema references until there is no $ref anymore in the top-level of the dictionary. + + ``jsonschema`` deprecated ``RefResolver`` in favor of ``referencing``. 
+ + See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + """ + resolver = jsonschema.RefResolver.from_schema(rootschema or schema) + while "$ref" in schema: + with resolver.resolving(schema["$ref"]) as resolved: + schema = resolved + return schema + + +if Version(importlib_version("jsonschema")) >= Version("4.0.1"): + _json_path: Callable[[ValidationError], str] = operator.attrgetter("json_path") +else: + + def _json_path(err: ValidationError, /) -> str: + """ + Vendored backport for ``jsonschema.ValidationError.json_path`` property. + + See https://github.com/vega/altair/issues/3038. + """ + path = "$" + for elem in err.absolute_path: + if isinstance(elem, int): + path += "[" + str(elem) + "]" + else: + path += "." + elem + return path + + +_FN_PATH = cast("Callable[[tuple[str, ValidationError]], str]", operator.itemgetter(0)) +"""Key function for ``(json_path, ValidationError)``.""" +_FN_VALIDATOR = cast("Callable[[ValidationError], _ValidatorKeyword]", operator.attrgetter("validator")) # fmt: off +"""Key function for ``ValidationError.validator``.""" + + +def _message_len(err: ValidationError, /) -> int: + """Return length of a ``ValidationError`` message.""" + return len(err.message) + + +def _rechain(element: T, others: Iterable[T], /) -> Iterator[T]: """ - Referencing is a dependency of newer jsonschema versions. + Continue an iterator at the last popped ``element``. - See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + Equivalent to:: - We ignore 'import' ``mypy`` errors which happen when the ``referencing`` library - is not installed. - That's ok as in these cases this function is not called. + elements = 1, 2, 3, 4, 5 + it = iter(elements) + element = next(it) + it_continue = chain([element], it) - We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case - ``referencing`` is installed. 
""" - from referencing import Registry # type: ignore[import,unused-ignore] # noqa: I001 - from referencing.jsonschema import specification_with # type: ignore[import,unused-ignore] + yield element + yield from others - dialect_id = json_schema_draft_url or _get_json_schema_draft_url(rootschema) - specification = specification_with(dialect_id) - resource = specification.create_resource(rootschema) - return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) + +def _regroup( + errors: _Errs, /, *, key: Callable[[ValidationError], str] = _json_path +) -> _ErrsLazyGroup: + """ + Regroup error stream by a ``key`` function. + + Assumes ``errors`` are already sorted, which holds **only** at the end of ``validate_jsonschema``. + """ + for _, grouped_it in groupby(errors, key): + yield grouped_it + + +def _iter_validator_errors( + spec: _JsonParameter, + schema: dict[str, Any], + rootschema: dict[str, Any] | None = None, +) -> _ErrsLazy: + """ + Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. + + ``schema`` and ``rootschema`` are not validated but instead considered as valid. + + We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. + Instead, we pass the ``schema`` directly to the validator class. + + This is done for two reasons: + + 1. The schema comes from Vega-Lite and is not based on the user + input, therefore there is no need to validate it in the first place. + 2. The "uri-reference" format checker fails for some of the + references as URIs in "$ref" are not encoded, e.g.: + + '#/definitions/ValueDefWithCondition' + + would be a valid $ref in a Vega-Lite schema but it is not a valid + URI reference due to the characters such as '<'. 
+ """ + return _construct_validator(schema, rootschema).iter_errors(spec) def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: @@ -344,7 +395,7 @@ def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: 2. ``_group_errors_by_json_path`` (part of) - Extracts the path for grouping. + Extracts the ``.json_path`` property for grouping. 3. Removes:: @@ -357,26 +408,17 @@ def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: from that function and so it's unlikely that this was what the user intended if the keyword is not present in the first place. """ # noqa: D400 + REQUIRED = "required" + VALUE = ["value"] for err in errors: if err_context := err.context: yield from _group_tree_leaves(err_context) - elif err.validator == "required" and err.validator_value == ["value"]: + elif err.validator == REQUIRED and err.validator_value == VALUE: continue else: yield _json_path(err), err -_fn_path = cast("Callable[[tuple[str, ValidationError]], str]", operator.itemgetter(0)) -"""Key function for ``(json_path, ValidationError)``.""" -_fn_validator = cast("Callable[[ValidationError], str]", operator.attrgetter("validator")) # fmt: off -"""Key function for ``ValidationError.validator``.""" - - -def _message_len(err: ValidationError, /) -> int: - """Return length of a ``ValidationError`` message.""" - return len(err.message) - - def _prune_subset_paths(json_path_errors: _IntoLazyGroup, /) -> Iterator[_Errs]: """ Removes key (json path), value (errors) pairs where the json path is fully contained in another json path. 
@@ -392,9 +434,9 @@ def _prune_subset_paths(json_path_errors: _IntoLazyGroup, /) -> Iterator[_Errs]: - Reversing allows prioritising more specific groups, since they are seen first - Then re-reversed, to keep seen order """ - rev_sort = sorted(json_path_errors, key=_fn_path, reverse=True) + rev_sort = sorted(json_path_errors, key=_FN_PATH, reverse=True) keeping: dict[str, _Errs] = {} - for unique_path, grouped_errors in groupby(rev_sort, key=_fn_path): + for unique_path, grouped_errors in groupby(rev_sort, key=_FN_PATH): if any(seen.startswith(unique_path) for seen in keeping): continue else: @@ -402,7 +444,9 @@ def _prune_subset_paths(json_path_errors: _IntoLazyGroup, /) -> Iterator[_Errs]: yield from islice(reversed(keeping.values()), 3) -def _groupby_validator(errors: _Errs, /) -> Iterator[tuple[str, _ErrsLazy]]: +def _groupby_validator( + errors: _Errs, / +) -> Iterator[tuple[_ValidatorKeyword, _ErrsLazy]]: """ Groups the errors by the json schema "validator" that casued the error. @@ -411,7 +455,7 @@ def _groupby_validator(errors: _Errs, /) -> Iterator[tuple[str, _ErrsLazy]]: was set although no additional properties are allowed then "validator" is `"additionalProperties`, etc. 
""" - yield from groupby(sorted(errors, key=_fn_validator), key=_fn_validator) + yield from groupby(sorted(errors, key=_FN_VALIDATOR), key=_FN_VALIDATOR) def _deduplicate_errors(grouped_errors: Iterator[_Errs], /) -> _ErrsLazy: @@ -423,10 +467,8 @@ def _deduplicate_errors(grouped_errors: Iterator[_Errs], /) -> _ErrsLazy: """ for by_path in grouped_errors: for validator, errors in _groupby_validator(by_path): - if validator == "additionalProperties": - errors = _shortest_any_of(errors) - elif validator == "enum": - errors = _prune_subset_enum(errors) + if fn := _FN_MAP_DEDUPLICATION.get(validator): + errors = fn(errors) yield from _distinct_messages(errors) @@ -471,6 +513,12 @@ def _prune_subset_enum(iterable: _Errs, /) -> _ErrsLazy: yield err +_FN_MAP_DEDUPLICATION: Mapping[_ValidatorKeyword, Callable[[_Errs], _ErrsLazy]] = { + "additionalProperties": _shortest_any_of, + "enum": _prune_subset_enum, +} + + def _subclasses(cls: type[Any]) -> Iterator[type[Any]]: """Breadth-first sequence of all classes which inherit from cls.""" seen = set() @@ -528,28 +576,6 @@ def _todict(obj: Any, context: dict[str, Any] | None, np_opt: Any, pd_opt: Any) return obj -def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None -) -> dict[str, Any]: - """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" - if _USING_REFERENCING: - registry = _get_referencing_registry(rootschema or schema) - # Using a different variable name to show that this is not the - # jsonschema.RefResolver but instead a Resolver from the referencing - # library - referencing_resolver = registry.resolver() - while "$ref" in schema: - schema = referencing_resolver.lookup( - _VEGA_LITE_ROOT_URI + schema["$ref"] - ).contents - else: - resolver = jsonschema.RefResolver.from_schema(rootschema or schema) - while "$ref" in schema: - with resolver.resolving(schema["$ref"]) as resolved: - schema = resolved - return schema - - class 
SchemaValidationError(jsonschema.ValidationError): """A wrapper for jsonschema.ValidationError with friendlier traceback.""" From 49aeec1f12f722d19121d16f977167048f647ec8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 12:37:53 +0100 Subject: [PATCH 26/92] refactor: Use language that more closely aligns with json schema Provided links to `json-schema` reference for more info. Also some minor edits to docs --- altair/utils/schemapi.py | 52 +++++++++++++++++++++--------------- tests/utils/test_schemapi.py | 16 +++++++---- tools/schemapi/schemapi.py | 51 ++++++++++++++++++++--------------- 3 files changed, 70 insertions(+), 49 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index ef477e769..b7fa7e4bf 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -50,9 +50,9 @@ from typing_extensions import TypeIs if sys.version_info >= (3, 11): - from typing import Never, Self + from typing import LiteralString, Never, Self else: - from typing_extensions import Never, Self + from typing_extensions import LiteralString, Never, Self if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -85,18 +85,19 @@ It just cannot be an empty string as we need to reference the schema registered in the ``referencing.Registry``.""" -_DEFAULT_JSON_SCHEMA_DRAFT_URL: Final = "http://json-schema.org/draft-07/schema#" +_DEFAULT_DIALECT_URI: LiteralString = "http://json-schema.org/draft-07/schema#" """ -Ideally, jsonschema specification would be parsed from the current Vega-Lite -schema instead of being hardcoded here as a default value. +Ideally, this would be parsed from the current Vega-Lite schema, and not hardcoded here. -However, due to circular imports between this module and the ``alt.vegalite`` -modules, this information is not yet available at this point as ``alt.vegalite`` -is only partially loaded. 
+However, due to circular imports between this module and ``alt.vegalite``, +this information is not yet available as the latter is only *partially* loaded. -The draft version which is used is unlikely to change often so it's ok to keep this. -There is also a test which validates that this value is always the same as in the Vega-Lite schema. +The `draft version`_ which is used is unlikely to change often so it's ok to keep this. + +.. _draft version: + https://json-schema.org/understanding-json-schema/reference/schema#declaring-a-dialect """ +# RELATED: tests/utils/test/schemapi.py/test_actual_json_schema_draft_is_same_as_hardcoded_default DEBUG_MODE: bool = True """ @@ -183,8 +184,17 @@ def validate_jsonschema_fail_fast( raise err -def _get_json_schema_draft_url(schema: dict[str, Any]) -> str: - return schema.get("$schema", _DEFAULT_JSON_SCHEMA_DRAFT_URL) +def _get_schema_dialect_uri(schema: dict[str, Any]) -> str: + """ + Return value of `$schema`_. + + Defines which JSON Schema draft ``schema`` was written for. + + .. 
_$schema: + https://json-schema.org/understanding-json-schema/reference/schema#schema + + """ + return schema.get("$schema", _DEFAULT_DIALECT_URI) def _prepare_references(schema: dict[str, Any], /) -> dict[str, Any]: @@ -218,11 +228,9 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v -def _prepare_validator(url: str, /) -> Callable[..., Validator]: - tp = cast( - "Callable[..., Validator]", - jsonschema.validators.validator_for({"$schema": url}), - ) +def _prepare_validator(uri: str, /) -> Callable[..., Validator]: + # tp = cast("Callable[..., Validator]", jsonschema.validators.validator_for({"$schema": uri})) + tp: Callable[..., Validator] = jsonschema.validators.validator_for({"$schema": uri}) if hasattr(tp, "FORMAT_CHECKER"): return partial(tp, format_checker=tp.FORMAT_CHECKER) else: @@ -236,9 +244,9 @@ def _prepare_validator(url: str, /) -> Callable[..., Validator]: def _construct_validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: - url = _get_json_schema_draft_url(rootschema or schema) - tp = _prepare_validator(url) - registry = _get_referencing_registry(rootschema or schema, url) + uri = _get_schema_dialect_uri(rootschema or schema) + tp = _prepare_validator(uri) + registry = _get_referencing_registry(rootschema or schema, uri) return tp(_prepare_references(schema), registry=registry) def _get_referencing_registry( @@ -256,7 +264,7 @@ def _get_referencing_registry( We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case ``referencing`` is installed. 
""" - dialect_id = json_schema_draft_url or _get_json_schema_draft_url(rootschema) + dialect_id = json_schema_draft_url or _get_schema_dialect_uri(rootschema) specification = specification_with(dialect_id) resource = specification.create_resource(rootschema) return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) @@ -275,7 +283,7 @@ def _resolve_references( def _construct_validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: - tp = _prepare_validator(_get_json_schema_draft_url(rootschema or schema)) + tp = _prepare_validator(_get_schema_dialect_uri(rootschema or schema)) resolver: Any = ( jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema ) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index 231b60d74..10ea2ecb2 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -22,7 +22,7 @@ import altair as alt from altair import load_schema from altair.utils.schemapi import ( - _DEFAULT_JSON_SCHEMA_DRAFT_URL, + _DEFAULT_DIALECT_URI, SchemaBase, SchemaValidationError, Undefined, @@ -42,9 +42,9 @@ def test_actual_json_schema_draft_is_same_as_hardcoded_default(): - # See comments next to definition of _DEFAULT_JSON_SCHEMA_DRAFT_URL + # See comments next to definition of `_DEFAULT_DIALECT_URI` # for details why we need this test - assert _DEFAULT_JSON_SCHEMA_DRAFT_URL == _JSON_SCHEMA_DRAFT_URL, ( + assert _DEFAULT_DIALECT_URI == _JSON_SCHEMA_DRAFT_URL, ( "The default json schema URL, which is hardcoded," + " is not the same as the one used in the Vega-Lite schema." + " You need to update the default value." @@ -876,6 +876,8 @@ def test_chart_validation_errors(chart_func, expected_error_message): _SKIP_SLOW_BENCHMARKS: bool = True +_REPEAT_TIMES = 1000 +# to_dict optimize had no observable benefit @pytest.mark.skipif( @@ -886,9 +888,13 @@ def test_chart_validation_benchmark() -> None: """ Intended to isolate the `to_dict` call. 
- Repeated ``1000`` times, non-parametric: + Repeated ``_REPEAT_TIMES`` times, non-parametric: - in an attempt to limit the potential overhead of ``pytest`` - but enforce ``1`` thread, like a user-code would be. + + Results + ------- + 8/22/2024, 10:06:32 - 1000x in 108.46s (0:01:48) """ if TYPE_CHECKING: from typing import Iterator @@ -903,7 +909,7 @@ def _iter_charts(*, times: int) -> Iterator[ChartType]: charts: list[ChartType] = [fn() for fn, _ in chart_funcs_error_message] yield from chain.from_iterable(repeat(charts, times=times)) - for chart in _iter_charts(times=1000): + for chart in _iter_charts(times=_REPEAT_TIMES): with pytest.raises(SchemaValidationError): chart.to_dict(validate=True) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 465c5681b..e5e96be84 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -48,9 +48,9 @@ from typing_extensions import TypeIs if sys.version_info >= (3, 11): - from typing import Never, Self + from typing import LiteralString, Never, Self else: - from typing_extensions import Never, Self + from typing_extensions import LiteralString, Never, Self if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -83,18 +83,19 @@ It just cannot be an empty string as we need to reference the schema registered in the ``referencing.Registry``.""" -_DEFAULT_JSON_SCHEMA_DRAFT_URL: Final = "http://json-schema.org/draft-07/schema#" +_DEFAULT_DIALECT_URI: LiteralString = "http://json-schema.org/draft-07/schema#" """ -Ideally, jsonschema specification would be parsed from the current Vega-Lite -schema instead of being hardcoded here as a default value. +Ideally, this would be parsed from the current Vega-Lite schema, and not hardcoded here. -However, due to circular imports between this module and the ``alt.vegalite`` -modules, this information is not yet available at this point as ``alt.vegalite`` -is only partially loaded. 
+However, due to circular imports between this module and ``alt.vegalite``, +this information is not yet available as the latter is only *partially* loaded. -The draft version which is used is unlikely to change often so it's ok to keep this. -There is also a test which validates that this value is always the same as in the Vega-Lite schema. +The `draft version`_ which is used is unlikely to change often so it's ok to keep this. + +.. _draft version: + https://json-schema.org/understanding-json-schema/reference/schema#declaring-a-dialect """ +# RELATED: tests/utils/test/schemapi.py/test_actual_json_schema_draft_is_same_as_hardcoded_default DEBUG_MODE: bool = True """ @@ -181,8 +182,17 @@ def validate_jsonschema_fail_fast( raise err -def _get_json_schema_draft_url(schema: dict[str, Any]) -> str: - return schema.get("$schema", _DEFAULT_JSON_SCHEMA_DRAFT_URL) +def _get_schema_dialect_uri(schema: dict[str, Any]) -> str: + """ + Return value of `$schema`_. + + Defines which JSON Schema draft ``schema`` was written for. + + .. 
_$schema: + https://json-schema.org/understanding-json-schema/reference/schema#schema + + """ + return schema.get("$schema", _DEFAULT_DIALECT_URI) def _prepare_references(schema: dict[str, Any], /) -> dict[str, Any]: @@ -216,11 +226,8 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v -def _prepare_validator(url: str, /) -> Callable[..., Validator]: - tp = cast( - "Callable[..., Validator]", - jsonschema.validators.validator_for({"$schema": url}), - ) +def _prepare_validator(uri: str, /) -> Callable[..., Validator]: + tp: Callable[..., Validator] = jsonschema.validators.validator_for({"$schema": uri}) if hasattr(tp, "FORMAT_CHECKER"): return partial(tp, format_checker=tp.FORMAT_CHECKER) else: @@ -234,9 +241,9 @@ def _prepare_validator(url: str, /) -> Callable[..., Validator]: def _construct_validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: - url = _get_json_schema_draft_url(rootschema or schema) - tp = _prepare_validator(url) - registry = _get_referencing_registry(rootschema or schema, url) + uri = _get_schema_dialect_uri(rootschema or schema) + tp = _prepare_validator(uri) + registry = _get_referencing_registry(rootschema or schema, uri) return tp(_prepare_references(schema), registry=registry) def _get_referencing_registry( @@ -254,7 +261,7 @@ def _get_referencing_registry( We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case ``referencing`` is installed. 
""" - dialect_id = json_schema_draft_url or _get_json_schema_draft_url(rootschema) + dialect_id = json_schema_draft_url or _get_schema_dialect_uri(rootschema) specification = specification_with(dialect_id) resource = specification.create_resource(rootschema) return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) @@ -273,7 +280,7 @@ def _resolve_references( def _construct_validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: - tp = _prepare_validator(_get_json_schema_draft_url(rootschema or schema)) + tp = _prepare_validator(_get_schema_dialect_uri(rootschema or schema)) resolver: Any = ( jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema ) From 61bf44877472f5dca2107967b1c9084cc416f288 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:24:57 +0100 Subject: [PATCH 27/92] perf: Cache the result of `referencing.jsonschema.specification_with` Currently, this will only ever return a single result, based on https://json-schema.org/draft-07/json-schema-release-notes --- tools/schemapi/schemapi.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index e5e96be84..1a87fec11 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -38,7 +38,7 @@ from typing import ClassVar, Literal, Mapping from jsonschema.protocols import Validator, _JsonParameter - from referencing import Registry + from referencing import Registry, Specification from altair.typing import ChartType @@ -235,8 +235,19 @@ def _prepare_validator(uri: str, /) -> Callable[..., Validator]: if Version(importlib_version("jsonschema")) >= Version("4.18"): + from functools import lru_cache + from referencing import Registry - from referencing.jsonschema import specification_with + from referencing.jsonschema import specification_with as _specification_with + + 
@lru_cache(maxsize=None) + def specification_with(dialect_id: str, /) -> Specification[Any]: + """ + Directly wraps ``referencing.jsonschema.specification_with``. + + The original function returns one **immutable** object per JSON Schema **dialect**. + """ + return _specification_with(dialect_id) def _construct_validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None From c665bb17efa564d95177253899d8ed8f0727f562 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:25:23 +0100 Subject: [PATCH 28/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index b7fa7e4bf..075e1fa7e 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -40,7 +40,7 @@ from typing import ClassVar, Literal, Mapping from jsonschema.protocols import Validator, _JsonParameter - from referencing import Registry + from referencing import Registry, Specification from altair.typing import ChartType @@ -229,7 +229,6 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: def _prepare_validator(uri: str, /) -> Callable[..., Validator]: - # tp = cast("Callable[..., Validator]", jsonschema.validators.validator_for({"$schema": uri})) tp: Callable[..., Validator] = jsonschema.validators.validator_for({"$schema": uri}) if hasattr(tp, "FORMAT_CHECKER"): return partial(tp, format_checker=tp.FORMAT_CHECKER) @@ -238,8 +237,19 @@ def _prepare_validator(uri: str, /) -> Callable[..., Validator]: if Version(importlib_version("jsonschema")) >= Version("4.18"): + from functools import lru_cache + from referencing import Registry - from referencing.jsonschema import specification_with + from referencing.jsonschema import specification_with as _specification_with + + @lru_cache(maxsize=None) + def specification_with(dialect_id: str, /) -> 
Specification[Any]: + """ + Directly wraps ``referencing.jsonschema.specification_with``. + + The original function returns one **immutable** object per JSON Schema **dialect**. + """ + return _specification_with(dialect_id) def _construct_validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None From 69588b341984108f8abc0d16342925fb1bb96897 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:01:31 +0100 Subject: [PATCH 29/92] fix(typing): Address `None` propagation, Remove outdated doc If a `None` reached `_get_schema_dialect_uri`, it would cause a runtime error. `None` does not have a `get` method. Confident this is only a theoretical issue, but is now fixed at the source. --- tools/schemapi/schemapi.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 1a87fec11..2c66daed6 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -258,30 +258,23 @@ def _construct_validator( return tp(_prepare_references(schema), registry=registry) def _get_referencing_registry( - rootschema: dict[str, Any], json_schema_draft_url: str | None = None + rootschema: dict[str, Any], dialect_id: str ) -> Registry[Any]: """ Referencing is a dependency of newer jsonschema versions. See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 - - We ignore 'import' ``mypy`` errors which happen when the ``referencing`` library - is not installed. - That's ok as in these cases this function is not called. - - We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case - ``referencing`` is installed. 
""" - dialect_id = json_schema_draft_url or _get_schema_dialect_uri(rootschema) specification = specification_with(dialect_id) resource = specification.create_resource(rootschema) return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None + schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" - registry = _get_referencing_registry(rootschema or schema) + uri = _get_schema_dialect_uri(rootschema) + registry = _get_referencing_registry(rootschema or schema, uri) resolver = registry.resolver() while "$ref" in schema: schema = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]).contents @@ -298,7 +291,7 @@ def _construct_validator( return tp(schema, resolver=resolver) def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None + schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: """ Resolve schema references until there is no $ref anymore in the top-level of the dictionary. 
@@ -900,7 +893,7 @@ class SchemaBase: """ _schema: ClassVar[dict[str, Any] | Any] = None - _rootschema: ClassVar[dict[str, Any] | None] = None + _rootschema: ClassVar[dict[str, Any] | Any] = None _class_is_valid_at_instantiation: ClassVar[bool] = True def __init__(self, *args: Any, **kwds: Any) -> None: @@ -1221,13 +1214,17 @@ def validate( @classmethod def resolve_references(cls, schema: dict[str, Any] | None = None) -> dict[str, Any]: """Resolve references in the context of this object's schema or root schema.""" - schema_to_pass = schema or cls._schema - # For the benefit of mypy - assert schema_to_pass is not None - return _resolve_references( - schema=schema_to_pass, - rootschema=(cls._rootschema or cls._schema or schema), - ) + rootschema = cls._rootschema or cls._schema or schema + if rootschema is None: + name = type(cls).__name__ + msg = ( + f"{name}.resolve_references() provided only `None` values for:\n" + f"{schema=}, {cls._schema=}, {cls._rootschema=}.\n\n" + f"This variant indicates the class definition {name!r} is invalid." 
+ ) + raise TypeError(msg) + else: + return _resolve_references(schema or cls._schema, rootschema=rootschema) @classmethod def validate_property( From 7fceab86f094855de60eef8b0695f6a61a028938 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:03:57 +0100 Subject: [PATCH 30/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 075e1fa7e..b27f6c3c3 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -260,30 +260,23 @@ def _construct_validator( return tp(_prepare_references(schema), registry=registry) def _get_referencing_registry( - rootschema: dict[str, Any], json_schema_draft_url: str | None = None + rootschema: dict[str, Any], dialect_id: str ) -> Registry[Any]: """ Referencing is a dependency of newer jsonschema versions. See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 - - We ignore 'import' ``mypy`` errors which happen when the ``referencing`` library - is not installed. - That's ok as in these cases this function is not called. - - We also have to ignore 'unused-ignore' errors as ``mypy`` raises those in case - ``referencing`` is installed. 
""" - dialect_id = json_schema_draft_url or _get_schema_dialect_uri(rootschema) specification = specification_with(dialect_id) resource = specification.create_resource(rootschema) return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None + schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" - registry = _get_referencing_registry(rootschema or schema) + uri = _get_schema_dialect_uri(rootschema) + registry = _get_referencing_registry(rootschema or schema, uri) resolver = registry.resolver() while "$ref" in schema: schema = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]).contents @@ -300,7 +293,7 @@ def _construct_validator( return tp(schema, resolver=resolver) def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None + schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: """ Resolve schema references until there is no $ref anymore in the top-level of the dictionary. 
@@ -902,7 +895,7 @@ class SchemaBase: """ _schema: ClassVar[dict[str, Any] | Any] = None - _rootschema: ClassVar[dict[str, Any] | None] = None + _rootschema: ClassVar[dict[str, Any] | Any] = None _class_is_valid_at_instantiation: ClassVar[bool] = True def __init__(self, *args: Any, **kwds: Any) -> None: @@ -1223,13 +1216,17 @@ def validate( @classmethod def resolve_references(cls, schema: dict[str, Any] | None = None) -> dict[str, Any]: """Resolve references in the context of this object's schema or root schema.""" - schema_to_pass = schema or cls._schema - # For the benefit of mypy - assert schema_to_pass is not None - return _resolve_references( - schema=schema_to_pass, - rootschema=(cls._rootschema or cls._schema or schema), - ) + rootschema = cls._rootschema or cls._schema or schema + if rootschema is None: + name = type(cls).__name__ + msg = ( + f"{name}.resolve_references() provided only `None` values for:\n" + f"{schema=}, {cls._schema=}, {cls._rootschema=}.\n\n" + f"This variant indicates the class definition {name!r} is invalid." + ) + raise TypeError(msg) + else: + return _resolve_references(schema or cls._schema, rootschema=rootschema) @classmethod def validate_property( From e6912c1abaacaa49c586de4e1bd7bbc5b51d43ce Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:34:57 +0100 Subject: [PATCH 31/92] docs: Improve `specification_with` --- tools/schemapi/schemapi.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 2c66daed6..f80683b7f 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -243,9 +243,20 @@ def _prepare_validator(uri: str, /) -> Callable[..., Validator]: @lru_cache(maxsize=None) def specification_with(dialect_id: str, /) -> Specification[Any]: """ - Directly wraps ``referencing.jsonschema.specification_with``. 
+ Retrieve the `Specification`_ with the given dialect identifier. - The original function returns one **immutable** object per JSON Schema **dialect**. + Wraps `specification_with`_, which returns one **immutable** object per + JSON Schema **dialect**. + + Raises + ------ + ``UnknownDialect`` + if the given ``dialect_id`` isn't known + + .. _Specification: + https://referencing.readthedocs.io/en/stable/api/#referencing.Specification + .. _specification_with: + https://referencing.readthedocs.io/en/stable/api/#referencing.jsonschema.specification_with """ return _specification_with(dialect_id) From 283b69ed8dd23d40a7ad3304c341d58f9b4edabe Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:44:10 +0100 Subject: [PATCH 32/92] refactor: Rename`_get_referencing_registry` -> `_registry`, improve doc The original name is misleading, as this is a factory function. That is, a new `Registry` is created on each call. I think this could be impacting performance. 
Discarding the registry every time doesn't utilise the immutable properties provided by https://referencing.readthedocs.io/en/stable/api/#referencing.Registry --- tools/schemapi/schemapi.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index f80683b7f..e1fb6f5f8 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -265,16 +265,21 @@ def _construct_validator( ) -> Validator: uri = _get_schema_dialect_uri(rootschema or schema) tp = _prepare_validator(uri) - registry = _get_referencing_registry(rootschema or schema, uri) + registry = _registry(rootschema or schema, uri) return tp(_prepare_references(schema), registry=registry) - def _get_referencing_registry( - rootschema: dict[str, Any], dialect_id: str - ) -> Registry[Any]: + def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: """ - Referencing is a dependency of newer jsonschema versions. + Constructs a `Registry`_, adding the `Resource`_ produced by ``rootschema``. - See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + Requires at least ``jsonschema`` `v4.18.0a1`_. + + .. _Registry: + https://referencing.readthedocs.io/en/stable/api/#referencing.Registry + .. _Resource: + https://referencing.readthedocs.io/en/stable/api/#referencing.Resource + .. 
_v4.18.0a1: + https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 """ specification = specification_with(dialect_id) resource = specification.create_resource(rootschema) @@ -285,7 +290,7 @@ def _resolve_references( ) -> dict[str, Any]: """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" uri = _get_schema_dialect_uri(rootschema) - registry = _get_referencing_registry(rootschema or schema, uri) + registry = _registry(rootschema or schema, uri) resolver = registry.resolver() while "$ref" in schema: schema = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]).contents From 56be98401591a17a2a1c0857c1a47078dbdaa159 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:11:38 +0100 Subject: [PATCH 33/92] refactor: Renaming, docs to align with `jsonschema` Now very clear what is being wrapped, and where to find more information --- tools/schemapi/schemapi.py | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index e1fb6f5f8..31eb8f7ab 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -226,7 +226,20 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v -def _prepare_validator(uri: str, /) -> Callable[..., Validator]: +def _validator_for(uri: str, /) -> Callable[..., Validator]: + """ + Retrieve the constructor for a `Validator`_ class appropriate for validating the given schema. + + Parameters + ---------- + uri + Address pointing to the `$schema`_. + + .. _Validator: + https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol + .. 
_$schema: + https://json-schema.org/understanding-json-schema/reference/schema + """ tp: Callable[..., Validator] = jsonschema.validators.validator_for({"$schema": uri}) if hasattr(tp, "FORMAT_CHECKER"): return partial(tp, format_checker=tp.FORMAT_CHECKER) @@ -260,11 +273,24 @@ def specification_with(dialect_id: str, /) -> Specification[Any]: """ return _specification_with(dialect_id) - def _construct_validator( + def _validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: + """ + Constructs a `Validator`_ for future validation. + + Parameters + ---------- + schema + Schema that a spec will be validated against. + rootschema + Context to evaluate within. + + .. _Validator: + https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol + """ uri = _get_schema_dialect_uri(rootschema or schema) - tp = _prepare_validator(uri) + tp = _validator_for(uri) registry = _registry(rootschema or schema, uri) return tp(_prepare_references(schema), registry=registry) @@ -297,10 +323,10 @@ def _resolve_references( return schema else: - def _construct_validator( + def _validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: - tp = _prepare_validator(_get_schema_dialect_uri(rootschema or schema)) + tp = _validator_for(_get_schema_dialect_uri(rootschema or schema)) resolver: Any = ( jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema ) @@ -406,7 +432,7 @@ def _iter_validator_errors( would be a valid $ref in a Vega-Lite schema but it is not a valid URI reference due to the characters such as '<'. 
""" - return _construct_validator(schema, rootschema).iter_errors(spec) + return _validator(schema, rootschema).iter_errors(spec) def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: From 4c4b322a5f3661b1c11de17543fde482c01caf3c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:14:09 +0100 Subject: [PATCH 34/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 72 +++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index b27f6c3c3..254fad1c2 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -228,7 +228,20 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v -def _prepare_validator(uri: str, /) -> Callable[..., Validator]: +def _validator_for(uri: str, /) -> Callable[..., Validator]: + """ + Retrieve the constructor for a `Validator`_ class appropriate for validating the given schema. + + Parameters + ---------- + uri + Address pointing to the `$schema`_. + + .. _Validator: + https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol + .. _$schema: + https://json-schema.org/understanding-json-schema/reference/schema + """ tp: Callable[..., Validator] = jsonschema.validators.validator_for({"$schema": uri}) if hasattr(tp, "FORMAT_CHECKER"): return partial(tp, format_checker=tp.FORMAT_CHECKER) @@ -245,27 +258,56 @@ def _prepare_validator(uri: str, /) -> Callable[..., Validator]: @lru_cache(maxsize=None) def specification_with(dialect_id: str, /) -> Specification[Any]: """ - Directly wraps ``referencing.jsonschema.specification_with``. + Retrieve the `Specification`_ with the given dialect identifier. + + Wraps `specification_with`_, which returns one **immutable** object per + JSON Schema **dialect**. 
+ + Raises + ------ + ``UnknownDialect`` + if the given ``dialect_id`` isn't known - The original function returns one **immutable** object per JSON Schema **dialect**. + .. _Specification: + https://referencing.readthedocs.io/en/stable/api/#referencing.Specification + .. _specification_with: + https://referencing.readthedocs.io/en/stable/api/#referencing.jsonschema.specification_with """ return _specification_with(dialect_id) - def _construct_validator( + def _validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: + """ + Constructs a `Validator`_ for future validation. + + Parameters + ---------- + schema + Schema that a spec will be validated against. + rootschema + Context to evaluate within. + + .. _Validator: + https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol + """ uri = _get_schema_dialect_uri(rootschema or schema) - tp = _prepare_validator(uri) - registry = _get_referencing_registry(rootschema or schema, uri) + tp = _validator_for(uri) + registry = _registry(rootschema or schema, uri) return tp(_prepare_references(schema), registry=registry) - def _get_referencing_registry( - rootschema: dict[str, Any], dialect_id: str - ) -> Registry[Any]: + def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: """ - Referencing is a dependency of newer jsonschema versions. + Constructs a `Registry`_, adding the `Resource`_ produced by ``rootschema``. - See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + Requires at least ``jsonschema`` `v4.18.0a1`_. + + .. _Registry: + https://referencing.readthedocs.io/en/stable/api/#referencing.Registry + .. _Resource: + https://referencing.readthedocs.io/en/stable/api/#referencing.Resource + .. 
_v4.18.0a1: + https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 """ specification = specification_with(dialect_id) resource = specification.create_resource(rootschema) @@ -276,17 +318,17 @@ def _resolve_references( ) -> dict[str, Any]: """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" uri = _get_schema_dialect_uri(rootschema) - registry = _get_referencing_registry(rootschema or schema, uri) + registry = _registry(rootschema or schema, uri) resolver = registry.resolver() while "$ref" in schema: schema = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]).contents return schema else: - def _construct_validator( + def _validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: - tp = _prepare_validator(_get_schema_dialect_uri(rootschema or schema)) + tp = _validator_for(_get_schema_dialect_uri(rootschema or schema)) resolver: Any = ( jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema ) @@ -392,7 +434,7 @@ def _iter_validator_errors( would be a valid $ref in a Vega-Lite schema but it is not a valid URI reference due to the characters such as '<'. 
""" - return _construct_validator(schema, rootschema).iter_errors(spec) + return _validator(schema, rootschema).iter_errors(spec) def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: From a3a148e88353c9be2369e30c4111fbd92dada254 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 19:18:12 +0100 Subject: [PATCH 35/92] refactor: Factor-out `_iter_validator_errors` --- tools/schemapi/schemapi.py | 49 +++++++++++++++----------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 31eb8f7ab..b0b4a97b5 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -148,8 +148,25 @@ def validate_jsonschema( - The first error is monkeypatched with a grouped iterator of all remaining errors - ``SchemaValidationError`` utilizes the patched attribute, to craft a more helpful error message. - However this breaks typing + + ``schema`` and ``rootschema`` are not validated but instead considered as valid. + + We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. + Instead, we pass the ``schema`` directly to the validator class. + + This is done for two reasons: + + 1. The schema comes from Vega-Lite and is not based on the user + input, therefore there is no need to validate it in the first place. + 2. The "uri-reference" format checker fails for some of the + references as URIs in "$ref" are not encoded, e.g.: + + '#/definitions/ValueDefWithCondition' + + would be a valid $ref in a Vega-Lite schema but it is not a valid + URI reference due to the characters such as '<'. 
""" - it_errors = _iter_validator_errors(spec, schema, rootschema=rootschema) + it_errors = _validator(schema, rootschema).iter_errors(spec) if first_error := next(it_errors, None): groups = _group_tree_leaves(_rechain(first_error, it_errors)) most_specific = _prune_subset_paths(groups) @@ -177,7 +194,7 @@ def validate_jsonschema_fail_fast( Use instead of ``validate_jsonschema`` when any information about the error(s) are not needed. """ if ( - err := next(_iter_validator_errors(spec, schema, rootschema=rootschema), None) + err := next(_validator(schema, rootschema).iter_errors(spec), None) ) is not None: raise err @@ -407,34 +424,6 @@ def _regroup( yield grouped_it -def _iter_validator_errors( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, -) -> _ErrsLazy: - """ - Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. - - ``schema`` and ``rootschema`` are not validated but instead considered as valid. - - We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. - Instead, we pass the ``schema`` directly to the validator class. - - This is done for two reasons: - - 1. The schema comes from Vega-Lite and is not based on the user - input, therefore there is no need to validate it in the first place. - 2. The "uri-reference" format checker fails for some of the - references as URIs in "$ref" are not encoded, e.g.: - - '#/definitions/ValueDefWithCondition' - - would be a valid $ref in a Vega-Lite schema but it is not a valid - URI reference due to the characters such as '<'. 
- """ - return _validator(schema, rootschema).iter_errors(spec) - - def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: """ Combines 3 previously distinct steps: From 5fd6787b71bd57263fbade1428a3f9fd1c8812a9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 21:54:13 +0100 Subject: [PATCH 36/92] chore: rename `tp` -> `validator` --- tools/schemapi/schemapi.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index b0b4a97b5..288a44674 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -307,9 +307,9 @@ def _validator( https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol """ uri = _get_schema_dialect_uri(rootschema or schema) - tp = _validator_for(uri) + validator = _validator_for(uri) registry = _registry(rootschema or schema, uri) - return tp(_prepare_references(schema), registry=registry) + return validator(_prepare_references(schema), registry=registry) def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: """ @@ -343,11 +343,11 @@ def _resolve_references( def _validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: - tp = _validator_for(_get_schema_dialect_uri(rootschema or schema)) + validator = _validator_for(_get_schema_dialect_uri(rootschema or schema)) resolver: Any = ( jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema ) - return tp(schema, resolver=resolver) + return validator(schema, resolver=resolver) def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] From 4cc16194e173edc0b45381d7451d0cd9f26bd7fe Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 22:00:35 +0100 Subject: [PATCH 37/92] perf: Experiment with more cache layers All of this is intended to avoid repeating any work 
that has already been performed. --- tools/schemapi/schemapi.py | 64 ++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 288a44674..20f891cbf 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -7,7 +7,7 @@ import sys import textwrap from collections import defaultdict -from functools import partial +from functools import lru_cache, partial from importlib.metadata import version as importlib_version from itertools import chain, groupby, islice, zip_longest from math import ceil @@ -38,9 +38,9 @@ from typing import ClassVar, Literal, Mapping from jsonschema.protocols import Validator, _JsonParameter - from referencing import Registry, Specification from altair.typing import ChartType + from altair.vegalite.v5.schema._typing import Map if sys.version_info >= (3, 13): from typing import TypeIs @@ -243,6 +243,7 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v +@lru_cache(maxsize=None) def _validator_for(uri: str, /) -> Callable[..., Validator]: """ Retrieve the constructor for a `Validator`_ class appropriate for validating the given schema. @@ -265,11 +266,13 @@ def _validator_for(uri: str, /) -> Callable[..., Validator]: if Version(importlib_version("jsonschema")) >= Version("4.18"): - from functools import lru_cache - from referencing import Registry from referencing.jsonschema import specification_with as _specification_with + if TYPE_CHECKING: + from referencing import Specification + from referencing._core import Resolver + @lru_cache(maxsize=None) def specification_with(dialect_id: str, /) -> Specification[Any]: """ @@ -324,20 +327,63 @@ def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: .. 
_v4.18.0a1: https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 """ - specification = specification_with(dialect_id) - resource = specification.create_resource(rootschema) - return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) + global _REGISTRY_CACHE + cache_key = _registry_comp_key(rootschema, dialect_id) + if (registry := _REGISTRY_CACHE.get(cache_key, None)) is not None: + return registry + else: + specification = specification_with(dialect_id) + resource = specification.create_resource(rootschema) + registry = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource) + _REGISTRY_CACHE[cache_key] = registry + return registry + + def _registry_update( + root: dict[str, Any], dialect_id: str, resolver: Resolver[Any] + ): + global _REGISTRY_CACHE + cache_key = _registry_comp_key(root, dialect_id) + _REGISTRY_CACHE[cache_key] = resolver._registry def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" + root = rootschema or schema + if ("$ref" not in root) or ("$ref" not in schema): + return schema uri = _get_schema_dialect_uri(rootschema) - registry = _registry(rootschema or schema, uri) + registry = _registry(root, uri) resolver = registry.resolver() while "$ref" in schema: - schema = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]).contents + resolved = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]) + schema = resolved.contents + _registry_update(root, uri, resolved.resolver) return schema + + def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: + """ + Generate a simple-minded hash to identify a registry. + + Notes + ----- + Why the strange hash? + - **All** generated schemas hit the ``"$ref"`` branch. + - ``api.Then`` hits the len(...) 1 branch w/ ``{"type": "object"}``. 
+ - Final branch is only hit by mock schemas in: + - `tests/utils/test_core.py::test_infer_encoding_types` + - `tests/utils/test_schemapi.py` + """ + if "$ref" in root: + k1 = root["$ref"] + elif len(root) == 1: + k1 = "".join(f"{s!s}" for s in chain(*root.items())) + else: + k1 = json.dumps(root, separators=(",", ":"), sort_keys=True) + return k1, dialect_id + + _REGISTRY_CACHE: dict[tuple[str, str], Registry[Any]] = {} + else: def _validator( From 509b67846234416da84621c0a6e6130f336be8d1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 22 Aug 2024 22:02:31 +0100 Subject: [PATCH 38/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 121 +++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 43 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 254fad1c2..a97dbc789 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -9,7 +9,7 @@ import sys import textwrap from collections import defaultdict -from functools import partial +from functools import lru_cache, partial from importlib.metadata import version as importlib_version from itertools import chain, groupby, islice, zip_longest from math import ceil @@ -40,9 +40,9 @@ from typing import ClassVar, Literal, Mapping from jsonschema.protocols import Validator, _JsonParameter - from referencing import Registry, Specification from altair.typing import ChartType + from altair.vegalite.v5.schema._typing import Map if sys.version_info >= (3, 13): from typing import TypeIs @@ -150,8 +150,25 @@ def validate_jsonschema( - The first error is monkeypatched with a grouped iterator of all remaining errors - ``SchemaValidationError`` utilizes the patched attribute, to craft a more helpful error message. - However this breaks typing + + ``schema`` and ``rootschema`` are not validated but instead considered as valid. 
+ + We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. + Instead, we pass the ``schema`` directly to the validator class. + + This is done for two reasons: + + 1. The schema comes from Vega-Lite and is not based on the user + input, therefore there is no need to validate it in the first place. + 2. The "uri-reference" format checker fails for some of the + references as URIs in "$ref" are not encoded, e.g.: + + '#/definitions/ValueDefWithCondition' + + would be a valid $ref in a Vega-Lite schema but it is not a valid + URI reference due to the characters such as '<'. """ - it_errors = _iter_validator_errors(spec, schema, rootschema=rootschema) + it_errors = _validator(schema, rootschema).iter_errors(spec) if first_error := next(it_errors, None): groups = _group_tree_leaves(_rechain(first_error, it_errors)) most_specific = _prune_subset_paths(groups) @@ -179,7 +196,7 @@ def validate_jsonschema_fail_fast( Use instead of ``validate_jsonschema`` when any information about the error(s) are not needed. """ if ( - err := next(_iter_validator_errors(spec, schema, rootschema=rootschema), None) + err := next(_validator(schema, rootschema).iter_errors(spec), None) ) is not None: raise err @@ -228,6 +245,7 @@ def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: yield k, v +@lru_cache(maxsize=None) def _validator_for(uri: str, /) -> Callable[..., Validator]: """ Retrieve the constructor for a `Validator`_ class appropriate for validating the given schema. 
@@ -250,11 +268,13 @@ def _validator_for(uri: str, /) -> Callable[..., Validator]: if Version(importlib_version("jsonschema")) >= Version("4.18"): - from functools import lru_cache - from referencing import Registry from referencing.jsonschema import specification_with as _specification_with + if TYPE_CHECKING: + from referencing import Specification + from referencing._core import Resolver + @lru_cache(maxsize=None) def specification_with(dialect_id: str, /) -> Specification[Any]: """ @@ -292,9 +312,9 @@ def _validator( https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol """ uri = _get_schema_dialect_uri(rootschema or schema) - tp = _validator_for(uri) + validator = _validator_for(uri) registry = _registry(rootschema or schema, uri) - return tp(_prepare_references(schema), registry=registry) + return validator(_prepare_references(schema), registry=registry) def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: """ @@ -309,30 +329,73 @@ def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: .. 
_v4.18.0a1: https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 """ - specification = specification_with(dialect_id) - resource = specification.create_resource(rootschema) - return Registry().with_resource(uri=_VEGA_LITE_ROOT_URI, resource=resource) + global _REGISTRY_CACHE + cache_key = _registry_comp_key(rootschema, dialect_id) + if (registry := _REGISTRY_CACHE.get(cache_key, None)) is not None: + return registry + else: + specification = specification_with(dialect_id) + resource = specification.create_resource(rootschema) + registry = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource) + _REGISTRY_CACHE[cache_key] = registry + return registry + + def _registry_update( + root: dict[str, Any], dialect_id: str, resolver: Resolver[Any] + ): + global _REGISTRY_CACHE + cache_key = _registry_comp_key(root, dialect_id) + _REGISTRY_CACHE[cache_key] = resolver._registry def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" + root = rootschema or schema + if ("$ref" not in root) or ("$ref" not in schema): + return schema uri = _get_schema_dialect_uri(rootschema) - registry = _registry(rootschema or schema, uri) + registry = _registry(root, uri) resolver = registry.resolver() while "$ref" in schema: - schema = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]).contents + resolved = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]) + schema = resolved.contents + _registry_update(root, uri, resolved.resolver) return schema + + def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: + """ + Generate a simple-minded hash to identify a registry. + + Notes + ----- + Why the strange hash? + - **All** generated schemas hit the ``"$ref"`` branch. + - ``api.Then`` hits the len(...) 1 branch w/ ``{"type": "object"}``. 
+ - Final branch is only hit by mock schemas in: + - `tests/utils/test_core.py::test_infer_encoding_types` + - `tests/utils/test_schemapi.py` + """ + if "$ref" in root: + k1 = root["$ref"] + elif len(root) == 1: + k1 = "".join(f"{s!s}" for s in chain(*root.items())) + else: + k1 = json.dumps(root, separators=(",", ":"), sort_keys=True) + return k1, dialect_id + + _REGISTRY_CACHE: dict[tuple[str, str], Registry[Any]] = {} + else: def _validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: - tp = _validator_for(_get_schema_dialect_uri(rootschema or schema)) + validator = _validator_for(_get_schema_dialect_uri(rootschema or schema)) resolver: Any = ( jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema ) - return tp(schema, resolver=resolver) + return validator(schema, resolver=resolver) def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] @@ -409,34 +472,6 @@ def _regroup( yield grouped_it -def _iter_validator_errors( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, -) -> _ErrsLazy: - """ - Uses the relevant ``jsonschema`` validator to validate ``spec`` against ``schema`` using `` rootschema`` to resolve references. - - ``schema`` and ``rootschema`` are not validated but instead considered as valid. - - We don't use ``jsonschema.validate`` as this would validate the ``schema`` itself. - Instead, we pass the ``schema`` directly to the validator class. - - This is done for two reasons: - - 1. The schema comes from Vega-Lite and is not based on the user - input, therefore there is no need to validate it in the first place. - 2. The "uri-reference" format checker fails for some of the - references as URIs in "$ref" are not encoded, e.g.: - - '#/definitions/ValueDefWithCondition' - - would be a valid $ref in a Vega-Lite schema but it is not a valid - URI reference due to the characters such as '<'. 
- """ - return _validator(schema, rootschema).iter_errors(spec) - - def _group_tree_leaves(errors: _Errs, /) -> _IntoLazyGroup: """ Combines 3 previously distinct steps: From c594e55e76613d0363e3b419a00cb2f920547755 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:28:24 +0100 Subject: [PATCH 39/92] ci: Add patterns for `pyright` Fixes: ``` 2024-08-23 16:23:02.509 [info] [Info - 4:23:02 PM] (15104) Loading pyproject.toml file at c:\Users\danie\Documents\GitHub\altair\pyproject.toml 2024-08-23 16:23:02.511 [info] [Info - 4:23:02 PM] (15104) No include entries specified; assuming c:\Users\danie\Documents\GitHub\altair 2024-08-23 16:23:04.953 [info] [Info - 4:23:04 PM] (15104) Found 8234 source files 2024-08-23 16:23:50.675 [info] [Warn - 4:23:50 PM] (15104) Workspace indexing has hit its upper limit: 5000 files ``` --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 47abd59f1..0fd722574 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -443,3 +443,10 @@ extraPaths=["./tools"] pythonPlatform="All" pythonVersion="3.8" reportUnusedExpression="none" +include=[ + "./altair/**/*.py", + ".doc/*.py", + "./sphinxext/**/*.py", + "./tests/**/*.py", + "./tools/**/*.py", +] From 5334a2c23a3b402e3561a3ec218f60ad5204e40c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 24 Aug 2024 15:43:43 +0100 Subject: [PATCH 40/92] feat: Adds `schemapi.__all__` Related https://github.com/vega/altair/pull/3556 --- tests/utils/test_schemapi.py | 23 +++++++++-------------- tests/vegalite/v5/test_api.py | 7 ++++--- tools/schemapi/schemapi.py | 11 +++++++++++ 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index 10ea2ecb2..1a20b9371 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -21,14 +21,8 @@ import 
altair as alt from altair import load_schema -from altair.utils.schemapi import ( - _DEFAULT_DIALECT_URI, - SchemaBase, - SchemaValidationError, - Undefined, - UndefinedType, - _FromDict, -) +from altair.utils import schemapi +from altair.utils.schemapi import SchemaBase, Undefined, UndefinedType from altair.vegalite.v5.schema.channels import X from altair.vegalite.v5.schema.core import FieldOneOfPredicate, Legend from vega_datasets import data @@ -44,7 +38,7 @@ def test_actual_json_schema_draft_is_same_as_hardcoded_default(): # See comments next to definition of `_DEFAULT_DIALECT_URI` # for details why we need this test - assert _DEFAULT_DIALECT_URI == _JSON_SCHEMA_DRAFT_URL, ( + assert schemapi._DEFAULT_DIALECT_URI == _JSON_SCHEMA_DRAFT_URL, ( "The default json schema URL, which is hardcoded," + " is not the same as the one used in the Vega-Lite schema." + " You need to update the default value." @@ -392,10 +386,11 @@ class BadSchema(SchemaBase): @pytest.mark.parametrize("use_json", [True, False]) def test_hash_schema(use_json): classes = _TestSchema._default_wrapper_classes() + FromDict = schemapi._FromDict for cls in classes: - hsh1 = _FromDict.hash_schema(cls._schema, use_json=use_json) - hsh2 = _FromDict.hash_schema(cls._schema, use_json=use_json) + hsh1 = FromDict.hash_schema(cls._schema, use_json=use_json) + hsh2 = FromDict.hash_schema(cls._schema, use_json=use_json) assert hsh1 == hsh2 assert hash(hsh1) == hash(hsh2) @@ -407,7 +402,7 @@ def test_schema_validation_error(): except jsonschema.ValidationError as err: the_err = err - assert isinstance(the_err, SchemaValidationError) + assert isinstance(the_err, schemapi.SchemaValidationError) message = str(the_err) assert the_err.message in message @@ -871,7 +866,7 @@ def test_chart_validation_errors(chart_func, expected_error_message): warnings.filterwarnings("ignore", category=UserWarning) chart = chart_func() expected_error_message = inspect.cleandoc(expected_error_message) - with 
pytest.raises(SchemaValidationError, match=expected_error_message): + with pytest.raises(schemapi.SchemaValidationError, match=expected_error_message): chart.to_dict() @@ -910,7 +905,7 @@ def _iter_charts(*, times: int) -> Iterator[ChartType]: yield from chain.from_iterable(repeat(charts, times=times)) for chart in _iter_charts(times=_REPEAT_TIMES): - with pytest.raises(SchemaValidationError): + with pytest.raises(schemapi.SchemaValidationError): chart.to_dict(validate=True) diff --git a/tests/vegalite/v5/test_api.py b/tests/vegalite/v5/test_api.py index 29d68d1ea..f5a150556 100644 --- a/tests/vegalite/v5/test_api.py +++ b/tests/vegalite/v5/test_api.py @@ -22,6 +22,7 @@ from packaging.version import Version import altair as alt +from altair.utils import schemapi from altair.utils.schemapi import Optional, Undefined try: @@ -527,8 +528,6 @@ def test_when_labels_position_based_on_condition() -> None: import numpy as np import pandas as pd - from altair.utils.schemapi import SchemaValidationError - rand = np.random.RandomState(42) df = pd.DataFrame({"xval": range(100), "yval": rand.randn(100).cumsum()}) @@ -569,7 +568,9 @@ def test_when_labels_position_based_on_condition() -> None: fail_condition = alt.condition( param_width < 200, alt.value("red"), alt.value("black") ) - with pytest.raises(SchemaValidationError, match="invalid value for `expr`"): + with pytest.raises( + schemapi.SchemaValidationError, match="invalid value for `expr`" + ): alt.param(expr=fail_condition) # type: ignore diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 20f891cbf..dbe2d24cb 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -73,6 +73,17 @@ ] """Non-exhaustive listing of possible literals in ``ValidationError.validator``""" +__all__ = [ + "Optional", # altair.utils + "SchemaBase", # altair.vegalite.v5.schema.core + "Undefined", # altair.utils + "UndefinedType", # altair.vegalite.v5.schema.core -> (side-effect relied on to propagate to 
alt.__init__) + "_resolve_references", # tools.schemapi.utils -> tools.generate_schema_wrapper + "_subclasses", # altair.vegalite.v5.schema.core + "is_undefined", # altair.typing + "validate_jsonschema", # altair.utils.display + "with_property_setters", # altair.vegalite.v5.schema.channels +] _VEGA_LITE_ROOT_URI: Final = "urn:vega-lite-schema" """ From 2751bb2399ec9bf2627b189dbfa53d712069819c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 11:47:00 +0100 Subject: [PATCH 41/92] fix: Use uniform docs in version-gated functions Both versions of these functions can be targeted by an IDE. But hovering over the name (anywhere but the actual definition) displays *only* the final docstring on hover. This duplicates the docs, and adds a **NOTE** comment to be super clear when viewing the code itself. --- tools/schemapi/schemapi.py | 41 ++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index dbe2d24cb..7515eac93 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -317,9 +317,12 @@ def _validator( rootschema Context to evaluate within. + We have **both** a current & a backwards-compatible version of this function. + .. _Validator: https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol """ + # NOTE: This is the current version uri = _get_schema_dialect_uri(rootschema or schema) validator = _validator_for(uri) registry = _registry(rootschema or schema, uri) @@ -359,7 +362,17 @@ def _registry_update( def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: - """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" + """ + Resolve schema references until there is no ``"$ref"`` anymore in the top-level ``dict``. 
+ + ``jsonschema`` deprecated ``RefResolver`` in favor of `referencing`_. + + We have **both** a current & a backwards-compatible version of this function. + + .. _referencing: + https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + """ + # NOTE: This is the current version root = rootschema or schema if ("$ref" not in root) or ("$ref" not in schema): return schema @@ -400,6 +413,22 @@ def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: def _validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: + """ + Constructs a `Validator`_ for future validation. + + We have **both** a current & a backwards-compatible version of this function. + + Parameters + ---------- + schema + Schema that a spec will be validated against. + rootschema + Context to evaluate within. + + .. _Validator: + https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol + """ + # NOTE: This is the backwards-compatible version validator = _validator_for(_get_schema_dialect_uri(rootschema or schema)) resolver: Any = ( jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema @@ -410,12 +439,16 @@ def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: """ - Resolve schema references until there is no $ref anymore in the top-level of the dictionary. + Resolve schema references until there is no ``"$ref"`` anymore in the top-level ``dict``. - ``jsonschema`` deprecated ``RefResolver`` in favor of ``referencing``. + ``jsonschema`` deprecated ``RefResolver`` in favor of `referencing`_. - See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + We have **both** a current & a backwards-compatible version of this function. + + .. 
_referencing: + https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 """ + # NOTE: This is the backwards-compatible version resolver = jsonschema.RefResolver.from_schema(rootschema or schema) while "$ref" in schema: with resolver.resolving(schema["$ref"]) as resolved: From f235284b37611b1b82c8f02b8b1eb982f6dbe4d7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 11:51:32 +0100 Subject: [PATCH 42/92] refactor: Update imports --- tools/schemapi/schemapi.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 7515eac93..4d0282243 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -6,7 +6,7 @@ import operator import sys import textwrap -from collections import defaultdict +from collections import defaultdict, deque from functools import lru_cache, partial from importlib.metadata import version as importlib_version from itertools import chain, groupby, islice, zip_longest @@ -14,13 +14,10 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Dict, - Final, Iterable, - Iterator, - KeysView, List, + Mapping, Sequence, TypeVar, Union, @@ -35,7 +32,7 @@ from packaging.version import Version if TYPE_CHECKING: - from typing import ClassVar, Literal, Mapping + from typing import Callable, ClassVar, Final, Iterator, KeysView, Literal from jsonschema.protocols import Validator, _JsonParameter From bd1d5800a969fba81d57098aef1fe10d02c48472 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 12:52:37 +0100 Subject: [PATCH 43/92] chore: Remove stale `SchemaBase.from_json` comment --- tools/schemapi/schemapi.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 4d0282243..639520cd4 100644 --- a/tools/schemapi/schemapi.py +++ 
b/tools/schemapi/schemapi.py @@ -1293,12 +1293,7 @@ def from_dict( @classmethod def from_json( - cls, - json_string: str, - validate: bool = True, - **kwargs: Any, - # Type hints for this method would get rather complicated - # if we want to provide a more specific return type + cls, json_string: str, validate: bool = True, **kwargs: Any ) -> ChartType: """ Instantiate the object from a valid JSON string. From 8ca426675379f4e65d025075c81bc099c6cdadb3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 13:38:10 +0100 Subject: [PATCH 44/92] perf: Refactor `SchemaBase.from_dict` and co There's quite a lot in here, so I've left my notes in `_subclasses` temporarily. - Removed unused `use_json=False` branch - Evaluate the hash table **once** and not every time `SchemaBase.from_dict` is called --- tests/utils/test_schemapi.py | 32 +++++--- tools/schemapi/schemapi.py | 149 +++++++++++++++++++++++------------ 2 files changed, 121 insertions(+), 60 deletions(-) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index 1a20b9371..f87d8fc42 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -10,7 +10,7 @@ import warnings from collections import deque from functools import partial -from typing import TYPE_CHECKING, Any, Callable, Iterable, Sequence +from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Sequence import jsonschema import jsonschema.exceptions @@ -48,7 +48,23 @@ def test_actual_json_schema_draft_is_same_as_hardcoded_default(): class _TestSchema(SchemaBase): @classmethod def _default_wrapper_classes(cls): - return _TestSchema.__subclasses__() + return schemapi._subclasses(_TestSchema) + + @classmethod + def from_dict( + cls: type[schemapi.TSchemaBase], dct: dict[str, Any], validate: bool = True + ) -> schemapi.TSchemaBase: + """ + Overrides ``SchemaBase``, which uses a cached ``FromDict.hash_tps``. 
+ + The cached version is based on an iterator over: + + schemapi._subclasses(VegaLiteSchema) + """ + if validate: + cls.validate(dct) + converter = schemapi._FromDict(cls._default_wrapper_classes()) + return converter.from_dict(dct, cls) class MySchema(_TestSchema): @@ -383,14 +399,10 @@ class BadSchema(SchemaBase): assert str(err.value).startswith("Cannot instantiate object") -@pytest.mark.parametrize("use_json", [True, False]) -def test_hash_schema(use_json): - classes = _TestSchema._default_wrapper_classes() - FromDict = schemapi._FromDict - - for cls in classes: - hsh1 = FromDict.hash_schema(cls._schema, use_json=use_json) - hsh2 = FromDict.hash_schema(cls._schema, use_json=use_json) +def test_hash_schema(): + for cls in _TestSchema._default_wrapper_classes(): + hsh1 = schemapi._hash_schema(cls._schema) + hsh2 = schemapi._hash_schema(cls._schema) assert hsh1 == hsh2 assert hash(hsh1) == hash(hsh2) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 639520cd4..b3fac3318 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -646,17 +646,6 @@ def _prune_subset_enum(iterable: _Errs, /) -> _ErrsLazy: } -def _subclasses(cls: type[Any]) -> Iterator[type[Any]]: - """Breadth-first sequence of all classes which inherit from cls.""" - seen = set() - current: set[type[Any]] = {cls} - while current: - seen |= current - current = set(chain.from_iterable(cls.__subclasses__() for cls in current)) - for cls in current - seen: - yield cls - - def _from_array_like(obj: Iterable[Any], /) -> list[Any]: try: ser = nw.from_native(obj, strict=True, series_only=True) @@ -1288,7 +1277,13 @@ def from_dict( """ if validate: cls.validate(dct) - converter = _FromDict(cls._default_wrapper_classes()) + # NOTE: the breadth-first search occurs only once now + # `_FromDict` is purely ClassVar/classmethods + converter: type[_FromDict] | _FromDict = ( + _FromDict + if _FromDict.hash_tps + else _FromDict(cls._default_wrapper_classes()) + ) return 
converter.from_dict(dct, cls) @classmethod @@ -1389,6 +1384,9 @@ def _passthrough(*args: Any, **kwds: Any) -> Any | dict[str, Any]: def _freeze(val): + # NOTE: No longer referenced + # - Previously only called during tests + # - Not during any library code if isinstance(val, dict): return frozenset((k, _freeze(v)) for k, v in val.items()) elif isinstance(val, set): @@ -1399,6 +1397,64 @@ def _freeze(val): return val +def _hash_schema( + schema: _JsonParameter, + /, + *, + exclude: Iterable[str] = frozenset( + ("definitions", "title", "description", "$schema", "id") + ), +) -> int: + """ + Return the hash value for a ``schema``. + + Parameters + ---------- + schema + ``SchemaBase._schema``. + exclude + ``schema`` keys which are not considered when identifying equivalence. + """ + if isinstance(schema, Mapping): + schema = {k: v for k, v in schema.items() if k not in exclude} + return hash(json.dumps(schema, sort_keys=True)) + + +def _subclasses(cls: type[TSchemaBase]) -> Iterator[type[TSchemaBase]]: + """ + Breadth-first sequence of all classes which inherit from ``cls``. 
+ + Notes + ----- + - `__subclasses__()` alone isn't helpful, as that is only immediate subclasses + - Deterministic + - Used for `SchemaBase` & `VegaLiteSchema` + - In practice, it provides an iterator over all classes in the schema below `VegaLiteSchema` + - The first one is `Root` + - The order itself, I don't think is important + - But probably important that it doesn't change + - Thinking they used an iterator so that the subclasses are evaluated after they have all been defined + + - `Chart` seems to try to avoid calling this + - Using `TopLevelMixin.__subclasses__()` first if possible + - It is always called during `Chart.encode()` + - Chart.encode() + - altair.utils.core.infer_encoding_types + - _ChannelCache.infer_encoding_types + - _ChannelCache._wrap_in_channel + - SchemaBase.from_dict (recursive, hot loop, validate =False, within a try/except) + - _FromDict(cls._default_wrapper_classes()) + - schemapi._subclasses(schema.core.VegaLiteSchema) + """ + seen = set() + current: set[type[TSchemaBase]] = {cls} + while current: + seen |= current + current = set(chain.from_iterable(cls.__subclasses__() for cls in current)) + for cls in current - seen: + yield cls + + class _FromDict: """ Class used to construct SchemaBase class hierarchies from a dict. @@ -1408,40 +1464,31 @@ class _FromDict: specified in the ``wrapper_classes`` positional-only argument to the constructor. """ - _hash_exclude_keys = ("definitions", "title", "description", "$schema", "id") + hash_tps: ClassVar[defaultdict[int, deque[type[SchemaBase]]]] = defaultdict(deque) + """ + Maps unique schemas to corresponding types. 
- def __init__(self, wrapper_classes: Iterable[type[SchemaBase]], /) -> None: - # Create a mapping of a schema hash to a list of matching classes - # This lets us quickly determine the correct class to construct - self.class_dict: dict[int, list[type[SchemaBase]]] = defaultdict(list) - for tp in wrapper_classes: - if tp._schema is not None: - self.class_dict[self.hash_schema(tp._schema)].append(tp) + The logic is that after removing a subset of keys, some schemas are identical. - @classmethod - def hash_schema(cls, schema: dict[str, Any], use_json: bool = True) -> int: - """ - Compute a python hash for a nested dictionary which properly handles dicts, lists, sets, and tuples. + If there are multiple matches, we use the first one in the ``deque``. - At the top level, the function excludes from the hashed schema all keys - listed in `exclude_keys`. + ``_subclasses`` yields the results of a `breadth-first search`_, + so the first matching class is the most general match. - This implements two methods: one based on conversion to JSON, and one based - on recursive conversions of unhashable to hashable types; the former seems - to be slightly faster in several benchmarks. - """ - if cls._hash_exclude_keys and isinstance(schema, dict): - schema = { - key: val - for key, val in schema.items() - if key not in cls._hash_exclude_keys - } - s: Any = json.dumps(schema, sort_keys=True) if use_json else _freeze(schema) - return hash(s) + .. _breadth-first search: + https://en.wikipedia.org/wiki/Breadth-first_search + """ + + def __init__(self, wrapper_classes: Iterator[type[SchemaBase]], /) -> None: + cls = type(self) + for tp in wrapper_classes: + if tp._schema is not None: + cls.hash_tps[_hash_schema(tp._schema)].append(tp) @overload + @classmethod def from_dict( - self, + cls, dct: TSchemaBase, tp: None = ..., schema: None = ..., @@ -1449,8 +1496,9 @@ def from_dict( default_class: Any = ..., ) -> TSchemaBase: ... 
@overload + @classmethod def from_dict( - self, + cls, dct: dict[str, Any] | list[dict[str, Any]], tp: Any = ..., schema: Any = ..., @@ -1458,8 +1506,9 @@ def from_dict( default_class: type[TSchemaBase] = ..., # pyright: ignore[reportInvalidTypeVarUse] ) -> TSchemaBase: ... @overload + @classmethod def from_dict( - self, + cls, dct: dict[str, Any], tp: None = ..., schema: dict[str, Any] = ..., @@ -1467,8 +1516,9 @@ def from_dict( default_class: Any = ..., ) -> SchemaBase: ... @overload + @classmethod def from_dict( - self, + cls, dct: dict[str, Any], tp: type[TSchemaBase], schema: None = ..., @@ -1476,16 +1526,18 @@ def from_dict( default_class: Any = ..., ) -> TSchemaBase: ... @overload + @classmethod def from_dict( - self, + cls, dct: dict[str, Any] | list[dict[str, Any]], tp: type[TSchemaBase], schema: dict[str, Any], rootschema: dict[str, Any] | None = ..., default_class: Any = ..., ) -> Never: ... + @classmethod def from_dict( - self, + cls, dct: dict[str, Any] | list[dict[str, Any]] | TSchemaBase, tp: type[TSchemaBase] | None = None, schema: dict[str, Any] | None = None, @@ -1502,18 +1554,15 @@ def from_dict( root_schema: dict[str, Any] = rootschema or tp._rootschema or current_schema target_tp = tp elif schema is not None: - # If there are multiple matches, we use the first one in the dict. - # Our class dict is constructed breadth-first from top to bottom, - # so the first class that matches is the most general match. current_schema = schema root_schema = rootschema or current_schema - matches = self.class_dict[self.hash_schema(current_schema)] - target_tp = matches[0] if matches else default_class + matches = cls.hash_tps[_hash_schema(current_schema)] + target_tp = next(iter(matches), default_class) else: msg = "Must provide either `tp` or `schema`, but not both." raise ValueError(msg) - from_dict = partial(self.from_dict, rootschema=root_schema) + from_dict = partial(cls.from_dict, rootschema=root_schema) # Can also return a list? 
        resolved = _resolve_references(current_schema, root_schema)
        if "anyOf" in resolved or "oneOf" in resolved:

From 933c045f54a9987ca6a6c43f22e6a1e33fe842d9 Mon Sep 17 00:00:00 2001
From: dangotbanned <125183946+dangotbanned@users.noreply.github.com>
Date: Sun, 25 Aug 2024 13:51:32 +0100
Subject: [PATCH 45/92] perf: Avoid expensive exceptions in
 `_FromDict.from_dict`

Try running `hatch test --all` on main vs this.
Locally, each version gets a 1.32-1.53x speedup.

Somehow this also includes `3.11`, despite that being the version "zero-cost" exceptions were introduced.
---
 tools/schemapi/schemapi.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py
index b3fac3318..fd1164fe3 100644
--- a/tools/schemapi/schemapi.py
+++ b/tools/schemapi/schemapi.py
@@ -1568,11 +1568,12 @@ def from_dict(
         if "anyOf" in resolved or "oneOf" in resolved:
             schemas = resolved.get("anyOf", []) + resolved.get("oneOf", [])
             for possible in schemas:
-                try:
-                    validate_jsonschema_fail_fast(dct, possible, rootschema=root_schema)
-                except ValidationError:
-                    continue
-                else:
+                # NOTE: Instead of raise/except/continue
+                # Pre-"zero-cost" exceptions, this has a huge performance gain.
+                # https://docs.python.org/3/whatsnew/3.11.html#misc
+                # https://github.com/python/cpython/blob/9b3749849eda4012261a112b22eb07f26fd345a9/InternalDocs/exception_handling.md
+                it_errs = _validator(possible, root_schema).iter_errors(dct)
+                if next(it_errs, None) is None:
                     return from_dict(dct, schema=possible, default_class=target_tp)

     if _is_dict(dct):

From 1e5993be41feb9f6620262d7a945ba8faaef5c75 Mon Sep 17 00:00:00 2001
From: dangotbanned <125183946+dangotbanned@users.noreply.github.com>
Date: Sun, 25 Aug 2024 13:53:46 +0100
Subject: [PATCH 46/92] chore: Add todo for `__init_subclasses__`

Thinking that moving the checks to the definition of the class (rather than the instance) could reduce a lot of calls.
--- tools/schemapi/schemapi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index fd1164fe3..29fe8be64 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -999,6 +999,8 @@ class SchemaBase: the _rootschema class attribute) which is used for validation. """ + # TODO: Implement `ClassVar` validation using https://peps.python.org/pep-0487/ + _schema: ClassVar[dict[str, Any] | Any] = None _rootschema: ClassVar[dict[str, Any] | Any] = None _class_is_valid_at_instantiation: ClassVar[bool] = True From 67b7ae431cd1b5a0b4093242a244fb010b45889d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 14:24:16 +0100 Subject: [PATCH 47/92] test: Update & add snapshots for `test_chart_validation_benchmark` --- tests/utils/test_schemapi.py | 69 +++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index f87d8fc42..4ba3ea9ca 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -884,16 +884,19 @@ def test_chart_validation_errors(chart_func, expected_error_message): _SKIP_SLOW_BENCHMARKS: bool = True _REPEAT_TIMES = 1000 -# to_dict optimize had no observable benefit +@pytest.mark.parametrize("to_or_from", ["to_dict-validate", "to_dict", "from_dict"]) +@pytest.mark.filterwarnings("ignore:.*:UserWarning") @pytest.mark.skipif( _SKIP_SLOW_BENCHMARKS, reason="Should only be run in isolation to test single threaded performance.", ) -def test_chart_validation_benchmark() -> None: +def test_chart_validation_benchmark( + to_or_from: Literal["to_dict-validate", "to_dict", "from_dict"], +) -> None: """ - Intended to isolate the `to_dict` call. + Intended to isolate `Chart.(to|from)_dict.` calls. 
Repeated ``_REPEAT_TIMES`` times, non-parametric: - in an attempt to limit the potential overhead of ``pytest`` @@ -901,24 +904,64 @@ def test_chart_validation_benchmark() -> None: Results ------- - 8/22/2024, 10:06:32 - 1000x in 108.46s (0:01:48) + ``` + _REPEAT_TIMES = 1000 + pytest -k test_chart_validation_benchmark --numprocesses=3 --durations=3 tests + + # Pre-`SchemaBase.from_dict` refactor (3.12.3) + 108.16s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] + 84.62s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] + 66.71s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] + + # Post-`SchemaBase.from_dict` refactor (3.12.3) + 107.84s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] + 50.43s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] + 67.07s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] + ``` """ + from itertools import chain, repeat + if TYPE_CHECKING: from typing import Iterator from altair.typing import ChartType - def _iter_charts(*, times: int) -> Iterator[ChartType]: - from itertools import chain, repeat + def _iter_charts() -> Iterator[ChartType]: + """ + Ensures only len(chart_funcs_error_message) actual charts are constructed. + + The `to_dict` calls are what gets multiplied + """ + charts: list[ChartType] = [fn() for fn, _ in chart_funcs_error_message] + yield from chain.from_iterable(repeat(charts, times=_REPEAT_TIMES)) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - charts: list[ChartType] = [fn() for fn, _ in chart_funcs_error_message] - yield from chain.from_iterable(repeat(charts, times=times)) + def _iter_chart_factory() -> Iterator[ChartType]: + """ + Validation not the bottleneck, but encode is. 
- for chart in _iter_charts(times=_REPEAT_TIMES): - with pytest.raises(schemapi.SchemaValidationError): - chart.to_dict(validate=True) + Ensures at least `times` * len(chart_funcs_error_message) .encode calls are made. + """ + chart_funcs: list[Callable[[], ChartType]] = [ + fn for fn, _ in chart_funcs_error_message + ] + for fn in chain.from_iterable(repeat(chart_funcs, times=_REPEAT_TIMES)): + yield fn() + + def _to_dict(validate: bool) -> None: + if validate: + for chart in _iter_charts(): + with pytest.raises(schemapi.SchemaValidationError): + chart.to_dict(validate=validate) + else: + for chart in _iter_charts(): + chart.to_dict(validate=validate) + + if to_or_from == "to_dict": + _to_dict(validate=False) + elif to_or_from == "to_dict-validate": + _to_dict(validate=True) + else: + assert list(_iter_chart_factory()) def test_multiple_field_strings_in_condition(): From 8725a6cee48096b801981c49b054fe634a5df276 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 14:25:07 +0100 Subject: [PATCH 48/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 230 +++++++++++++++++++++++++++------------ 1 file changed, 159 insertions(+), 71 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index a97dbc789..5188863ee 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -8,7 +8,7 @@ import operator import sys import textwrap -from collections import defaultdict +from collections import defaultdict, deque from functools import lru_cache, partial from importlib.metadata import version as importlib_version from itertools import chain, groupby, islice, zip_longest @@ -16,13 +16,10 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Dict, - Final, Iterable, - Iterator, - KeysView, List, + Mapping, Sequence, TypeVar, Union, @@ -37,7 +34,7 @@ from packaging.version import Version if TYPE_CHECKING: - from typing import ClassVar, Literal, Mapping + from 
typing import Callable, ClassVar, Final, Iterator, KeysView, Literal from jsonschema.protocols import Validator, _JsonParameter @@ -75,6 +72,17 @@ ] """Non-exhaustive listing of possible literals in ``ValidationError.validator``""" +__all__ = [ + "Optional", # altair.utils + "SchemaBase", # altair.vegalite.v5.schema.core + "Undefined", # altair.utils + "UndefinedType", # altair.vegalite.v5.schema.core -> (side-effect relied on to propagate to alt.__init__) + "_resolve_references", # tools.schemapi.utils -> tools.generate_schema_wrapper + "_subclasses", # altair.vegalite.v5.schema.core + "is_undefined", # altair.typing + "validate_jsonschema", # altair.utils.display + "with_property_setters", # altair.vegalite.v5.schema.channels +] _VEGA_LITE_ROOT_URI: Final = "urn:vega-lite-schema" """ @@ -308,9 +316,12 @@ def _validator( rootschema Context to evaluate within. + We have **both** a current & a backwards-compatible version of this function. + .. _Validator: https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol """ + # NOTE: This is the current version uri = _get_schema_dialect_uri(rootschema or schema) validator = _validator_for(uri) registry = _registry(rootschema or schema, uri) @@ -350,7 +361,17 @@ def _registry_update( def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: - """Resolve schema references until there is no $ref anymore in the top-level of the dictionary.""" + """ + Resolve schema references until there is no ``"$ref"`` anymore in the top-level ``dict``. + + ``jsonschema`` deprecated ``RefResolver`` in favor of `referencing`_. + + We have **both** a current & a backwards-compatible version of this function. + + .. 
_referencing: + https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + """ + # NOTE: This is the current version root = rootschema or schema if ("$ref" not in root) or ("$ref" not in schema): return schema @@ -391,6 +412,22 @@ def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: def _validator( schema: dict[str, Any], rootschema: dict[str, Any] | None = None ) -> Validator: + """ + Constructs a `Validator`_ for future validation. + + We have **both** a current & a backwards-compatible version of this function. + + Parameters + ---------- + schema + Schema that a spec will be validated against. + rootschema + Context to evaluate within. + + .. _Validator: + https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol + """ + # NOTE: This is the backwards-compatible version validator = _validator_for(_get_schema_dialect_uri(rootschema or schema)) resolver: Any = ( jsonschema.RefResolver.from_schema(rootschema) if rootschema else rootschema @@ -401,12 +438,16 @@ def _resolve_references( schema: dict[str, Any], rootschema: dict[str, Any] ) -> dict[str, Any]: """ - Resolve schema references until there is no $ref anymore in the top-level of the dictionary. + Resolve schema references until there is no ``"$ref"`` anymore in the top-level ``dict``. - ``jsonschema`` deprecated ``RefResolver`` in favor of ``referencing``. + ``jsonschema`` deprecated ``RefResolver`` in favor of `referencing`_. - See https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + We have **both** a current & a backwards-compatible version of this function. + + .. 
_referencing: + https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 """ + # NOTE: This is the backwards-compatible version resolver = jsonschema.RefResolver.from_schema(rootschema or schema) while "$ref" in schema: with resolver.resolving(schema["$ref"]) as resolved: @@ -607,17 +648,6 @@ def _prune_subset_enum(iterable: _Errs, /) -> _ErrsLazy: } -def _subclasses(cls: type[Any]) -> Iterator[type[Any]]: - """Breadth-first sequence of all classes which inherit from cls.""" - seen = set() - current: set[type[Any]] = {cls} - while current: - seen |= current - current = set(chain.from_iterable(cls.__subclasses__() for cls in current)) - for cls in current - seen: - yield cls - - def _from_array_like(obj: Iterable[Any], /) -> list[Any]: try: ser = nw.from_native(obj, strict=True, series_only=True) @@ -971,6 +1001,8 @@ class SchemaBase: the _rootschema class attribute) which is used for validation. """ + # TODO: Implement `ClassVar` validation using https://peps.python.org/pep-0487/ + _schema: ClassVar[dict[str, Any] | Any] = None _rootschema: ClassVar[dict[str, Any] | Any] = None _class_is_valid_at_instantiation: ClassVar[bool] = True @@ -1249,17 +1281,18 @@ def from_dict( """ if validate: cls.validate(dct) - converter = _FromDict(cls._default_wrapper_classes()) + # NOTE: the breadth-first search occurs only once now + # `_FromDict` is purely ClassVar/classmethods + converter: type[_FromDict] | _FromDict = ( + _FromDict + if _FromDict.hash_tps + else _FromDict(cls._default_wrapper_classes()) + ) return converter.from_dict(dct, cls) @classmethod def from_json( - cls, - json_string: str, - validate: bool = True, - **kwargs: Any, - # Type hints for this method would get rather complicated - # if we want to provide a more specific return type + cls, json_string: str, validate: bool = True, **kwargs: Any ) -> ChartType: """ Instantiate the object from a valid JSON string. 
@@ -1355,6 +1388,9 @@ def _passthrough(*args: Any, **kwds: Any) -> Any | dict[str, Any]: def _freeze(val): + # NOTE: No longer referenced + # - Previously only called during tests + # - Not during any library code if isinstance(val, dict): return frozenset((k, _freeze(v)) for k, v in val.items()) elif isinstance(val, set): @@ -1365,6 +1401,64 @@ def _freeze(val): return val +def _hash_schema( + schema: _JsonParameter, + /, + *, + exclude: Iterable[str] = frozenset( + ("definitions", "title", "description", "$schema", "id") + ), +) -> int: + """ + Return the hash value for a ``schema``. + + Parameters + ---------- + schema + ``SchemaBase._schema``. + exclude + ``schema`` keys which are not considered when identifying equivalence. + """ + if isinstance(schema, Mapping): + schema = {k: v for k, v in schema.items() if k not in exclude} + return hash(json.dumps(schema, sort_keys=True)) + + +def _subclasses(cls: type[TSchemaBase]) -> Iterator[type[TSchemaBase]]: + """ + Breadth-first sequence of all classes which inherit from ``cls``. 
+ + Notes + ----- + - `__subclasses__()` alone isn't helpful, as that is only immediate subclasses + - Deterministic + - Used for `SchemaBase` & `VegaLiteSchema` + - In practice, it provides an iterator over all classes in the schema below `VegaLiteSchema` + - The first one is `Root` + - The order itself, I don't think is important + - But probably important that it doesn't change + - Thinking they used an iterator so that the subclasses are evaluated after they have all been defined + + - `Chart` seems to try to avoid calling this + - Using `TopLevelMixin.__subclasses__()` first if possible + - It is always called during `Chart.encode()` + - Chart.encode() + - altair.utils.core.infer_encoding_types + - _ChannelCache.infer_encoding_types + - _ChannelCache._wrap_in_channel + - SchemaBase.from_dict (recursive, hot loop, validate =False, within a try/except) + - _FromDict(cls._default_wrapper_classes()) + - schemapi._subclasses(schema.core.VegaLiteSchema) + """ + seen = set() + current: set[type[TSchemaBase]] = {cls} + while current: + seen |= current + current = set(chain.from_iterable(cls.__subclasses__() for cls in current)) + for cls in current - seen: + yield cls + + class _FromDict: """ Class used to construct SchemaBase class hierarchies from a dict. @@ -1374,40 +1468,31 @@ class _FromDict: specified in the ``wrapper_classes`` positional-only argument to the constructor. """ - _hash_exclude_keys = ("definitions", "title", "description", "$schema", "id") + hash_tps: ClassVar[defaultdict[int, deque[type[SchemaBase]]]] = defaultdict(deque) + """ + Maps unique schemas to corresponding types. 
- def __init__(self, wrapper_classes: Iterable[type[SchemaBase]], /) -> None: - # Create a mapping of a schema hash to a list of matching classes - # This lets us quickly determine the correct class to construct - self.class_dict: dict[int, list[type[SchemaBase]]] = defaultdict(list) - for tp in wrapper_classes: - if tp._schema is not None: - self.class_dict[self.hash_schema(tp._schema)].append(tp) + The logic is that after removing a subset of keys, some schemas are identical. - @classmethod - def hash_schema(cls, schema: dict[str, Any], use_json: bool = True) -> int: - """ - Compute a python hash for a nested dictionary which properly handles dicts, lists, sets, and tuples. + If there are multiple matches, we use the first one in the ``deque``. - At the top level, the function excludes from the hashed schema all keys - listed in `exclude_keys`. + ``_subclasses`` yields the results of a `breadth-first search`_, + so the first matching class is the most general match. - This implements two methods: one based on conversion to JSON, and one based - on recursive conversions of unhashable to hashable types; the former seems - to be slightly faster in several benchmarks. - """ - if cls._hash_exclude_keys and isinstance(schema, dict): - schema = { - key: val - for key, val in schema.items() - if key not in cls._hash_exclude_keys - } - s: Any = json.dumps(schema, sort_keys=True) if use_json else _freeze(schema) - return hash(s) + .. _breadth-first search: + https://en.wikipedia.org/wiki/Breadth-first_search + """ + + def __init__(self, wrapper_classes: Iterator[type[SchemaBase]], /) -> None: + cls = type(self) + for tp in wrapper_classes: + if tp._schema is not None: + cls.hash_tps[_hash_schema(tp._schema)].append(tp) @overload + @classmethod def from_dict( - self, + cls, dct: TSchemaBase, tp: None = ..., schema: None = ..., @@ -1415,8 +1500,9 @@ def from_dict( default_class: Any = ..., ) -> TSchemaBase: ... 
@overload + @classmethod def from_dict( - self, + cls, dct: dict[str, Any] | list[dict[str, Any]], tp: Any = ..., schema: Any = ..., @@ -1424,8 +1510,9 @@ def from_dict( default_class: type[TSchemaBase] = ..., # pyright: ignore[reportInvalidTypeVarUse] ) -> TSchemaBase: ... @overload + @classmethod def from_dict( - self, + cls, dct: dict[str, Any], tp: None = ..., schema: dict[str, Any] = ..., @@ -1433,8 +1520,9 @@ def from_dict( default_class: Any = ..., ) -> SchemaBase: ... @overload + @classmethod def from_dict( - self, + cls, dct: dict[str, Any], tp: type[TSchemaBase], schema: None = ..., @@ -1442,16 +1530,18 @@ def from_dict( default_class: Any = ..., ) -> TSchemaBase: ... @overload + @classmethod def from_dict( - self, + cls, dct: dict[str, Any] | list[dict[str, Any]], tp: type[TSchemaBase], schema: dict[str, Any], rootschema: dict[str, Any] | None = ..., default_class: Any = ..., ) -> Never: ... + @classmethod def from_dict( - self, + cls, dct: dict[str, Any] | list[dict[str, Any]] | TSchemaBase, tp: type[TSchemaBase] | None = None, schema: dict[str, Any] | None = None, @@ -1468,28 +1558,26 @@ def from_dict( root_schema: dict[str, Any] = rootschema or tp._rootschema or current_schema target_tp = tp elif schema is not None: - # If there are multiple matches, we use the first one in the dict. - # Our class dict is constructed breadth-first from top to bottom, - # so the first class that matches is the most general match. current_schema = schema root_schema = rootschema or current_schema - matches = self.class_dict[self.hash_schema(current_schema)] - target_tp = matches[0] if matches else default_class + matches = cls.hash_tps[_hash_schema(current_schema)] + target_tp = next(iter(matches), default_class) else: msg = "Must provide either `tp` or `schema`, but not both." raise ValueError(msg) - from_dict = partial(self.from_dict, rootschema=root_schema) + from_dict = partial(cls.from_dict, rootschema=root_schema) # Can also return a list? 
resolved = _resolve_references(current_schema, root_schema) if "anyOf" in resolved or "oneOf" in resolved: schemas = resolved.get("anyOf", []) + resolved.get("oneOf", []) for possible in schemas: - try: - validate_jsonschema_fail_fast(dct, possible, rootschema=root_schema) - except ValidationError: - continue - else: + # NOTE: Instead of raise/except/continue + # Pre-"zero-cost" exceptions, this has a huge performance gain. + # https://docs.python.org/3/whatsnew/3.11.html#misc + # https://github.com/python/cpython/blob/9b3749849eda4012261a112b22eb07f26fd345a9/InternalDocs/exception_handling.md + it_errs = _validator(possible, root_schema).iter_errors(dct) + if next(it_errs, None) is None: return from_dict(dct, schema=possible, default_class=target_tp) if _is_dict(dct): From 677017702c5548ffba2f0a0b6aee7bf2860f944e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 15:22:28 +0100 Subject: [PATCH 49/92] perf(ruff): Add & ignore some performance rules Ignoring these *first* for visibility, since I'll be making changes that otherwise have no context --- altair/utils/_transformed_data.py | 4 ++-- altair/utils/schemapi.py | 4 ++-- altair/vegalite/v5/api.py | 4 ++-- pyproject.toml | 10 ++++++++++ sphinxext/schematable.py | 2 +- tools/generate_schema_wrapper.py | 4 ++-- tools/schemapi/schemapi.py | 4 ++-- 7 files changed, 21 insertions(+), 11 deletions(-) diff --git a/altair/utils/_transformed_data.py b/altair/utils/_transformed_data.py index 3839a13d2..d3db8e62e 100644 --- a/altair/utils/_transformed_data.py +++ b/altair/utils/_transformed_data.py @@ -214,7 +214,7 @@ def name_views( chart_names: list[str] = [] for subchart in subcharts: for name in name_views(subchart, i=i + len(chart_names), exclude=exclude): - chart_names.append(name) + chart_names.append(name) # noqa: PERF402 return chart_names @@ -326,7 +326,7 @@ def get_datasets_for_scope(vega_spec: dict[str, Any], scope: Scope) -> list[str] # get 
datasets from group datasets = [] for dataset in group.get("data", []): - datasets.append(dataset["name"]) + datasets.append(dataset["name"]) # noqa: PERF401 # Add facet dataset facet_dataset = group.get("from", {}).get("facet", {}).get("name", None) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 5188863ee..666119927 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -723,7 +723,7 @@ def _get_message(self) -> str: error_messages = [] for group in group_1, list(group_2), next(it, None): if group is not None: - error_messages.append(self._get_message_for_errors_group(group)) + error_messages.append(self._get_message_for_errors_group(group)) # noqa: PERF401 message = "\n\n".join( self.indent_from_second_line(f"Error {error_id}: {m}") for error_id, m in enumerate(error_messages, start=1) @@ -850,7 +850,7 @@ def _get_default_error_message( if "enum" in errors_by_validator: for error in errors_by_validator["enum"]: - bullet_points.append(f"one of {error.validator_value}") + bullet_points.append(f"one of {error.validator_value}") # noqa: PERF401 if "type" in errors_by_validator: types = [f"'{err.validator_value}'" for err in errors_by_validator["type"]] diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index d352b060b..43c1a2ccf 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -482,7 +482,7 @@ def check_fields_and_encodings(parameter: Parameter, field_name: str) -> bool: try: if field_name in getattr(param.select, prop): return True - except (AttributeError, TypeError): + except (AttributeError, TypeError): # noqa: PERF203 pass return False @@ -4871,7 +4871,7 @@ def remove_prop(subchart: ChartType, prop: str) -> ChartType: val = c[prop] if val is not Undefined: values.append(val) - except KeyError: + except KeyError: # noqa: PERF203 pass if len(values) == 0: pass diff --git a/pyproject.toml b/pyproject.toml index 0fd722574..05aade59b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-257,8 +257,16 @@ extend-select=[ "PLR1736", # literal-membership "PLR6201", + # unnecessary-lambda + "PLW0108", # unspecified-encoding "PLW1514", + # exception handling # + # ------------------ # + # try-except-pass + "S110", + # try-except-continue + "S112", ] select = [ # flake8-bugbear @@ -324,6 +332,8 @@ select = [ "I001", # complex-structure "C901", + # Perflint + "PERF", ] ignore = [ # Whitespace before ':' diff --git a/sphinxext/schematable.py b/sphinxext/schematable.py index f27622fb8..e0705ee02 100644 --- a/sphinxext/schematable.py +++ b/sphinxext/schematable.py @@ -173,7 +173,7 @@ def select_items_from_schema( for prop in props: try: yield prop, properties[prop], prop in required - except KeyError as err: + except KeyError as err: # noqa: PERF203 msg = f"Can't find property: {prop}" raise Exception(msg) from err diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index a625394bd..ee13c99e7 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -394,7 +394,7 @@ def _add_shorthand_property_to_field_encodings(schema: dict) -> dict: encoding = SchemaInfo(schema["definitions"][encoding_def], rootschema=schema) - for _, propschema in encoding.properties.items(): + for _, propschema in encoding.properties.items(): # noqa: PERF102 def_dict = get_field_datum_value_defs(propschema, schema) field_ref = def_dict.get("field") @@ -566,7 +566,7 @@ def generate_vegalite_schema_wrapper(schema_file: Path) -> str: ] for name in toposort(graph): - contents.append(definitions[name].schema_class()) + contents.append(definitions[name].schema_class()) # noqa: PERF401 contents.append("") # end with newline return "\n".join(contents) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 29fe8be64..a5f661013 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -721,7 +721,7 @@ def _get_message(self) -> str: error_messages = [] for group in group_1, list(group_2), next(it, 
None): if group is not None: - error_messages.append(self._get_message_for_errors_group(group)) + error_messages.append(self._get_message_for_errors_group(group)) # noqa: PERF401 message = "\n\n".join( self.indent_from_second_line(f"Error {error_id}: {m}") for error_id, m in enumerate(error_messages, start=1) @@ -848,7 +848,7 @@ def _get_default_error_message( if "enum" in errors_by_validator: for error in errors_by_validator["enum"]: - bullet_points.append(f"one of {error.validator_value}") + bullet_points.append(f"one of {error.validator_value}") # noqa: PERF401 if "type" in errors_by_validator: types = [f"'{err.validator_value}'" for err in errors_by_validator["type"]] From 0ddf19eb23b18308cf7da6250f9b25a5c7311711 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:51:56 +0100 Subject: [PATCH 50/92] perf(ruff): Fix some `"PERF"` violations --- altair/utils/_transformed_data.py | 9 ++++---- altair/utils/schemapi.py | 17 ++++++++------- altair/vegalite/v5/api.py | 36 ++++++++++--------------------- tools/schemapi/schemapi.py | 17 ++++++++------- 4 files changed, 33 insertions(+), 46 deletions(-) diff --git a/altair/utils/_transformed_data.py b/altair/utils/_transformed_data.py index d3db8e62e..43d398575 100644 --- a/altair/utils/_transformed_data.py +++ b/altair/utils/_transformed_data.py @@ -213,8 +213,9 @@ def name_views( chart_names: list[str] = [] for subchart in subcharts: - for name in name_views(subchart, i=i + len(chart_names), exclude=exclude): - chart_names.append(name) # noqa: PERF402 + chart_names.extend( + name_views(subchart, i=i + len(chart_names), exclude=exclude) + ) return chart_names @@ -324,9 +325,7 @@ def get_datasets_for_scope(vega_spec: dict[str, Any], scope: Scope) -> list[str] group = get_group_mark_for_scope(vega_spec, scope) or {} # get datasets from group - datasets = [] - for dataset in group.get("data", []): - datasets.append(dataset["name"]) # noqa: PERF401 + 
datasets = [dataset["name"] for dataset in group.get("data", [])] # Add facet dataset facet_dataset = group.get("from", {}).get("facet", {}).get("name", None) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 666119927..87f0b6363 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -717,18 +717,19 @@ def indent_from_second_line(msg: str, /, indent: int = 4) -> str: ) def _get_message(self) -> str: - it = self._errors + it: _ErrsLazyGroup = self._errors group_1 = list(next(it)) if (group_2 := next(it, None)) is not None: - error_messages = [] - for group in group_1, list(group_2), next(it, None): - if group is not None: - error_messages.append(self._get_message_for_errors_group(group)) # noqa: PERF401 - message = "\n\n".join( + messages: Iterator[str] = ( + self._get_message_for_errors_group(g) + for g in (group_1, list(group_2), next(it, None)) + if g is not None + ) + msg = "\n\n".join( self.indent_from_second_line(f"Error {error_id}: {m}") - for error_id, m in enumerate(error_messages, start=1) + for error_id, m in enumerate(messages, start=1) ) - return f"Multiple errors were found.\n\n{message}" + return f"Multiple errors were found.\n\n{msg}" else: return self._get_message_for_errors_group(group_1) diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index 43c1a2ccf..b18216dbb 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -426,7 +426,7 @@ def __getattr__(self, field_name: str) -> GetAttrExpression | SelectionExpressio # fields or encodings list, then we want to return an expression. if check_fields_and_encodings(self, field_name): return SelectionExpression(_attrexpr) - return _expr_core.GetAttrExpression(self.name, field_name) + return _attrexpr # TODO: Are there any special cases to consider for __getitem__? # This was copied from v4. 
@@ -478,13 +478,10 @@ def check_fields_and_encodings(parameter: Parameter, field_name: str) -> bool: param = parameter.param if utils.is_undefined(param) or isinstance(param, core.VariableParameter): return False - for prop in ["fields", "encodings"]: - try: - if field_name in getattr(param.select, prop): - return True - except (AttributeError, TypeError): # noqa: PERF203 - pass - + select = param.select + for prop in "fields", "encodings": + if not utils.is_undefined(p := select._get(prop)) and field_name in p: + return True return False @@ -4841,17 +4838,13 @@ def _repeat_names( return params_named -def _remove_layer_props( # noqa: C901 +def _remove_layer_props( chart: LayerChart, subcharts: list[ChartType], layer_props: Iterable[str] ) -> tuple[dict[str, Any], list[ChartType]]: def remove_prop(subchart: ChartType, prop: str) -> ChartType: - # If subchart is a UnitSpec, then subchart["height"] raises a KeyError - try: - if subchart[prop] is not Undefined: - subchart = subchart.copy() - subchart[prop] = Undefined - except KeyError: - pass + if not utils.is_undefined(subchart._get(prop)): + subchart = subchart.copy() + subchart[prop] = Undefined return subchart output_dict: dict[str, Any] = {} @@ -4864,15 +4857,8 @@ def remove_prop(subchart: ChartType, prop: str) -> ChartType: if chart[prop] is Undefined: # Top level does not have this prop. # Check for consistent props within the subcharts. - values = [] - for c in subcharts: - # If c is a UnitSpec, then c["height"] raises a KeyError. 
- try: - val = c[prop] - if val is not Undefined: - values.append(val) - except KeyError: # noqa: PERF203 - pass + values = [v for c in subcharts if not utils.is_undefined(v := c._get(prop))] + if len(values) == 0: pass elif all(v == values[0] for v in values[1:]): diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index a5f661013..0ae05d4e0 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -715,18 +715,19 @@ def indent_from_second_line(msg: str, /, indent: int = 4) -> str: ) def _get_message(self) -> str: - it = self._errors + it: _ErrsLazyGroup = self._errors group_1 = list(next(it)) if (group_2 := next(it, None)) is not None: - error_messages = [] - for group in group_1, list(group_2), next(it, None): - if group is not None: - error_messages.append(self._get_message_for_errors_group(group)) # noqa: PERF401 - message = "\n\n".join( + messages: Iterator[str] = ( + self._get_message_for_errors_group(g) + for g in (group_1, list(group_2), next(it, None)) + if g is not None + ) + msg = "\n\n".join( self.indent_from_second_line(f"Error {error_id}: {m}") - for error_id, m in enumerate(error_messages, start=1) + for error_id, m in enumerate(messages, start=1) ) - return f"Multiple errors were found.\n\n{message}" + return f"Multiple errors were found.\n\n{msg}" else: return self._get_message_for_errors_group(group_1) From cace782a11c2fee7e4efa06d4e5a6a626a474dee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 20:50:06 +0100 Subject: [PATCH 51/92] perf: Remove unreachable `ValidationError` except This code path will not raise `jsonschema.ValidationError` --- altair/utils/core.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/altair/utils/core.py b/altair/utils/core.py index f5ef659b1..8df22b154 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -14,7 +14,6 @@ from operator import itemgetter from typing import 
TYPE_CHECKING, Any, Callable, Iterator, Literal, TypeVar, cast -import jsonschema import narwhals.stable.v1 as nw from narwhals.dependencies import get_polars, is_pandas_dataframe from narwhals.typing import IntoDataFrame @@ -855,13 +854,9 @@ def _wrap_in_channel(self, obj: Any, encoding: str, /): return [self._wrap_in_channel(el, encoding) for el in obj] if channel := self.name_to_channel.get(encoding): tp = channel["value" if "value" in obj else "field"] - try: - # Don't force validation here; some objects won't be valid until - # they're created in the context of a chart. - return tp.from_dict(obj, validate=False) - except jsonschema.ValidationError: - # our attempts at finding the correct class have failed - return obj + # Don't force validation here; some objects won't be valid until + # they're created in the context of a chart. + return tp.from_dict(obj, validate=False) else: warnings.warn(f"Unrecognized encoding channel {encoding!r}", stacklevel=1) return obj From ac9993907c78011881cff863dc73afa5e7d6b595 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 22:30:54 +0100 Subject: [PATCH 52/92] docs(perf): Note some areas that may impact performance --- altair/utils/_importers.py | 2 ++ altair/utils/core.py | 1 + altair/utils/schemapi.py | 1 + altair/vegalite/v5/api.py | 3 +++ tests/vegalite/v5/test_api.py | 5 ++++- tools/schemapi/schemapi.py | 1 + 6 files changed, 12 insertions(+), 1 deletion(-) diff --git a/altair/utils/_importers.py b/altair/utils/_importers.py index 14085ebcf..93e647f33 100644 --- a/altair/utils/_importers.py +++ b/altair/utils/_importers.py @@ -76,6 +76,7 @@ def vl_version_for_vl_convert() -> str: def import_pyarrow_interchange() -> ModuleType: min_version = "11.0.0" + # FIXME: Hot try/except try: version = importlib_version("pyarrow") @@ -102,6 +103,7 @@ def import_pyarrow_interchange() -> ModuleType: def pyarrow_available() -> bool: + # FIXME: Hot try/except try: 
import_pyarrow_interchange() return True diff --git a/altair/utils/core.py b/altair/utils/core.py index 8df22b154..a1d81c39e 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -830,6 +830,7 @@ def from_channels(cls, channels: ModuleType, /) -> _ChannelCache: @classmethod def from_cache(cls) -> _ChannelCache: global _CHANNEL_CACHE + # FIXME: Hot try/except try: cached = _CHANNEL_CACHE except NameError: diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 87f0b6363..d0006a3ac 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1073,6 +1073,7 @@ def __getattr__(self, attr): if attr in self._kwds: return self._kwds[attr] else: + # FIXME: Hot try/except try: _getattr = super().__getattr__ # pyright: ignore[reportAttributeAccessIssue] except AttributeError: diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index b18216dbb..695fcc642 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -1790,6 +1790,7 @@ def to_dict( # noqa: C901 copy = _top_schema_base(self).copy(deep=False) original_data = getattr(copy, "data", Undefined) if not utils.is_undefined(original_data): + # FIXME: Hot try/except try: data = _to_eager_narwhals_dataframe(original_data) except TypeError: @@ -3401,6 +3402,7 @@ def _repr_mimebundle_(self, *args, **kwds) -> MimeBundleType | None: # type:ign """Return a MIME bundle for display in Jupyter frontends.""" # Catch errors explicitly to get around issues in Jupyter frontend # see https://github.com/ipython/ipython/issues/11038 + # FIXME: Hot try/except try: dct = self.to_dict(context={"pre_transform": False}) except Exception: @@ -3713,6 +3715,7 @@ def from_dict( _tp: Any for tp in TopLevelMixin.__subclasses__(): _tp = super() if tp is Chart else tp + # FIXME: Hot try/except try: return _tp.from_dict(dct, validate=validate) except jsonschema.ValidationError: diff --git a/tests/vegalite/v5/test_api.py b/tests/vegalite/v5/test_api.py index f5a150556..8b71e6e01 100644 --- 
a/tests/vegalite/v5/test_api.py +++ b/tests/vegalite/v5/test_api.py @@ -1210,7 +1210,10 @@ def test_themes(): assert "config" not in chart.to_dict() -def test_chart_from_dict(): +# TODO: Investigate alternative to looped try/except/pass +# - AFAIK it would speed up `Chart.from_dict()` +# - but maybe not central enough to have general impact +def test_chart_from_dict() -> None: base = alt.Chart("data.csv").mark_point().encode(x="x:Q", y="y:Q") charts = [ diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 0ae05d4e0..21d49851c 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1071,6 +1071,7 @@ def __getattr__(self, attr): if attr in self._kwds: return self._kwds[attr] else: + # FIXME: Hot try/except try: _getattr = super().__getattr__ # pyright: ignore[reportAttributeAccessIssue] except AttributeError: From f776fcfaffb43aa95d6db7bbfd53a2e3211b5f80 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 25 Aug 2024 22:34:07 +0100 Subject: [PATCH 53/92] refactor: Remove now-unused `validate_jsonschema_fail_fast` --- altair/utils/schemapi.py | 16 ---------------- tools/schemapi/schemapi.py | 16 ---------------- 2 files changed, 32 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index d0006a3ac..a2406586f 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -193,22 +193,6 @@ def validate_jsonschema( raise NotImplementedError(msg) -def validate_jsonschema_fail_fast( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, -) -> None: - """ - Raise as quickly as possible. - - Use instead of ``validate_jsonschema`` when any information about the error(s) are not needed. - """ - if ( - err := next(_validator(schema, rootschema).iter_errors(spec), None) - ) is not None: - raise err - - def _get_schema_dialect_uri(schema: dict[str, Any]) -> str: """ Return value of `$schema`_. 
diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 21d49851c..b1c2583e3 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -191,22 +191,6 @@ def validate_jsonschema( raise NotImplementedError(msg) -def validate_jsonschema_fail_fast( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, -) -> None: - """ - Raise as quickly as possible. - - Use instead of ``validate_jsonschema`` when any information about the error(s) are not needed. - """ - if ( - err := next(_validator(schema, rootschema).iter_errors(spec), None) - ) is not None: - raise err - - def _get_schema_dialect_uri(schema: dict[str, Any]) -> str: """ Return value of `$schema`_. From 53663e811e3839d30d4bf9591300ff8775640473 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 26 Aug 2024 20:27:06 +0100 Subject: [PATCH 54/92] perf: Avoid an exception in `SchemaBase.__getattr__` Uses the default parameter of `getattr` instead --- altair/utils/schemapi.py | 9 ++------- tools/schemapi/schemapi.py | 9 ++------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 12c7b8009..4b975aa14 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1057,15 +1057,10 @@ def __getattr__(self, attr): # reminder: getattr is called after the normal lookups if attr == "_kwds": raise AttributeError() - if attr in self._kwds: + elif attr in self._kwds: return self._kwds[attr] else: - # FIXME: Hot try/except - try: - _getattr = super().__getattr__ # pyright: ignore[reportAttributeAccessIssue] - except AttributeError: - _getattr = super().__getattribute__ - return _getattr(attr) + return getattr(super(), "__getattr__", super().__getattribute__)(attr) def __setattr__(self, item, val) -> None: self._kwds[item] = val diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 3068d7558..04ba02a95 
100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1055,15 +1055,10 @@ def __getattr__(self, attr): # reminder: getattr is called after the normal lookups if attr == "_kwds": raise AttributeError() - if attr in self._kwds: + elif attr in self._kwds: return self._kwds[attr] else: - # FIXME: Hot try/except - try: - _getattr = super().__getattr__ # pyright: ignore[reportAttributeAccessIssue] - except AttributeError: - _getattr = super().__getattribute__ - return _getattr(attr) + return getattr(super(), "__getattr__", super().__getattribute__)(attr) def __setattr__(self, item, val) -> None: self._kwds[item] = val From bd31d7c574820ec3ee3bf72ec874c96545f415ba Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 26 Aug 2024 20:39:46 +0100 Subject: [PATCH 55/92] refactor: Ensure every `VegaLiteSchema` has a `._schema` Eventually, this will replace the need for checking `None` as frequently --- altair/utils/schemapi.py | 2 +- altair/vegalite/v5/schema/core.py | 3 +-- tools/generate_schema_wrapper.py | 17 ++++++++++++++--- tools/schemapi/schemapi.py | 2 +- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 4b975aa14..e7f16098f 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1310,7 +1310,7 @@ def validate( @classmethod def resolve_references(cls, schema: dict[str, Any] | None = None) -> dict[str, Any]: """Resolve references in the context of this object's schema or root schema.""" - rootschema = cls._rootschema or cls._schema or schema + rootschema = cls._rootschema or cls._schema if rootschema is None: name = type(cls).__name__ msg = ( diff --git a/altair/vegalite/v5/schema/core.py b/altair/vegalite/v5/schema/core.py index 0892e7214..833a551ba 100644 --- a/altair/vegalite/v5/schema/core.py +++ b/altair/vegalite/v5/schema/core.py @@ -487,6 +487,7 @@ def load_schema() -> dict: class 
VegaLiteSchema(SchemaBase): + _schema = load_schema() _rootschema = load_schema() @classmethod @@ -502,8 +503,6 @@ class Root(VegaLiteSchema): specifications. (The json schema is generated from this type.) """ - _schema = VegaLiteSchema._rootschema - def __init__(self, *args, **kwds): super().__init__(*args, **kwds) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index ee13c99e7..f4b970902 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -54,6 +54,7 @@ BASE_SCHEMA: Final = """ class {basename}(SchemaBase): + _schema = load_schema() _rootschema = load_schema() @classmethod def _default_wrapper_classes(cls) -> Iterator[type[Any]]: @@ -301,6 +302,17 @@ def process_description(description: str) -> str: return description.strip() +class RootSchemaGenerator(SchemaGenerator): + schema_class_template = textwrap.dedent( + ''' + class {classname}({basename}): + """{docstring}""" + + {init_code} + ''' + ) + + class FieldSchemaGenerator(SchemaGenerator): schema_class_template = textwrap.dedent( ''' @@ -557,12 +569,11 @@ def generate_vegalite_schema_wrapper(schema_file: Path) -> str: "\n" f"__all__ = {all_}\n", LOAD_SCHEMA.format(schemafile="vega-lite-schema.json"), BASE_SCHEMA.format(basename=basename), - schema_class( + RootSchemaGenerator( "Root", schema=rootschema, basename=basename, - schemarepr=CodeSnippet(f"{basename}._rootschema"), - ), + ).schema_class(), ] for name in toposort(graph): diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 04ba02a95..956302519 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1308,7 +1308,7 @@ def validate( @classmethod def resolve_references(cls, schema: dict[str, Any] | None = None) -> dict[str, Any]: """Resolve references in the context of this object's schema or root schema.""" - rootschema = cls._rootschema or cls._schema or schema + rootschema = cls._rootschema or cls._schema if rootschema is None: name = 
type(cls).__name__ msg = ( From 06ca33929c5f2a5f54d4f15f872844a8c540f605 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 26 Aug 2024 20:45:20 +0100 Subject: [PATCH 56/92] refactor: Adds `_SchemaBasePEP487` Related https://github.com/vega/altair/pull/3547#discussion_r1731037778 --- altair/utils/schemapi.py | 126 +++++++++++++++++++++++++++++++++++++ tools/schemapi/schemapi.py | 126 +++++++++++++++++++++++++++++++++++++ 2 files changed, 252 insertions(+) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index e7f16098f..12314089e 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -981,6 +981,132 @@ def _deep_copy(obj: _CopyImpl | Any, by_ref: set[str]) -> _CopyImpl | Any: return obj +class _SchemaBasePEP487: + """Minimal demo for testing feasibility of `__init_subclass__`.""" + + _schema: ClassVar[dict[str, Any]] + _rootschema: ClassVar[dict[str, Any]] + _class_is_valid_at_instantiation: ClassVar[bool] = True + + def __init__(self, *args: Any, **kwds: Any) -> None: + if (kwds and args) or len(args) > 1: + name = type(self).__name__ + _args = ", ".join(f"{a!r}" for a in args) + _kwds = ", ".join(f"{k}={v!r}" for k, v in kwds.items()) + msg = ( + f"Expected either:\n" + f" - a single arg with no kwds, for, e.g. {{'type': 'string'}}\n" + f" - zero args with zero or more kwds for {{'type': 'object'}}\n\n" + f"but got: {name}({_args}, {_kwds})" + ) + raise AssertionError(msg) + # use object.__setattr__ because we override setattr below. + self._args: tuple[Any, ...] 
+ self._kwds: dict[str, Any] + object.__setattr__(self, "_args", args) + object.__setattr__(self, "_kwds", kwds) + + def __init_subclass__( + cls, + *args: Any, + schema: dict[str, Any] | None = None, + rootschema: dict[str, Any] | None = None, + valid_at_init: bool | None = None, + **kwds: Any, + ) -> None: + super().__init_subclass__(*args, **kwds) + # NOTE: `SchemaBase` itself would have no `_schema` or `_rootschema`, but won't be run through this + # FIXED: `VegaLiteSchema` has a `_rootschema` but no `_schema` + # FIXED: `Root` uses `VegaLiteSchema._rootschema`, for `_schema` and inherits the same for `_rootschema` + # FIXED: Both have only `_schema` - which is a type + # - `api.Then`: _schema = {"type": "object"} + # - `expr.core.Expression`: _schema = {"type": "string"} + # ---- + # All others either *only* define `_schema`, or inherit it when they are a channel + if schema is None: + if hasattr(cls, "_schema"): + schema = cls._schema + else: + msg = ( + f"Cannot instantiate object of type {cls}: " + "_schema class attribute is not defined." + ) + raise TypeError(msg) + + if rootschema is None: + if hasattr(cls, "_rootschema"): + rootschema = cls._rootschema + elif "$ref" not in schema: + rootschema = schema + else: + msg = "`rootschema` must be provided if `schema` contains a `'$ref'` and does not inherit one." + raise TypeError(msg) + + # NOTE: Inherit a `False`instead of overwriting with the default `True` + # - If a parent is not valid at init, then none of its subclasses can be + # - The current hierarchy does not support the inverse of this + # - Subclasses may declare they are not valid + if valid_at_init is None: + valid_at_init = cls._class_is_valid_at_instantiation + cls._schema = schema + cls._rootschema = rootschema + cls._class_is_valid_at_instantiation = valid_at_init + + @overload + def _get(self, attr: str, default: Optional = ...) -> Any | UndefinedType: ... + @overload + def _get(self, attr: str, default: T) -> Any | T: ... 
+ def _get(self, attr: str, default: Optional[T] = Undefined) -> Any | T: + """Get an attribute, returning default if not present.""" + if (item := self._kwds.get(attr, Undefined)) is not Undefined: + return item + else: + return default + + def __dir__(self) -> list[str]: + return sorted(chain(super().__dir__(), self._kwds)) + + def __eq__(self, other: Any) -> bool: + return ( + type(self) is type(other) + and self._args == other._args + and self._kwds == other._kwds + ) + + def __getattr__(self, attr: str): + # reminder: getattr is called after the normal lookups + if attr == "_kwds": + raise AttributeError() + if attr in self._kwds: + return self._kwds[attr] + else: + return getattr(super(), "__getattr__", super().__getattribute__)(attr) + + def __getitem__(self, item: str) -> Any: + return self._kwds[item] + + def __setattr__(self, item: str, val: Any) -> None: + if item.startswith("_"): + # Setting an instances copy of a ClassVar modify that + # By default, this makes **another** copy and places in _kwds + object.__setattr__(self, item, val) + else: + self._kwds[item] = val + + def __setitem__(self, item: str, val: Any) -> None: + self._kwds[item] = val + + def __repr__(self) -> str: + name = type(self).__name__ + if kwds := self._kwds: + it = (f"{k}: {v!r}" for k, v in sorted(kwds.items()) if v is not Undefined) + args = ",\n".join(it).replace("\n", "\n ") + LB, RB = "{", "}" + return f"{name}({LB}\n {args}\n{RB})" + else: + return f"{name}({self._args[0]!r})" + + class SchemaBase: """ Base class for schema wrappers. 
diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 956302519..19326d81b 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -979,6 +979,132 @@ def _deep_copy(obj: _CopyImpl | Any, by_ref: set[str]) -> _CopyImpl | Any: return obj +class _SchemaBasePEP487: + """Minimal demo for testing feasibility of `__init_subclass__`.""" + + _schema: ClassVar[dict[str, Any]] + _rootschema: ClassVar[dict[str, Any]] + _class_is_valid_at_instantiation: ClassVar[bool] = True + + def __init__(self, *args: Any, **kwds: Any) -> None: + if (kwds and args) or len(args) > 1: + name = type(self).__name__ + _args = ", ".join(f"{a!r}" for a in args) + _kwds = ", ".join(f"{k}={v!r}" for k, v in kwds.items()) + msg = ( + f"Expected either:\n" + f" - a single arg with no kwds, for, e.g. {{'type': 'string'}}\n" + f" - zero args with zero or more kwds for {{'type': 'object'}}\n\n" + f"but got: {name}({_args}, {_kwds})" + ) + raise AssertionError(msg) + # use object.__setattr__ because we override setattr below. + self._args: tuple[Any, ...] 
+ self._kwds: dict[str, Any] + object.__setattr__(self, "_args", args) + object.__setattr__(self, "_kwds", kwds) + + def __init_subclass__( + cls, + *args: Any, + schema: dict[str, Any] | None = None, + rootschema: dict[str, Any] | None = None, + valid_at_init: bool | None = None, + **kwds: Any, + ) -> None: + super().__init_subclass__(*args, **kwds) + # NOTE: `SchemaBase` itself would have no `_schema` or `_rootschema`, but won't be run through this + # FIXED: `VegaLiteSchema` has a `_rootschema` but no `_schema` + # FIXED: `Root` uses `VegaLiteSchema._rootschema`, for `_schema` and inherits the same for `_rootschema` + # FIXED: Both have only `_schema` - which is a type + # - `api.Then`: _schema = {"type": "object"} + # - `expr.core.Expression`: _schema = {"type": "string"} + # ---- + # All others either *only* define `_schema`, or inherit it when they are a channel + if schema is None: + if hasattr(cls, "_schema"): + schema = cls._schema + else: + msg = ( + f"Cannot instantiate object of type {cls}: " + "_schema class attribute is not defined." + ) + raise TypeError(msg) + + if rootschema is None: + if hasattr(cls, "_rootschema"): + rootschema = cls._rootschema + elif "$ref" not in schema: + rootschema = schema + else: + msg = "`rootschema` must be provided if `schema` contains a `'$ref'` and does not inherit one." + raise TypeError(msg) + + # NOTE: Inherit a `False`instead of overwriting with the default `True` + # - If a parent is not valid at init, then none of its subclasses can be + # - The current hierarchy does not support the inverse of this + # - Subclasses may declare they are not valid + if valid_at_init is None: + valid_at_init = cls._class_is_valid_at_instantiation + cls._schema = schema + cls._rootschema = rootschema + cls._class_is_valid_at_instantiation = valid_at_init + + @overload + def _get(self, attr: str, default: Optional = ...) -> Any | UndefinedType: ... + @overload + def _get(self, attr: str, default: T) -> Any | T: ... 
+ def _get(self, attr: str, default: Optional[T] = Undefined) -> Any | T: + """Get an attribute, returning default if not present.""" + if (item := self._kwds.get(attr, Undefined)) is not Undefined: + return item + else: + return default + + def __dir__(self) -> list[str]: + return sorted(chain(super().__dir__(), self._kwds)) + + def __eq__(self, other: Any) -> bool: + return ( + type(self) is type(other) + and self._args == other._args + and self._kwds == other._kwds + ) + + def __getattr__(self, attr: str): + # reminder: getattr is called after the normal lookups + if attr == "_kwds": + raise AttributeError() + if attr in self._kwds: + return self._kwds[attr] + else: + return getattr(super(), "__getattr__", super().__getattribute__)(attr) + + def __getitem__(self, item: str) -> Any: + return self._kwds[item] + + def __setattr__(self, item: str, val: Any) -> None: + if item.startswith("_"): + # Setting an instances copy of a ClassVar modify that + # By default, this makes **another** copy and places in _kwds + object.__setattr__(self, item, val) + else: + self._kwds[item] = val + + def __setitem__(self, item: str, val: Any) -> None: + self._kwds[item] = val + + def __repr__(self) -> str: + name = type(self).__name__ + if kwds := self._kwds: + it = (f"{k}: {v!r}" for k, v in sorted(kwds.items()) if v is not Undefined) + args = ",\n".join(it).replace("\n", "\n ") + LB, RB = "{", "}" + return f"{name}({LB}\n {args}\n{RB})" + else: + return f"{name}({self._args[0]!r})" + + class SchemaBase: """ Base class for schema wrappers. From 56a43cb244537dc51b28d866e4d6c6b9781ac98c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 26 Aug 2024 20:49:45 +0100 Subject: [PATCH 57/92] test: Adds a mini suite for `_SchemaBasePEP487` Purely to demonstrate the differences to status quo. 
There would be no loss in functionality, this would simply be some small changes to generated code and the removal of instance-level checks --- tests/utils/test_schemapi.py | 151 ++++++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 3 deletions(-) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index 4ba3ea9ca..af4af8559 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -10,7 +10,7 @@ import warnings from collections import deque from functools import partial -from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Sequence +from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Literal, Sequence import jsonschema import jsonschema.exceptions @@ -35,6 +35,34 @@ # try to use SchemaBase objects defined elsewhere as wrappers. +@pytest.fixture +def dummy_rootschema() -> dict[str, Any]: + return { + "$schema": _JSON_SCHEMA_DRAFT_URL, + "definitions": { + "StringMapping": { + "type": "object", + "additionalProperties": {"type": "string"}, + }, + "StringArray": {"type": "array", "items": {"type": "string"}}, + }, + "properties": { + "a": {"$ref": "#/definitions/StringMapping"}, + "a2": {"type": "object", "additionalProperties": {"type": "number"}}, + "b": {"$ref": "#/definitions/StringArray"}, + "b2": {"type": "array", "items": {"type": "number"}}, + "c": {"type": ["string", "number"]}, + "d": { + "anyOf": [ + {"$ref": "#/definitions/StringMapping"}, + {"$ref": "#/definitions/StringArray"}, + ] + }, + "e": {"items": [{"type": "string"}, {"type": "string"}]}, + }, + } + + def test_actual_json_schema_draft_is_same_as_hardcoded_default(): # See comments next to definition of `_DEFAULT_DIALECT_URI` # for details why we need this test @@ -45,6 +73,125 @@ def test_actual_json_schema_draft_is_same_as_hardcoded_default(): ) +def test_init_subclasses_hierarchy(dummy_rootschema) -> None: + from referencing.exceptions import Unresolvable + + from altair.expr.core import 
GetItemExpression, OperatorMixin + from altair.utils.schemapi import _SchemaBasePEP487 + + sch1 = _SchemaBasePEP487() + sch2 = _SchemaBasePEP487() + sch3 = _SchemaBasePEP487("blue") + sch4 = _SchemaBasePEP487("red") + sch5 = _SchemaBasePEP487(color="blue") + sch6 = _SchemaBasePEP487(color="red") + + with pytest.raises( + AssertionError, match=r"_SchemaBasePEP487\('blue', color='red'\)" + ): + _SchemaBasePEP487("blue", color="red") + + assert sch1 == sch2 + assert sch3 != sch4 + assert sch5 != sch6 + assert sch3 != sch5 + assert _SchemaBasePEP487("blue") == sch3 + assert _SchemaBasePEP487(color="red") == sch6 + with pytest.raises(AttributeError, match="_SchemaBasePEP487.+color"): + attempt = sch4.color is Undefined # noqa: F841 + + assert sch5.color == sch5["color"] == sch5._get("color") == "blue" + assert sch5._get("price") is Undefined + assert sch5._get("price", 999) == 999 + + assert _SchemaBasePEP487._class_is_valid_at_instantiation + sch6._class_is_valid_at_instantiation = False # type: ignore[misc] + assert ( + _SchemaBasePEP487._class_is_valid_at_instantiation + != sch6._class_is_valid_at_instantiation + ) + + with pytest.raises(TypeError, match="Test1PEP487.+ _schema"): + + class Test1PEP487(_SchemaBasePEP487): ... + + class Test2PEP487(_SchemaBasePEP487, schema={"type": "object"}): ... + + with pytest.raises( + TypeError, + match=r"`rootschema` must be provided if `schema` contains a `'\$ref'` and does not inherit one", + ): + + class Test3PEP487(_SchemaBasePEP487, schema={"$ref": "#/definitions/Bar"}): ... + + class RootParentPEP487(_SchemaBasePEP487, schema=dummy_rootschema): + @classmethod + def _default_wrapper_classes(cls) -> Iterator[type[Any]]: + return schemapi._subclasses(RootParentPEP487) + + class Root(RootParentPEP487): + """ + Root schema wrapper. + + A Vega-Lite top-level specification. This is the root class for all Vega-Lite + specifications. (The json schema is generated from this type.) 
+ """ + + def __init__(self, *args, **kwds) -> None: + super().__init__(*args, **kwds) + + assert ( + Root._schema + == Root._rootschema + == RootParentPEP487._schema + == RootParentPEP487._rootschema + ) + + class StringMapping(Root, schema={"$ref": "#/definitions/StringMapping"}): ... + + class StringArray(Root, schema={"$ref": "#/definitions/StringArray"}): ... + + with pytest.raises( + jsonschema.ValidationError, + match=r"5 is not of type 'string'", + ): + schemapi.validate_jsonschema( + ["one", "two", 5], StringArray._schema, StringArray._rootschema + ) + + with pytest.raises(Unresolvable): + schemapi.validate_jsonschema(["one", "two", "three"], StringArray._schema) + + schemapi.validate_jsonschema( + ["one", "two", "three"], StringArray._schema, StringArray._rootschema + ) + + class Expression(OperatorMixin, _SchemaBasePEP487, schema={"type": "string"}): + def to_dict(self, *args, **kwargs): + return repr(self) + + def __setattr__(self, attr, val) -> None: + # We don't need the setattr magic defined in SchemaBase + return object.__setattr__(self, attr, val) + + def __getitem__(self, val): + return GetItemExpression(self, val) + + non_ref_mixin = Expression( + Expression("some").to_dict() + Expression("more").to_dict() + ) + schemapi.validate_jsonschema( + non_ref_mixin.to_dict(), non_ref_mixin._schema, non_ref_mixin._rootschema + ) + with pytest.raises( + jsonschema.ValidationError, + match=r"is not of type 'array'", + ): + schemapi.validate_jsonschema( + non_ref_mixin.to_dict(), StringArray._schema, StringArray._rootschema + ) + + class _TestSchema(SchemaBase): @classmethod def _default_wrapper_classes(cls): @@ -922,8 +1069,6 @@ def test_chart_validation_benchmark( from itertools import chain, repeat if TYPE_CHECKING: - from typing import Iterator - from altair.typing import ChartType def _iter_charts() -> Iterator[ChartType]: From 003c7fb14f9066ac84b96888bc50e3d3ad68fd99 Mon Sep 17 00:00:00 2001 From: dangotbanned 
<125183946+dangotbanned@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:07:10 +0100 Subject: [PATCH 58/92] fix: Fix backwards incompatible import https://github.com/vega/altair/actions/runs/10566291662/job/29272723977?pr=3547 --- tests/utils/test_schemapi.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index af4af8559..2c4c11ed1 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -10,6 +10,7 @@ import warnings from collections import deque from functools import partial +from importlib.metadata import version as importlib_version from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Literal, Sequence import jsonschema @@ -18,6 +19,7 @@ import pandas as pd import polars as pl import pytest +from packaging.version import Version import altair as alt from altair import load_schema @@ -74,7 +76,12 @@ def test_actual_json_schema_draft_is_same_as_hardcoded_default(): def test_init_subclasses_hierarchy(dummy_rootschema) -> None: - from referencing.exceptions import Unresolvable + if Version(importlib_version("jsonschema")) >= Version("4.18"): + from referencing.exceptions import Unresolvable + else: + from jsonschema.exceptions import ( # type: ignore[assignment] + RefResolutionError as Unresolvable, + ) from altair.expr.core import GetItemExpression, OperatorMixin from altair.utils.schemapi import _SchemaBasePEP487 From 5fad0889285ddb521960af799276e91073de5a40 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 27 Aug 2024 21:44:02 +0100 Subject: [PATCH 59/92] ci: Fix include pattern --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 05aade59b..3e1046bb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -455,7 +455,7 @@ pythonVersion="3.8" reportUnusedExpression="none" include=[ "./altair/**/*.py", - ".doc/*.py", 
+ "./doc/*.py", "./sphinxext/**/*.py", "./tests/**/*.py", "./tools/**/*.py", From 69d9f678b737607ae5a0e3f1e9e8bd0a20a47b99 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:30:19 +0100 Subject: [PATCH 60/92] test: Add `SchemaBase.__init_subclass__` benchmark results Locally, did a full replacement of class hierarchy - but saw no observable performance improvement --- tests/utils/test_schemapi.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index 2c4c11ed1..cba7e2b2c 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -1071,6 +1071,11 @@ def test_chart_validation_benchmark( 107.84s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] 50.43s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] 67.07s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] + + # Post-`SchemaBase.__init_subclass__` addition (3.12.3) + 108.24s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] + 50.33s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] + 66.51s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] ``` """ from itertools import chain, repeat From 307cb9bf96341f607189f005c9d8026aa6f16de3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:42:01 +0100 Subject: [PATCH 61/92] refactor(typing): Widen internal `rootschema|schema` types from `dict[str, Any]` No functional or user-facing change. 
This will be to support using types other than `dict`, which isn't strictly required here --- tools/generate_schema_wrapper.py | 6 +++--- tools/schemapi/schemapi.py | 35 +++++++++++--------------------- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index f4b970902..62384f0a0 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -424,7 +424,7 @@ def _add_shorthand_property_to_field_encodings(schema: dict) -> dict: "description": "shorthand for field, aggregate, and type", } if "required" not in defschema: - defschema["required"] = ["shorthand"] + defschema["required"] = ["shorthand"] # type: ignore elif "shorthand" not in defschema["required"]: defschema["required"].append("shorthand") schema["definitions"][field_ref.split("/")[-1]] = defschema @@ -457,7 +457,7 @@ def recursive_dict_update(schema: dict, root: dict, def_dict: dict) -> None: if k in properties: def_dict[k] = definition else: - recursive_dict_update(next_schema, root, def_dict) + recursive_dict_update(next_schema, root, def_dict) # type: ignore elif "anyOf" in schema: for sub_schema in schema["anyOf"]: recursive_dict_update(sub_schema, root, def_dict) @@ -473,7 +473,7 @@ def get_field_datum_value_defs(propschema: SchemaInfo, root: dict) -> dict[str, msg = "Unexpected schema structure" raise ValueError(msg) else: - recursive_dict_update(schema, root, def_dict) + recursive_dict_update(schema, root, def_dict) # type: ignore return {i: j for i, j in def_dict.items() if j} diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 19326d81b..98590a938 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -141,9 +141,7 @@ def debug_mode(arg: bool) -> Iterator[None]: def validate_jsonschema( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, + spec: _JsonParameter, schema: Map, rootschema: Map | None = None ) -> 
None: """ Validates ``spec`` against ``schema`` in the context of ``rootschema``. @@ -191,7 +189,7 @@ def validate_jsonschema( raise NotImplementedError(msg) -def _get_schema_dialect_uri(schema: dict[str, Any]) -> str: +def _get_schema_dialect_uri(schema: Map, /) -> str: """ Return value of `$schema`_. @@ -204,7 +202,7 @@ def _get_schema_dialect_uri(schema: dict[str, Any]) -> str: return schema.get("$schema", _DEFAULT_DIALECT_URI) -def _prepare_references(schema: dict[str, Any], /) -> dict[str, Any]: +def _prepare_references(schema: Map, /) -> dict[str, Any]: """ Return a deep copy of ``schema`` w/ replaced uri(s). @@ -218,7 +216,7 @@ def _prepare_references(schema: dict[str, Any], /) -> dict[str, Any]: return dict(_rec_refs(schema)) -def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: +def _rec_refs(m: Map, /) -> Iterator[tuple[str, Any]]: """ Recurse through a schema, yielding fresh copies of mutable containers. @@ -285,9 +283,7 @@ def specification_with(dialect_id: str, /) -> Specification[Any]: """ return _specification_with(dialect_id) - def _validator( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None - ) -> Validator: + def _validator(schema: Map, rootschema: Map | None = None, /) -> Validator: """ Constructs a `Validator`_ for future validation. @@ -309,7 +305,7 @@ def _validator( registry = _registry(rootschema or schema, uri) return validator(_prepare_references(schema), registry=registry) - def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: + def _registry(rootschema: Map, dialect_id: str) -> Registry[Any]: """ Constructs a `Registry`_, adding the `Resource`_ produced by ``rootschema``. 
@@ -333,16 +329,12 @@ def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: _REGISTRY_CACHE[cache_key] = registry return registry - def _registry_update( - root: dict[str, Any], dialect_id: str, resolver: Resolver[Any] - ): + def _registry_update(root: Map, dialect_id: str, resolver: Resolver[Any]) -> None: global _REGISTRY_CACHE cache_key = _registry_comp_key(root, dialect_id) _REGISTRY_CACHE[cache_key] = resolver._registry - def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] - ) -> dict[str, Any]: + def _resolve_references(schema: Map, rootschema: Map) -> Map: """ Resolve schema references until there is no ``"$ref"`` anymore in the top-level ``dict``. @@ -391,9 +383,7 @@ def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: else: - def _validator( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None - ) -> Validator: + def _validator(schema: Map, rootschema: Map | None = None, /) -> Validator: """ Constructs a `Validator`_ for future validation. @@ -416,9 +406,7 @@ def _validator( ) return validator(schema, resolver=resolver) - def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] - ) -> dict[str, Any]: + def _resolve_references(schema: Map, rootschema: Map) -> Map: """ Resolve schema references until there is no ``"$ref"`` anymore in the top-level ``dict``. 
@@ -1444,7 +1432,8 @@ def resolve_references(cls, schema: dict[str, Any] | None = None) -> dict[str, A ) raise TypeError(msg) else: - return _resolve_references(schema or cls._schema, rootschema=rootschema) + resolved = _resolve_references(schema or cls._schema, rootschema) + return cast("dict[str, Any]", resolved) @classmethod def validate_property( From 8fa8975163cd45aa1c713b7fe027d3619a75fd99 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:48:21 +0100 Subject: [PATCH 62/92] perf: Adds `resolve_references_rpds` See docstring for notes and plan --- tools/schemapi/schemapi.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 98590a938..5eb962c0b 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -262,6 +262,7 @@ def _validator_for(uri: str, /) -> Callable[..., Validator]: if TYPE_CHECKING: from referencing import Specification from referencing._core import Resolver + from rpds import HashTrieMap @lru_cache(maxsize=None) def specification_with(dialect_id: str, /) -> Specification[Any]: @@ -379,6 +380,22 @@ def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: k1 = json.dumps(root, separators=(",", ":"), sort_keys=True) return k1, dialect_id + def resolve_references_rpds(schema: Map, rootschema: Map) -> HashTrieMap[str, Any]: + """ + **Experimental** `rust`-speed returned type. + + Directly wraps `_resolve_references`. 
+ + Idea + ---- + - Store the result of this when called from ``_FromDict.from_dict()`` once per unique call + - Reuse the resolved schema, since we don't mutate it after resolving + - Should reduce the cost of ``_FromDict.from_dict()``, when a schema has been seen before + """ + import rpds as rpds + + return rpds.HashTrieMap(_resolve_references(schema, rootschema)) + _REGISTRY_CACHE: dict[tuple[str, str], Registry[Any]] = {} else: From 5e93933789ab51b8e1f625d768046b0bd15c8034 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:48:59 +0100 Subject: [PATCH 63/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 52 ++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 12314089e..94a67527f 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -143,9 +143,7 @@ def debug_mode(arg: bool) -> Iterator[None]: def validate_jsonschema( - spec: _JsonParameter, - schema: dict[str, Any], - rootschema: dict[str, Any] | None = None, + spec: _JsonParameter, schema: Map, rootschema: Map | None = None ) -> None: """ Validates ``spec`` against ``schema`` in the context of ``rootschema``. @@ -193,7 +191,7 @@ def validate_jsonschema( raise NotImplementedError(msg) -def _get_schema_dialect_uri(schema: dict[str, Any]) -> str: +def _get_schema_dialect_uri(schema: Map, /) -> str: """ Return value of `$schema`_. @@ -206,7 +204,7 @@ def _get_schema_dialect_uri(schema: dict[str, Any]) -> str: return schema.get("$schema", _DEFAULT_DIALECT_URI) -def _prepare_references(schema: dict[str, Any], /) -> dict[str, Any]: +def _prepare_references(schema: Map, /) -> dict[str, Any]: """ Return a deep copy of ``schema`` w/ replaced uri(s). 
@@ -220,7 +218,7 @@ def _prepare_references(schema: dict[str, Any], /) -> dict[str, Any]: return dict(_rec_refs(schema)) -def _rec_refs(m: dict[str, Any], /) -> Iterator[tuple[str, Any]]: +def _rec_refs(m: Map, /) -> Iterator[tuple[str, Any]]: """ Recurse through a schema, yielding fresh copies of mutable containers. @@ -266,6 +264,7 @@ def _validator_for(uri: str, /) -> Callable[..., Validator]: if TYPE_CHECKING: from referencing import Specification from referencing._core import Resolver + from rpds import HashTrieMap @lru_cache(maxsize=None) def specification_with(dialect_id: str, /) -> Specification[Any]: @@ -287,9 +286,7 @@ def specification_with(dialect_id: str, /) -> Specification[Any]: """ return _specification_with(dialect_id) - def _validator( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None - ) -> Validator: + def _validator(schema: Map, rootschema: Map | None = None, /) -> Validator: """ Constructs a `Validator`_ for future validation. @@ -311,7 +308,7 @@ def _validator( registry = _registry(rootschema or schema, uri) return validator(_prepare_references(schema), registry=registry) - def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: + def _registry(rootschema: Map, dialect_id: str) -> Registry[Any]: """ Constructs a `Registry`_, adding the `Resource`_ produced by ``rootschema``. 
@@ -335,16 +332,12 @@ def _registry(rootschema: dict[str, Any], dialect_id: str) -> Registry[Any]: _REGISTRY_CACHE[cache_key] = registry return registry - def _registry_update( - root: dict[str, Any], dialect_id: str, resolver: Resolver[Any] - ): + def _registry_update(root: Map, dialect_id: str, resolver: Resolver[Any]) -> None: global _REGISTRY_CACHE cache_key = _registry_comp_key(root, dialect_id) _REGISTRY_CACHE[cache_key] = resolver._registry - def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] - ) -> dict[str, Any]: + def _resolve_references(schema: Map, rootschema: Map) -> Map: """ Resolve schema references until there is no ``"$ref"`` anymore in the top-level ``dict``. @@ -389,13 +382,27 @@ def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: k1 = json.dumps(root, separators=(",", ":"), sort_keys=True) return k1, dialect_id + def resolve_references_rpds(schema: Map, rootschema: Map) -> HashTrieMap[str, Any]: + """ + **Experimental** `rust`-speed returned type. + + Directly wraps `_resolve_references`. + + Idea + ---- + - Store the result of this when called from ``_FromDict.from_dict()`` once per unique call + - Reuse the resolved schema, since we don't mutate it after resolving + - Should reduce the cost of ``_FromDict.from_dict()``, when a schema has been seen before + """ + import rpds as rpds + + return rpds.HashTrieMap(_resolve_references(schema, rootschema)) + _REGISTRY_CACHE: dict[tuple[str, str], Registry[Any]] = {} else: - def _validator( - schema: dict[str, Any], rootschema: dict[str, Any] | None = None - ) -> Validator: + def _validator(schema: Map, rootschema: Map | None = None, /) -> Validator: """ Constructs a `Validator`_ for future validation. 
@@ -418,9 +425,7 @@ def _validator( ) return validator(schema, resolver=resolver) - def _resolve_references( - schema: dict[str, Any], rootschema: dict[str, Any] - ) -> dict[str, Any]: + def _resolve_references(schema: Map, rootschema: Map) -> Map: """ Resolve schema references until there is no ``"$ref"`` anymore in the top-level ``dict``. @@ -1446,7 +1451,8 @@ def resolve_references(cls, schema: dict[str, Any] | None = None) -> dict[str, A ) raise TypeError(msg) else: - return _resolve_references(schema or cls._schema, rootschema=rootschema) + resolved = _resolve_references(schema or cls._schema, rootschema) + return cast("dict[str, Any]", resolved) @classmethod def validate_property( From c322e79a4df7bb3607105fc1b545b8037e6cc4b8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 29 Aug 2024 18:33:16 +0100 Subject: [PATCH 64/92] refactor: Rename `_rec_refs` -> `_recurse_refs` --- tools/schemapi/schemapi.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 5eb962c0b..d259b63a7 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -213,10 +213,10 @@ def _prepare_references(schema: Map, /) -> dict[str, Any]: ----- ``copy.deepcopy`` is not needed as the iterator yields new objects. """ - return dict(_rec_refs(schema)) + return dict(_recurse_refs(schema)) -def _rec_refs(m: Map, /) -> Iterator[tuple[str, Any]]: +def _recurse_refs(m: Map, /) -> Iterator[tuple[str, Any]]: """ Recurse through a schema, yielding fresh copies of mutable containers. 
@@ -226,9 +226,9 @@ def _rec_refs(m: Map, /) -> Iterator[tuple[str, Any]]: if k == "$ref": yield k, f"{_VEGA_LITE_ROOT_URI}{v}" elif isinstance(v, dict): - yield k, dict(_rec_refs(v)) + yield k, dict(_recurse_refs(v)) elif isinstance(v, list): - yield k, [dict(_rec_refs(el)) if _is_dict(el) else el for el in v] + yield k, [dict(_recurse_refs(el)) if _is_dict(el) else el for el in v] else: yield k, v From 96eed9bfe17d9ef9bdf0aa4d0e35f6907e9c0212 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 29 Aug 2024 21:01:16 +0100 Subject: [PATCH 65/92] perf: Adds `_FromDict.hash_resolved` --- tools/schemapi/schemapi.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index d259b63a7..8e9e0a7f1 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1597,6 +1597,13 @@ class _FromDict: https://en.wikipedia.org/wiki/Breadth-first_search """ + hash_resolved: ClassVar[dict[int, Map]] = {} + """ + Maps unique schemas to their reference-resolved equivalent. + + Ensures that ``_resolve_references`` is evaluated **at most once**, per hash. 
+ """ + def __init__(self, wrapper_classes: Iterator[type[SchemaBase]], /) -> None: cls = type(self) for tp in wrapper_classes: @@ -1665,24 +1672,30 @@ def from_dict( """Construct an object from a dict representation.""" target_tp: Any current_schema: dict[str, Any] + hash_schema: int if isinstance(dct, SchemaBase): return dct elif tp is not None: current_schema = tp._schema + hash_schema = _hash_schema(current_schema) root_schema: dict[str, Any] = rootschema or tp._rootschema or current_schema target_tp = tp elif schema is not None: current_schema = schema + hash_schema = _hash_schema(current_schema) root_schema = rootschema or current_schema - matches = cls.hash_tps[_hash_schema(current_schema)] + matches = cls.hash_tps[hash_schema] target_tp = next(iter(matches), default_class) else: msg = "Must provide either `tp` or `schema`, but not both." raise ValueError(msg) from_dict = partial(cls.from_dict, rootschema=root_schema) - # Can also return a list? - resolved = _resolve_references(current_schema, root_schema) + if resolved := cls.hash_resolved.get(hash_schema): + ... + else: + resolved = _resolve_references(current_schema, root_schema) + cls.hash_resolved[hash_schema] = resolved if "anyOf" in resolved or "oneOf" in resolved: schemas = resolved.get("anyOf", []) + resolved.get("oneOf", []) for possible in schemas: From 3258f5681020374a9f8e45c29c3962f24b4a9191 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 29 Aug 2024 21:05:02 +0100 Subject: [PATCH 66/92] perf: Remove unreachable `"oneOf"` keyword check Since `v2` this keyword has not been part of any schema. 
It has only been a property of what became `FieldOneOfPredicate` https://github.com/vega/schema/blob/ef61166f3f95154465c4b3ebdca88e3c2d25b005/vega-lite/v2.0.0.json#L4060 --- tools/schemapi/schemapi.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 8e9e0a7f1..9be14e299 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1696,9 +1696,8 @@ def from_dict( else: resolved = _resolve_references(current_schema, root_schema) cls.hash_resolved[hash_schema] = resolved - if "anyOf" in resolved or "oneOf" in resolved: - schemas = resolved.get("anyOf", []) + resolved.get("oneOf", []) - for possible in schemas: + if "anyOf" in resolved: + for possible in resolved["anyOf"]: # NOTE: Instead of raise/except/continue # Pre-"zero-cost" exceptions, this has a huge performance gain. # https://docs.python.org/3/whatsnew/3.11.html#misc From d6ce7497d00a21185d079d069d9e530e1795d959 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 29 Aug 2024 21:06:47 +0100 Subject: [PATCH 67/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 94a67527f..1ad304b52 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -215,10 +215,10 @@ def _prepare_references(schema: Map, /) -> dict[str, Any]: ----- ``copy.deepcopy`` is not needed as the iterator yields new objects. """ - return dict(_rec_refs(schema)) + return dict(_recurse_refs(schema)) -def _rec_refs(m: Map, /) -> Iterator[tuple[str, Any]]: +def _recurse_refs(m: Map, /) -> Iterator[tuple[str, Any]]: """ Recurse through a schema, yielding fresh copies of mutable containers. 
@@ -228,9 +228,9 @@ def _rec_refs(m: Map, /) -> Iterator[tuple[str, Any]]: if k == "$ref": yield k, f"{_VEGA_LITE_ROOT_URI}{v}" elif isinstance(v, dict): - yield k, dict(_rec_refs(v)) + yield k, dict(_recurse_refs(v)) elif isinstance(v, list): - yield k, [dict(_rec_refs(el)) if _is_dict(el) else el for el in v] + yield k, [dict(_recurse_refs(el)) if _is_dict(el) else el for el in v] else: yield k, v @@ -1599,6 +1599,13 @@ class _FromDict: https://en.wikipedia.org/wiki/Breadth-first_search """ + hash_resolved: ClassVar[dict[int, Map]] = {} + """ + Maps unique schemas to their reference-resolved equivalent. + + Ensures that ``_resolve_references`` is evaluated **at most once**, per hash. + """ + def __init__(self, wrapper_classes: Iterator[type[SchemaBase]], /) -> None: cls = type(self) for tp in wrapper_classes: @@ -1667,27 +1674,32 @@ def from_dict( """Construct an object from a dict representation.""" target_tp: Any current_schema: dict[str, Any] + hash_schema: int if isinstance(dct, SchemaBase): return dct elif tp is not None: current_schema = tp._schema + hash_schema = _hash_schema(current_schema) root_schema: dict[str, Any] = rootschema or tp._rootschema or current_schema target_tp = tp elif schema is not None: current_schema = schema + hash_schema = _hash_schema(current_schema) root_schema = rootschema or current_schema - matches = cls.hash_tps[_hash_schema(current_schema)] + matches = cls.hash_tps[hash_schema] target_tp = next(iter(matches), default_class) else: msg = "Must provide either `tp` or `schema`, but not both." raise ValueError(msg) from_dict = partial(cls.from_dict, rootschema=root_schema) - # Can also return a list? - resolved = _resolve_references(current_schema, root_schema) - if "anyOf" in resolved or "oneOf" in resolved: - schemas = resolved.get("anyOf", []) + resolved.get("oneOf", []) - for possible in schemas: + if resolved := cls.hash_resolved.get(hash_schema): + ... 
+ else: + resolved = _resolve_references(current_schema, root_schema) + cls.hash_resolved[hash_schema] = resolved + if "anyOf" in resolved: + for possible in resolved["anyOf"]: # NOTE: Instead of raise/except/continue # Pre-"zero-cost" exceptions, this has a huge performance gain. # https://docs.python.org/3/whatsnew/3.11.html#misc From 89fcaf7390d326f691d49ab06d209be75f5bd085 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:03:55 +0100 Subject: [PATCH 68/92] perf: Optimize `dict` branch in `_FromDict.from_dict` - Bypass `dct.items()` when there are no properties -Reuse result of `props` lookup, rather than twice per hit --- tools/schemapi/schemapi.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 9be14e299..a23cdc082 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1661,7 +1661,7 @@ def from_dict( default_class: Any = ..., ) -> Never: ... 
@classmethod - def from_dict( + def from_dict( # noqa: C901 cls, dct: dict[str, Any] | list[dict[str, Any]] | TSchemaBase, tp: type[TSchemaBase] | None = None, @@ -1708,12 +1708,14 @@ def from_dict( if _is_dict(dct): # TODO: handle schemas for additionalProperties/patternProperties - props: dict[str, Any] = resolved.get("properties", {}) - kwds = { - k: (from_dict(v, schema=props[k]) if k in props else v) - for k, v in dct.items() - } - return target_tp(**kwds) + if props := resolved.get("properties"): + kwds = { + k: (from_dict(v, schema=sch) if (sch := props.get(k)) else v) + for k, v in dct.items() + } + return target_tp(**kwds) + else: + return target_tp(**dct) elif _is_list(dct): item_schema: dict[str, Any] = resolved.get("items", {}) return target_tp([from_dict(k, schema=item_schema) for k in dct]) From c9f9d8aae2b52b75555b70e84895c7edc291a42b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:55:24 +0100 Subject: [PATCH 69/92] docs(perf): Adds benchmark result Very minor improvement --- tests/utils/test_schemapi.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index cba7e2b2c..1fce07009 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -1076,6 +1076,11 @@ def test_chart_validation_benchmark( 108.24s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] 50.33s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] 66.51s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] + + # Post-`dict` branch micro optimization in `_FromDict.from_dict` (3.12.3) + 107.90s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] + 49.63s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] + 66.87s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] ``` """ 
from itertools import chain, repeat From e9c8f857e2eb581540838afd877128b012510baa Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:56:12 +0100 Subject: [PATCH 70/92] chore(perf): Add note on next refactor candidate --- tools/schemapi/schemapi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index a23cdc082..25ca1ec49 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -213,6 +213,8 @@ def _prepare_references(schema: Map, /) -> dict[str, Any]: ----- ``copy.deepcopy`` is not needed as the iterator yields new objects. """ + # FIXME: The hottest function + it is recursive + # Should be done once per schema return dict(_recurse_refs(schema)) From 759a55641071f3aa0548e3d8775e4e3323672581 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:58:09 +0100 Subject: [PATCH 71/92] test: Add note on possibly outdated tests If the current schema doesn't trigger branches, but the test schema(s) do - it may indicate the tests need to be updated to better reflect actual --- tools/schemapi/schemapi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 25ca1ec49..0fdbf4c7f 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1680,6 +1680,8 @@ def from_dict( # noqa: C901 elif tp is not None: current_schema = tp._schema hash_schema = _hash_schema(current_schema) + # NOTE: the `current_schema` branch only triggered for mock schema tests: + # test_schemapi.py::[test_construct_multifaceted_schema, test_copy_method, test_round_trip, test_copy_module, test_from_dict, test_to_from_json, test_to_from_pickle] root_schema: dict[str, Any] = rootschema or tp._rootschema or current_schema target_tp = tp elif schema is not None: From dace4883d50e8a1d782acd85985271f654d8842d Mon Sep 17 00:00:00 
2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:58:30 +0100 Subject: [PATCH 72/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 1ad304b52..f334a5397 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -215,6 +215,8 @@ def _prepare_references(schema: Map, /) -> dict[str, Any]: ----- ``copy.deepcopy`` is not needed as the iterator yields new objects. """ + # FIXME: The hottest function + it is recursive + # Should be done once per schema return dict(_recurse_refs(schema)) @@ -1663,7 +1665,7 @@ def from_dict( default_class: Any = ..., ) -> Never: ... @classmethod - def from_dict( + def from_dict( # noqa: C901 cls, dct: dict[str, Any] | list[dict[str, Any]] | TSchemaBase, tp: type[TSchemaBase] | None = None, @@ -1680,6 +1682,8 @@ def from_dict( elif tp is not None: current_schema = tp._schema hash_schema = _hash_schema(current_schema) + # NOTE: the `current_schema` branch only triggered for mock schema tests: + # test_schemapi.py::[test_construct_multifaceted_schema, test_copy_method, test_round_trip, test_copy_module, test_from_dict, test_to_from_json, test_to_from_pickle] root_schema: dict[str, Any] = rootschema or tp._rootschema or current_schema target_tp = tp elif schema is not None: @@ -1710,12 +1714,14 @@ def from_dict( if _is_dict(dct): # TODO: handle schemas for additionalProperties/patternProperties - props: dict[str, Any] = resolved.get("properties", {}) - kwds = { - k: (from_dict(v, schema=props[k]) if k in props else v) - for k, v in dct.items() - } - return target_tp(**kwds) + if props := resolved.get("properties"): + kwds = { + k: (from_dict(v, schema=sch) if (sch := props.get(k)) else v) + for k, v in dct.items() + } + return target_tp(**kwds) + else: + return target_tp(**dct) elif _is_list(dct): item_schema: dict[str, 
Any] = resolved.get("items", {}) return target_tp([from_dict(k, schema=item_schema) for k in dct]) From 531aa7c51de5541e4d3f97d3622a494471f8db32 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:20:26 +0100 Subject: [PATCH 73/92] chore(perf): Add FIXME on recursive source --- tools/schemapi/schemapi.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 0fdbf4c7f..fc0f70357 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1685,6 +1685,9 @@ def from_dict( # noqa: C901 root_schema: dict[str, Any] = rootschema or tp._rootschema or current_schema target_tp = tp elif schema is not None: + # FIXME: This is the slow branch + # - Improving the perf of the `tp` one is too small scale + # - Every recursive `from_dict` call that isn't solved hits this current_schema = schema hash_schema = _hash_schema(current_schema) root_schema = rootschema or current_schema From 3277b432c72f0dfc77e8b234c37bbe7ebd4f22e8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 30 Aug 2024 19:04:39 +0100 Subject: [PATCH 74/92] refactor(perf): Crawl registry, init `Resolver` with `_VEGA_LITE_ROOT_URI` --- tools/schemapi/schemapi.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index fc0f70357..c803063f5 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -328,7 +328,7 @@ def _registry(rootschema: Map, dialect_id: str) -> Registry[Any]: else: specification = specification_with(dialect_id) resource = specification.create_resource(rootschema) - registry = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource) + registry = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource).crawl() _REGISTRY_CACHE[cache_key] = registry return registry @@ -354,9 +354,9 @@ def 
_resolve_references(schema: Map, rootschema: Map) -> Map: return schema uri = _get_schema_dialect_uri(rootschema) registry = _registry(root, uri) - resolver = registry.resolver() + resolver = registry.resolver(_VEGA_LITE_ROOT_URI) while "$ref" in schema: - resolved = resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]) + resolved = resolver.lookup(schema["$ref"]) schema = resolved.contents _registry_update(root, uri, resolved.resolver) return schema @@ -394,7 +394,7 @@ def resolve_references_rpds(schema: Map, rootschema: Map) -> HashTrieMap[str, An - Reuse the resolved schema, since we don't mutate it after resolving - Should reduce the cost of ``_FromDict.from_dict()``, when a schema has been seen before """ - import rpds as rpds + import rpds return rpds.HashTrieMap(_resolve_references(schema, rootschema)) From 96dde1832a507d415bd2a40f250e4f45bba78c6d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 30 Aug 2024 19:06:23 +0100 Subject: [PATCH 75/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index f334a5397..492eb78cb 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -330,7 +330,7 @@ def _registry(rootschema: Map, dialect_id: str) -> Registry[Any]: else: specification = specification_with(dialect_id) resource = specification.create_resource(rootschema) - registry = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource) + registry = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource).crawl() _REGISTRY_CACHE[cache_key] = registry return registry @@ -356,9 +356,9 @@ def _resolve_references(schema: Map, rootschema: Map) -> Map: return schema uri = _get_schema_dialect_uri(rootschema) registry = _registry(root, uri) - resolver = registry.resolver() + resolver = registry.resolver(_VEGA_LITE_ROOT_URI) while "$ref" in schema: - resolved = 
resolver.lookup(_VEGA_LITE_ROOT_URI + schema["$ref"]) + resolved = resolver.lookup(schema["$ref"]) schema = resolved.contents _registry_update(root, uri, resolved.resolver) return schema @@ -396,7 +396,7 @@ def resolve_references_rpds(schema: Map, rootschema: Map) -> HashTrieMap[str, An - Reuse the resolved schema, since we don't mutate it after resolving - Should reduce the cost of ``_FromDict.from_dict()``, when a schema has been seen before """ - import rpds as rpds + import rpds return rpds.HashTrieMap(_resolve_references(schema, rootschema)) @@ -1687,6 +1687,9 @@ def from_dict( # noqa: C901 root_schema: dict[str, Any] = rootschema or tp._rootschema or current_schema target_tp = tp elif schema is not None: + # FIXME: This is the slow branch + # - Improving the perf of the `tp` one is too small scale + # - Every recursive `from_dict` call that isn't solved hits this current_schema = schema hash_schema = _hash_schema(current_schema) root_schema = rootschema or current_schema From ab20005a58b7dfcc92d202c7220a8e1ff5abc6da Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 30 Aug 2024 20:02:44 +0100 Subject: [PATCH 76/92] refactor: Reuse `JSONEncoder` for hashing --- altair/utils/schemapi.py | 6 ++++-- tools/schemapi/schemapi.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 492eb78cb..22599a4f0 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -259,6 +259,8 @@ def _validator_for(uri: str, /) -> Callable[..., Validator]: return tp +_HASH_ENCODER = json.JSONEncoder(sort_keys=True, separators=(",", ":")) + if Version(importlib_version("jsonschema")) >= Version("4.18"): from referencing import Registry from referencing.jsonschema import specification_with as _specification_with @@ -381,7 +383,7 @@ def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: elif len(root) == 1: k1 = "".join(f"{s!s}" 
for s in chain(*root.items())) else: - k1 = json.dumps(root, separators=(",", ":"), sort_keys=True) + k1 = _HASH_ENCODER.encode(root) return k1, dialect_id def resolve_references_rpds(schema: Map, rootschema: Map) -> HashTrieMap[str, Any]: @@ -1539,7 +1541,7 @@ def _hash_schema( """ if isinstance(schema, Mapping): schema = {k: v for k, v in schema.items() if k not in exclude} - return hash(json.dumps(schema, sort_keys=True)) + return hash(_HASH_ENCODER.encode(schema)) def _subclasses(cls: type[TSchemaBase]) -> Iterator[type[TSchemaBase]]: diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index c803063f5..939c07c46 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -257,6 +257,8 @@ def _validator_for(uri: str, /) -> Callable[..., Validator]: return tp +_HASH_ENCODER = json.JSONEncoder(sort_keys=True, separators=(",", ":")) + if Version(importlib_version("jsonschema")) >= Version("4.18"): from referencing import Registry from referencing.jsonschema import specification_with as _specification_with @@ -379,7 +381,7 @@ def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: elif len(root) == 1: k1 = "".join(f"{s!s}" for s in chain(*root.items())) else: - k1 = json.dumps(root, separators=(",", ":"), sort_keys=True) + k1 = _HASH_ENCODER.encode(root) return k1, dialect_id def resolve_references_rpds(schema: Map, rootschema: Map) -> HashTrieMap[str, Any]: @@ -1537,7 +1539,7 @@ def _hash_schema( """ if isinstance(schema, Mapping): schema = {k: v for k, v in schema.items() if k not in exclude} - return hash(json.dumps(schema, sort_keys=True)) + return hash(_HASH_ENCODER.encode(schema)) def _subclasses(cls: type[TSchemaBase]) -> Iterator[type[TSchemaBase]]: From a419ede8cda4a53a2919c8e312780b0980884cf9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 31 Aug 2024 20:05:16 +0100 Subject: [PATCH 77/92] perf: Avoid exception handling in `Chart.from_dict` --- 
altair/utils/schemapi.py | 11 +++++++++++ altair/vegalite/v5/api.py | 11 +++++------ tools/schemapi/schemapi.py | 11 +++++++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 14f88889b..e7a3dd88b 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1530,6 +1530,17 @@ def _is_iterable( return not isinstance(obj, exclude) and isinstance(obj, Iterable) +def _is_valid(spec: _JsonParameter, tp: type[SchemaBase], /) -> bool: + """ + Return True if ``tp`` can be constructed from ``spec``. + + Notes + ----- + Don't use this if you need to know *details* of the errors in ``spec``.. + """ + return next(_validator(tp._schema, tp._rootschema).iter_errors(spec), None) is None + + def _passthrough(*args: Any, **kwds: Any) -> Any | dict[str, Any]: return args[0] if args else kwds diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index a2940ecb8..c9d9d8b09 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -22,7 +22,6 @@ ) from typing_extensions import TypeAlias -import jsonschema import narwhals.stable.v1 as nw from altair import utils @@ -37,6 +36,7 @@ ) from altair.utils.data import DataType from altair.utils.data import is_data_type as _is_data_type +from altair.utils.schemapi import _is_valid from .compiler import vegalite_compilers from .data import data_transformers @@ -3724,14 +3724,13 @@ def from_dict( jsonschema.ValidationError : If ``validate`` and ``dct`` does not conform to the schema """ + if not validate: + return super().from_dict(dct, validate=False) _tp: Any for tp in TopLevelMixin.__subclasses__(): _tp = super() if tp is Chart else tp - # FIXME: Hot try/except - try: - return _tp.from_dict(dct, validate=validate) - except jsonschema.ValidationError: - pass + if _is_valid(dct, _tp): + return _tp.from_dict(dct, validate=False) # As a last resort, try using the Root vegalite object return t.cast(_TSchemaBase, 
core.Root.from_dict(dct, validate)) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 5cc697e07..dea8805e5 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1528,6 +1528,17 @@ def _is_iterable( return not isinstance(obj, exclude) and isinstance(obj, Iterable) +def _is_valid(spec: _JsonParameter, tp: type[SchemaBase], /) -> bool: + """ + Return True if ``tp`` can be constructed from ``spec``. + + Notes + ----- + Don't use this if you need to know *details* of the errors in ``spec``.. + """ + return next(_validator(tp._schema, tp._rootschema).iter_errors(spec), None) is None + + def _passthrough(*args: Any, **kwds: Any) -> Any | dict[str, Any]: return args[0] if args else kwds From 4a7cc4e32433452499fcabeef64a671736968a88 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 1 Sep 2024 13:21:17 +0100 Subject: [PATCH 78/92] chore(perf): Remove `# FIXME`(s) that had no effect --- altair/utils/core.py | 1 - altair/vegalite/v5/api.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/altair/utils/core.py b/altair/utils/core.py index a1d81c39e..8df22b154 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -830,7 +830,6 @@ def from_channels(cls, channels: ModuleType, /) -> _ChannelCache: @classmethod def from_cache(cls) -> _ChannelCache: global _CHANNEL_CACHE - # FIXME: Hot try/except try: cached = _CHANNEL_CACHE except NameError: diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py index c9d9d8b09..ae7428960 100644 --- a/altair/vegalite/v5/api.py +++ b/altair/vegalite/v5/api.py @@ -1803,7 +1803,6 @@ def to_dict( # noqa: C901 copy = _top_schema_base(self).copy(deep=False) original_data = getattr(copy, "data", Undefined) if not utils.is_undefined(original_data): - # FIXME: Hot try/except try: data = _to_eager_narwhals_dataframe(original_data) except TypeError: @@ -3419,7 +3418,6 @@ def _repr_mimebundle_(self, *args, **kwds) -> MimeBundleType | 
None: # type:ign """Return a MIME bundle for display in Jupyter frontends.""" # Catch errors explicitly to get around issues in Jupyter frontend # see https://github.com/ipython/ipython/issues/11038 - # FIXME: Hot try/except try: dct = self.to_dict(context={"pre_transform": False}) except Exception: From 9bc3ccd16bc648d678ac0d8b5ce6afdc48dd737a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 1 Sep 2024 13:23:05 +0100 Subject: [PATCH 79/92] chore: Remove `# FIXME`(s) that won't be addressed Maybe revisit lazy imports in the future, but for now this is too unrelated to the PR --- altair/utils/_importers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/altair/utils/_importers.py b/altair/utils/_importers.py index 93e647f33..14085ebcf 100644 --- a/altair/utils/_importers.py +++ b/altair/utils/_importers.py @@ -76,7 +76,6 @@ def vl_version_for_vl_convert() -> str: def import_pyarrow_interchange() -> ModuleType: min_version = "11.0.0" - # FIXME: Hot try/except try: version = importlib_version("pyarrow") @@ -103,7 +102,6 @@ def import_pyarrow_interchange() -> ModuleType: def pyarrow_available() -> bool: - # FIXME: Hot try/except try: import_pyarrow_interchange() return True From af480f55a9e67e7f1e4f8e08be9a6e0fb166b2b3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:01:38 +0100 Subject: [PATCH 80/92] refactor: Remove unused `_freeze` See 8ca426675379f4e65d025075c81bc099c6cdadb3 --- tools/schemapi/schemapi.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index dea8805e5..09c23d4ff 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -91,7 +91,8 @@ This URI is arbitrary and could be anything else. 
It just cannot be an empty string as we need to reference the schema registered in -the ``referencing.Registry``.""" +the ``referencing.Registry``. +""" _DEFAULT_DIALECT_URI: LiteralString = "http://json-schema.org/draft-07/schema#" """ @@ -1543,20 +1544,6 @@ def _passthrough(*args: Any, **kwds: Any) -> Any | dict[str, Any]: return args[0] if args else kwds -def _freeze(val): - # NOTE: No longer referenced - # - Previously only called during tests - # - Not during any library code - if isinstance(val, dict): - return frozenset((k, _freeze(v)) for k, v in val.items()) - elif isinstance(val, set): - return frozenset(_freeze(v) for v in val) - elif isinstance(val, (list, tuple)): - return tuple(_freeze(v) for v in val) - else: - return val - - def _hash_schema( schema: _JsonParameter, /, From dff817ba604270de6acf6ceb6904d4df83ebe839 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:21:00 +0100 Subject: [PATCH 81/92] chore: Remove comments Moving to discussion threads on the PR --- tools/schemapi/schemapi.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 09c23d4ff..5fd4c1385 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1366,8 +1366,6 @@ def from_dict( """ if validate: cls.validate(dct) - # NOTE: the breadth-first search occurs only once now - # `_FromDict` is purely ClassVar/classmethods converter: type[_FromDict] | _FromDict = ( _FromDict if _FromDict.hash_tps @@ -1707,14 +1705,9 @@ def from_dict( # noqa: C901 elif tp is not None: current_schema = tp._schema hash_schema = _hash_schema(current_schema) - # NOTE: the `current_schema` branch only triggered for mock schema tests: - # test_schemapi.py::[test_construct_multifaceted_schema, test_copy_method, test_round_trip, test_copy_module, test_from_dict, test_to_from_json, test_to_from_pickle] root_schema: dict[str, Any] = rootschema or tp._rootschema 
or current_schema target_tp = tp elif schema is not None: - # FIXME: This is the slow branch - # - Improving the perf of the `tp` one is too small scale - # - Every recursive `from_dict` call that isn't solved hits this current_schema = schema hash_schema = _hash_schema(current_schema) root_schema = rootschema or current_schema From 133aa01bd527555942446e719974d334e9cb5cda Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:22:19 +0100 Subject: [PATCH 82/92] build: run `generate-schema-wrapper` --- altair/utils/schemapi.py | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index e7a3dd88b..8a9876a53 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -93,7 +93,8 @@ This URI is arbitrary and could be anything else. It just cannot be an empty string as we need to reference the schema registered in -the ``referencing.Registry``.""" +the ``referencing.Registry``. 
+""" _DEFAULT_DIALECT_URI: LiteralString = "http://json-schema.org/draft-07/schema#" """ @@ -1367,8 +1368,6 @@ def from_dict( """ if validate: cls.validate(dct) - # NOTE: the breadth-first search occurs only once now - # `_FromDict` is purely ClassVar/classmethods converter: type[_FromDict] | _FromDict = ( _FromDict if _FromDict.hash_tps @@ -1545,20 +1544,6 @@ def _passthrough(*args: Any, **kwds: Any) -> Any | dict[str, Any]: return args[0] if args else kwds -def _freeze(val): - # NOTE: No longer referenced - # - Previously only called during tests - # - Not during any library code - if isinstance(val, dict): - return frozenset((k, _freeze(v)) for k, v in val.items()) - elif isinstance(val, set): - return frozenset(_freeze(v) for v in val) - elif isinstance(val, (list, tuple)): - return tuple(_freeze(v) for v in val) - else: - return val - - def _hash_schema( schema: _JsonParameter, /, @@ -1722,14 +1707,9 @@ def from_dict( # noqa: C901 elif tp is not None: current_schema = tp._schema hash_schema = _hash_schema(current_schema) - # NOTE: the `current_schema` branch only triggered for mock schema tests: - # test_schemapi.py::[test_construct_multifaceted_schema, test_copy_method, test_round_trip, test_copy_module, test_from_dict, test_to_from_json, test_to_from_pickle] root_schema: dict[str, Any] = rootschema or tp._rootschema or current_schema target_tp = tp elif schema is not None: - # FIXME: This is the slow branch - # - Improving the perf of the `tp` one is too small scale - # - Every recursive `from_dict` call that isn't solved hits this current_schema = schema hash_schema = _hash_schema(current_schema) root_schema = rootschema or current_schema From 141c8d14ee4fc07392c7a0ac89083404d4381d4e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:45:51 +0100 Subject: [PATCH 83/92] refactor: Collapse `...` following `:=` Not sure why I wrote this with `...` originally. 
--- altair/utils/schemapi.py | 4 +--- tools/schemapi/schemapi.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 8a9876a53..7e790273d 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1720,9 +1720,7 @@ def from_dict( # noqa: C901 raise ValueError(msg) from_dict = partial(cls.from_dict, rootschema=root_schema) - if resolved := cls.hash_resolved.get(hash_schema): - ... - else: + if (resolved := cls.hash_resolved.get(hash_schema)) is None: resolved = _resolve_references(current_schema, root_schema) cls.hash_resolved[hash_schema] = resolved if "anyOf" in resolved: diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 5fd4c1385..d1e43010f 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1718,9 +1718,7 @@ def from_dict( # noqa: C901 raise ValueError(msg) from_dict = partial(cls.from_dict, rootschema=root_schema) - if resolved := cls.hash_resolved.get(hash_schema): - ... - else: + if (resolved := cls.hash_resolved.get(hash_schema)) is None: resolved = _resolve_references(current_schema, root_schema) cls.hash_resolved[hash_schema] = resolved if "anyOf" in resolved: From fba3c46c0aa69a99d731c6a39a90ce396eae8863 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Sep 2024 17:18:38 +0100 Subject: [PATCH 84/92] chore: Remove temp notes from `_subclasses` --- altair/utils/schemapi.py | 26 +------------------------- tools/schemapi/schemapi.py | 26 +------------------------- 2 files changed, 2 insertions(+), 50 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 7e790273d..027e81556 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1568,31 +1568,7 @@ def _hash_schema( def _subclasses(cls: type[TSchemaBase]) -> Iterator[type[TSchemaBase]]: - """ - Breadth-first sequence of all classes which inherit from ``cls``. 
- - Notes - ----- - - `__subclasses__()` alone isn't helpful, as that is only immediate subclasses - - Deterministic - - Used for `SchemaBase` & `VegaLiteSchema` - - In practice, it provides an iterator over all classes in the schema below `VegaLiteSchema` - - The first one is `Root` - - The order itself, I don't think is important - - But probably important that it doesn't change - - Thinking they used an iterator so that the subclasses are evaluated after they have all been defined - - - `Chart` seems to try to avoid calling this - - Using `TopLevelMixin.__subclasses__()` first if possible - - It is always called during `Chart.encode()` - - Chart.encode() - - altair.utils.core.infer_encoding_types - - _ChannelCache.infer_encoding_types - - _ChannelCache._wrap_in_channel - - SchemaBase.from_dict (recursive, hot loop, validate =False, within a try/except) - - _FromDict(cls._default_wrapper_classes()) - - schemapi._subclasses(schema.core.VegaLiteSchema) - """ + """Breadth-first sequence of all classes which inherit from ``cls``.""" seen = set() current: set[type[TSchemaBase]] = {cls} while current: diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index d1e43010f..e309b084c 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1566,31 +1566,7 @@ def _hash_schema( def _subclasses(cls: type[TSchemaBase]) -> Iterator[type[TSchemaBase]]: - """ - Breadth-first sequence of all classes which inherit from ``cls``. 
- - Notes - ----- - - `__subclasses__()` alone isn't helpful, as that is only immediate subclasses - - Deterministic - - Used for `SchemaBase` & `VegaLiteSchema` - - In practice, it provides an iterator over all classes in the schema below `VegaLiteSchema` - - The first one is `Root` - - The order itself, I don't think is important - - But probably important that it doesn't change - - Thinking they used an iterator so that the subclasses are evaluated after they have all been defined - - - `Chart` seems to try to avoid calling this - - Using `TopLevelMixin.__subclasses__()` first if possible - - It is always called during `Chart.encode()` - - Chart.encode() - - altair.utils.core.infer_encoding_types - - _ChannelCache.infer_encoding_types - - _ChannelCache._wrap_in_channel - - SchemaBase.from_dict (recursive, hot loop, validate =False, within a try/except) - - _FromDict(cls._default_wrapper_classes()) - - schemapi._subclasses(schema.core.VegaLiteSchema) - """ + """Breadth-first sequence of all classes which inherit from ``cls``.""" seen = set() current: set[type[TSchemaBase]] = {cls} while current: From 44f52274696019c8706ba24488bee9f924d488d1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Sep 2024 17:22:38 +0100 Subject: [PATCH 85/92] chore: Remove outdated TODO --- altair/utils/schemapi.py | 2 -- tools/schemapi/schemapi.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 027e81556..14effd02b 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1143,8 +1143,6 @@ class SchemaBase: the _rootschema class attribute) which is used for validation. 
""" - # TODO: Implement `ClassVar` validation using https://peps.python.org/pep-0487/ - _schema: ClassVar[dict[str, Any] | Any] = None _rootschema: ClassVar[dict[str, Any] | Any] = None _class_is_valid_at_instantiation: ClassVar[bool] = True diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index e309b084c..f1f157901 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1141,8 +1141,6 @@ class SchemaBase: the _rootschema class attribute) which is used for validation. """ - # TODO: Implement `ClassVar` validation using https://peps.python.org/pep-0487/ - _schema: ClassVar[dict[str, Any] | Any] = None _rootschema: ClassVar[dict[str, Any] | Any] = None _class_is_valid_at_instantiation: ClassVar[bool] = True From 5d9fb651eff416b682a5d84bcbc30ccad611f45a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Sep 2024 17:28:32 +0100 Subject: [PATCH 86/92] refactor: Remove unused `resolve_references_rpds` Wasn't able to demonstrate a performance improvement --- altair/utils/schemapi.py | 17 ----------------- tools/schemapi/schemapi.py | 17 ----------------- 2 files changed, 34 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 14effd02b..3ce87b92e 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -271,7 +271,6 @@ def _validator_for(uri: str, /) -> Callable[..., Validator]: if TYPE_CHECKING: from referencing import Specification from referencing._core import Resolver - from rpds import HashTrieMap @lru_cache(maxsize=None) def specification_with(dialect_id: str, /) -> Specification[Any]: @@ -389,22 +388,6 @@ def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: k1 = _HASH_ENCODER.encode(root) return k1, dialect_id - def resolve_references_rpds(schema: Map, rootschema: Map) -> HashTrieMap[str, Any]: - """ - **Experimental** `rust`-speed returned type. - - Directly wraps `_resolve_references`. 
- - Idea - ---- - - Store the result of this when called from ``_FromDict.from_dict()`` once per unique call - - Reuse the resolved schema, since we don't mutate it after resolving - - Should reduce the cost of ``_FromDict.from_dict()``, when a schema has been seen before - """ - import rpds - - return rpds.HashTrieMap(_resolve_references(schema, rootschema)) - _REGISTRY_CACHE: dict[tuple[str, str], Registry[Any]] = {} else: diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index f1f157901..483fb88e5 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -269,7 +269,6 @@ def _validator_for(uri: str, /) -> Callable[..., Validator]: if TYPE_CHECKING: from referencing import Specification from referencing._core import Resolver - from rpds import HashTrieMap @lru_cache(maxsize=None) def specification_with(dialect_id: str, /) -> Specification[Any]: @@ -387,22 +386,6 @@ def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: k1 = _HASH_ENCODER.encode(root) return k1, dialect_id - def resolve_references_rpds(schema: Map, rootschema: Map) -> HashTrieMap[str, Any]: - """ - **Experimental** `rust`-speed returned type. - - Directly wraps `_resolve_references`. 
- - Idea - ---- - - Store the result of this when called from ``_FromDict.from_dict()`` once per unique call - - Reuse the resolved schema, since we don't mutate it after resolving - - Should reduce the cost of ``_FromDict.from_dict()``, when a schema has been seen before - """ - import rpds - - return rpds.HashTrieMap(_resolve_references(schema, rootschema)) - _REGISTRY_CACHE: dict[tuple[str, str], Registry[Any]] = {} else: From a26b3f16a3a947227c0b86742b25bdcc281686cf Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Sep 2024 18:34:08 +0100 Subject: [PATCH 87/92] refactor: Collect functions, global into `_Registry` --- altair/utils/schemapi.py | 129 +++++++++++++++++++++---------------- tools/schemapi/schemapi.py | 129 +++++++++++++++++++++---------------- 2 files changed, 146 insertions(+), 112 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 3ce87b92e..6f05ff19f 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -292,6 +292,74 @@ def specification_with(dialect_id: str, /) -> Specification[Any]: """ return _specification_with(dialect_id) + class _Registry: + """ + A cache of `Registry`_ (s). + + An instance named ``registry`` is used to wrap the `Registry`_ API, + with a managed cache. + + See Also + -------- + ``_Registry.__call__`` + + .. _Registry: + https://referencing.readthedocs.io/en/stable/api/#referencing.Registry + """ + + _cached: ClassVar[dict[tuple[str, str], Registry[Any]]] = {} + + @staticmethod + def compute_key(root: Map, dialect_id: str, /) -> tuple[str, str]: + """ + Generate a simple-minded hash to identify a registry. + + Notes + ----- + Why the strange hash? + - **All** generated schemas hit the ``"$ref"`` branch. + - ``api.Then`` hits the len(...) 1 branch w/ ``{"type": "object"}``. 
+ - Final branch is only hit by mock schemas in: + - `tests/utils/test_core.py::test_infer_encoding_types` + - `tests/utils/test_schemapi.py` + """ + if "$ref" in root: + k1 = root["$ref"] + elif len(root) == 1: + k1 = "".join(f"{s!s}" for s in chain(*root.items())) + else: + k1 = _HASH_ENCODER.encode(root) + return k1, dialect_id + + @classmethod + def update_cached( + cls, root: Map, dialect_id: str, resolver: Resolver[Any] + ) -> None: + cls._cached[cls.compute_key(root, dialect_id)] = resolver._registry + + def __call__(self, root: Map, dialect_id: str, /) -> Registry[Any]: + """ + Constructs a `Registry`_, adding the `Resource`_ produced by ``rootschema``. + + Requires at least ``jsonschema`` `v4.18.0a1`_. + + .. _Registry: + https://referencing.readthedocs.io/en/stable/api/#referencing.Registry + .. _Resource: + https://referencing.readthedocs.io/en/stable/api/#referencing.Resource + .. _v4.18.0a1: + https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + """ + cache_key = self.compute_key(root, dialect_id) + if (reg := self._cached.get(cache_key, None)) is not None: + return reg + resource = specification_with(dialect_id).create_resource(root) + reg = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource).crawl() + type(self)._cached[cache_key] = reg + return reg + + registry: _Registry = _Registry() + def _validator(schema: Map, rootschema: Map | None = None, /) -> Validator: """ Constructs a `Validator`_ for future validation. @@ -311,37 +379,9 @@ def _validator(schema: Map, rootschema: Map | None = None, /) -> Validator: # NOTE: This is the current version uri = _get_schema_dialect_uri(rootschema or schema) validator = _validator_for(uri) - registry = _registry(rootschema or schema, uri) - return validator(_prepare_references(schema), registry=registry) - - def _registry(rootschema: Map, dialect_id: str) -> Registry[Any]: - """ - Constructs a `Registry`_, adding the `Resource`_ produced by ``rootschema``. 
- - Requires at least ``jsonschema`` `v4.18.0a1`_. - - .. _Registry: - https://referencing.readthedocs.io/en/stable/api/#referencing.Registry - .. _Resource: - https://referencing.readthedocs.io/en/stable/api/#referencing.Resource - .. _v4.18.0a1: - https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 - """ - global _REGISTRY_CACHE - cache_key = _registry_comp_key(rootschema, dialect_id) - if (registry := _REGISTRY_CACHE.get(cache_key, None)) is not None: - return registry - else: - specification = specification_with(dialect_id) - resource = specification.create_resource(rootschema) - registry = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource).crawl() - _REGISTRY_CACHE[cache_key] = registry - return registry - - def _registry_update(root: Map, dialect_id: str, resolver: Resolver[Any]) -> None: - global _REGISTRY_CACHE - cache_key = _registry_comp_key(root, dialect_id) - _REGISTRY_CACHE[cache_key] = resolver._registry + return validator( + _prepare_references(schema), registry=registry(rootschema or schema, uri) + ) def _resolve_references(schema: Map, rootschema: Map) -> Map: """ @@ -359,36 +399,13 @@ def _resolve_references(schema: Map, rootschema: Map) -> Map: if ("$ref" not in root) or ("$ref" not in schema): return schema uri = _get_schema_dialect_uri(rootschema) - registry = _registry(root, uri) - resolver = registry.resolver(_VEGA_LITE_ROOT_URI) + resolver = registry(root, uri).resolver(_VEGA_LITE_ROOT_URI) while "$ref" in schema: resolved = resolver.lookup(schema["$ref"]) schema = resolved.contents - _registry_update(root, uri, resolved.resolver) + registry.update_cached(root, uri, resolved.resolver) return schema - def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: - """ - Generate a simple-minded hash to identify a registry. - - Notes - ----- - Why the strange hash? - - **All** generated schemas hit the ``"$ref"`` branch. - - ``api.Then`` hits the len(...) 1 branch w/ ``{"type": "object"}``. 
- - Final branch is only hit by mock schemas in: - - `tests/utils/test_core.py::test_infer_encoding_types` - - `tests/utils/test_schemapi.py` - """ - if "$ref" in root: - k1 = root["$ref"] - elif len(root) == 1: - k1 = "".join(f"{s!s}" for s in chain(*root.items())) - else: - k1 = _HASH_ENCODER.encode(root) - return k1, dialect_id - - _REGISTRY_CACHE: dict[tuple[str, str], Registry[Any]] = {} else: diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 483fb88e5..2320ceb1d 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -290,6 +290,74 @@ def specification_with(dialect_id: str, /) -> Specification[Any]: """ return _specification_with(dialect_id) + class _Registry: + """ + A cache of `Registry`_ (s). + + An instance named ``registry`` is used to wrap the `Registry`_ API, + with a managed cache. + + See Also + -------- + ``_Registry.__call__`` + + .. _Registry: + https://referencing.readthedocs.io/en/stable/api/#referencing.Registry + """ + + _cached: ClassVar[dict[tuple[str, str], Registry[Any]]] = {} + + @staticmethod + def compute_key(root: Map, dialect_id: str, /) -> tuple[str, str]: + """ + Generate a simple-minded hash to identify a registry. + + Notes + ----- + Why the strange hash? + - **All** generated schemas hit the ``"$ref"`` branch. + - ``api.Then`` hits the len(...) 1 branch w/ ``{"type": "object"}``. 
+ - Final branch is only hit by mock schemas in: + - `tests/utils/test_core.py::test_infer_encoding_types` + - `tests/utils/test_schemapi.py` + """ + if "$ref" in root: + k1 = root["$ref"] + elif len(root) == 1: + k1 = "".join(f"{s!s}" for s in chain(*root.items())) + else: + k1 = _HASH_ENCODER.encode(root) + return k1, dialect_id + + @classmethod + def update_cached( + cls, root: Map, dialect_id: str, resolver: Resolver[Any] + ) -> None: + cls._cached[cls.compute_key(root, dialect_id)] = resolver._registry + + def __call__(self, root: Map, dialect_id: str, /) -> Registry[Any]: + """ + Constructs a `Registry`_, adding the `Resource`_ produced by ``rootschema``. + + Requires at least ``jsonschema`` `v4.18.0a1`_. + + .. _Registry: + https://referencing.readthedocs.io/en/stable/api/#referencing.Registry + .. _Resource: + https://referencing.readthedocs.io/en/stable/api/#referencing.Resource + .. _v4.18.0a1: + https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 + """ + cache_key = self.compute_key(root, dialect_id) + if (reg := self._cached.get(cache_key, None)) is not None: + return reg + resource = specification_with(dialect_id).create_resource(root) + reg = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource).crawl() + type(self)._cached[cache_key] = reg + return reg + + registry: _Registry = _Registry() + def _validator(schema: Map, rootschema: Map | None = None, /) -> Validator: """ Constructs a `Validator`_ for future validation. @@ -309,37 +377,9 @@ def _validator(schema: Map, rootschema: Map | None = None, /) -> Validator: # NOTE: This is the current version uri = _get_schema_dialect_uri(rootschema or schema) validator = _validator_for(uri) - registry = _registry(rootschema or schema, uri) - return validator(_prepare_references(schema), registry=registry) - - def _registry(rootschema: Map, dialect_id: str) -> Registry[Any]: - """ - Constructs a `Registry`_, adding the `Resource`_ produced by ``rootschema``. 
- - Requires at least ``jsonschema`` `v4.18.0a1`_. - - .. _Registry: - https://referencing.readthedocs.io/en/stable/api/#referencing.Registry - .. _Resource: - https://referencing.readthedocs.io/en/stable/api/#referencing.Resource - .. _v4.18.0a1: - https://github.com/python-jsonschema/jsonschema/releases/tag/v4.18.0a1 - """ - global _REGISTRY_CACHE - cache_key = _registry_comp_key(rootschema, dialect_id) - if (registry := _REGISTRY_CACHE.get(cache_key, None)) is not None: - return registry - else: - specification = specification_with(dialect_id) - resource = specification.create_resource(rootschema) - registry = Registry().with_resource(_VEGA_LITE_ROOT_URI, resource).crawl() - _REGISTRY_CACHE[cache_key] = registry - return registry - - def _registry_update(root: Map, dialect_id: str, resolver: Resolver[Any]) -> None: - global _REGISTRY_CACHE - cache_key = _registry_comp_key(root, dialect_id) - _REGISTRY_CACHE[cache_key] = resolver._registry + return validator( + _prepare_references(schema), registry=registry(rootschema or schema, uri) + ) def _resolve_references(schema: Map, rootschema: Map) -> Map: """ @@ -357,36 +397,13 @@ def _resolve_references(schema: Map, rootschema: Map) -> Map: if ("$ref" not in root) or ("$ref" not in schema): return schema uri = _get_schema_dialect_uri(rootschema) - registry = _registry(root, uri) - resolver = registry.resolver(_VEGA_LITE_ROOT_URI) + resolver = registry(root, uri).resolver(_VEGA_LITE_ROOT_URI) while "$ref" in schema: resolved = resolver.lookup(schema["$ref"]) schema = resolved.contents - _registry_update(root, uri, resolved.resolver) + registry.update_cached(root, uri, resolved.resolver) return schema - def _registry_comp_key(root: Map, dialect_id: str, /) -> tuple[str, str]: - """ - Generate a simple-minded hash to identify a registry. - - Notes - ----- - Why the strange hash? - - **All** generated schemas hit the ``"$ref"`` branch. - - ``api.Then`` hits the len(...) 1 branch w/ ``{"type": "object"}``. 
- - Final branch is only hit by mock schemas in: - - `tests/utils/test_core.py::test_infer_encoding_types` - - `tests/utils/test_schemapi.py` - """ - if "$ref" in root: - k1 = root["$ref"] - elif len(root) == 1: - k1 = "".join(f"{s!s}" for s in chain(*root.items())) - else: - k1 = _HASH_ENCODER.encode(root) - return k1, dialect_id - - _REGISTRY_CACHE: dict[tuple[str, str], Registry[Any]] = {} else: From 82a81061a28e2554f45c40f4d3a902d84c9337fe Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:19:43 +0100 Subject: [PATCH 88/92] chore: Add `_is_valid` to `schemapi.__all__` Used in `Chart.from_dict` --- altair/utils/schemapi.py | 1 + tools/schemapi/schemapi.py | 1 + 2 files changed, 2 insertions(+) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 6f05ff19f..692e7a444 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -79,6 +79,7 @@ "SchemaBase", # altair.vegalite.v5.schema.core "Undefined", # altair.utils "UndefinedType", # altair.vegalite.v5.schema.core -> (side-effect relied on to propagate to alt.__init__) + "_is_valid", # altair.vegalite.v5.api "_resolve_references", # tools.schemapi.utils -> tools.generate_schema_wrapper "_subclasses", # altair.vegalite.v5.schema.core "is_undefined", # altair.typing diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index 2320ceb1d..f7ac217a4 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -77,6 +77,7 @@ "SchemaBase", # altair.vegalite.v5.schema.core "Undefined", # altair.utils "UndefinedType", # altair.vegalite.v5.schema.core -> (side-effect relied on to propagate to alt.__init__) + "_is_valid", # altair.vegalite.v5.api "_resolve_references", # tools.schemapi.utils -> tools.generate_schema_wrapper "_subclasses", # altair.vegalite.v5.schema.core "is_undefined", # altair.typing From af783b2031aedd12fb0c3074bbfd41d618be7236 Mon Sep 17 00:00:00 2001 From: dangotbanned 
<125183946+dangotbanned@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:38:41 +0100 Subject: [PATCH 89/92] chore: Remove more comments All were notes added earlier in PR, but not needed now --- altair/utils/schemapi.py | 14 -------------- tests/vegalite/v5/test_api.py | 3 --- tools/schemapi/schemapi.py | 14 -------------- 3 files changed, 31 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 692e7a444..41db4fee7 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1044,14 +1044,6 @@ def __init_subclass__( **kwds: Any, ) -> None: super().__init_subclass__(*args, **kwds) - # NOTE: `SchemaBase` itself would have no `_schema` or `_rootschema`, but won't be run through this - # FIXED: `VegaLiteSchema` has a `_rootschema` but no `_schema` - # FIXED: `Root` uses `VegaLiteSchema._rootschema`, for `_schema` and inherits the same for `_rootschema` - # FIXED: Both have only `_schema` - which is a type - # - `api.Then`: _schema = {"type": "object"} - # - `expr.core.Expression`: _schema = {"type": "string"} - # ---- - # All others either *only* define `_schema`, or inherit it when they are a channel if schema is None: if hasattr(cls, "_schema"): schema = cls._schema @@ -1061,7 +1053,6 @@ def __init_subclass__( "_schema class attribute is not defined." ) raise TypeError(msg) - if rootschema is None: if hasattr(cls, "_rootschema"): rootschema = cls._rootschema @@ -1070,11 +1061,6 @@ def __init_subclass__( else: msg = "`rootschema` must be provided if `schema` contains a `'$ref'` and does not inherit one." 
raise TypeError(msg) - - # NOTE: Inherit a `False`instead of overwriting with the default `True` - # - If a parent is not valid at init, then none of its subclasses can be - # - The current hierarchy does not support the inverse of this - # - Subclasses may declare they are not valid if valid_at_init is None: valid_at_init = cls._class_is_valid_at_instantiation cls._schema = schema diff --git a/tests/vegalite/v5/test_api.py b/tests/vegalite/v5/test_api.py index 48cbe0367..3278f8f62 100644 --- a/tests/vegalite/v5/test_api.py +++ b/tests/vegalite/v5/test_api.py @@ -1228,9 +1228,6 @@ def test_themes(): assert "config" not in chart.to_dict() -# TODO: Investigate alternative to looped try/except/pass -# - AFAIK it would speed up `Chart.from_dict()` -# - but maybe not central enough to have general impact def test_chart_from_dict() -> None: base = alt.Chart("data.csv").mark_point().encode(x="x:Q", y="y:Q") diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index f7ac217a4..ec5a6d21f 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1042,14 +1042,6 @@ def __init_subclass__( **kwds: Any, ) -> None: super().__init_subclass__(*args, **kwds) - # NOTE: `SchemaBase` itself would have no `_schema` or `_rootschema`, but won't be run through this - # FIXED: `VegaLiteSchema` has a `_rootschema` but no `_schema` - # FIXED: `Root` uses `VegaLiteSchema._rootschema`, for `_schema` and inherits the same for `_rootschema` - # FIXED: Both have only `_schema` - which is a type - # - `api.Then`: _schema = {"type": "object"} - # - `expr.core.Expression`: _schema = {"type": "string"} - # ---- - # All others either *only* define `_schema`, or inherit it when they are a channel if schema is None: if hasattr(cls, "_schema"): schema = cls._schema @@ -1059,7 +1051,6 @@ def __init_subclass__( "_schema class attribute is not defined." 
) raise TypeError(msg) - if rootschema is None: if hasattr(cls, "_rootschema"): rootschema = cls._rootschema @@ -1068,11 +1059,6 @@ def __init_subclass__( else: msg = "`rootschema` must be provided if `schema` contains a `'$ref'` and does not inherit one." raise TypeError(msg) - - # NOTE: Inherit a `False`instead of overwriting with the default `True` - # - If a parent is not valid at init, then none of its subclasses can be - # - The current hierarchy does not support the inverse of this - # - Subclasses may declare they are not valid if valid_at_init is None: valid_at_init = cls._class_is_valid_at_instantiation cls._schema = schema From 9a48448e334c53d2c431bb9a79eb397bc7c0e7d7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:52:11 +0100 Subject: [PATCH 90/92] refactor: Remove `_SchemaBasePEP487` & test suite WIll put this on another branch, won't improve performance so not relevant here https://github.com/vega/altair/pull/3547#issuecomment-2315914787 --- altair/utils/schemapi.py | 112 ------------------------- tests/utils/test_schemapi.py | 154 ----------------------------------- tools/schemapi/schemapi.py | 112 ------------------------- 3 files changed, 378 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 41db4fee7..37335ee0d 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1010,118 +1010,6 @@ def _deep_copy(obj: _CopyImpl | Any, by_ref: set[str]) -> _CopyImpl | Any: return obj -class _SchemaBasePEP487: - """Minimal demo for testing feasibility of `__init_subclass__`.""" - - _schema: ClassVar[dict[str, Any]] - _rootschema: ClassVar[dict[str, Any]] - _class_is_valid_at_instantiation: ClassVar[bool] = True - - def __init__(self, *args: Any, **kwds: Any) -> None: - if (kwds and args) or len(args) > 1: - name = type(self).__name__ - _args = ", ".join(f"{a!r}" for a in args) - _kwds = ", ".join(f"{k}={v!r}" for k, v in kwds.items()) - msg = 
( - f"Expected either:\n" - f" - a single arg with no kwds, for, e.g. {{'type': 'string'}}\n" - f" - zero args with zero or more kwds for {{'type': 'object'}}\n\n" - f"but got: {name}({_args}, {_kwds})" - ) - raise AssertionError(msg) - # use object.__setattr__ because we override setattr below. - self._args: tuple[Any, ...] - self._kwds: dict[str, Any] - object.__setattr__(self, "_args", args) - object.__setattr__(self, "_kwds", kwds) - - def __init_subclass__( - cls, - *args: Any, - schema: dict[str, Any] | None = None, - rootschema: dict[str, Any] | None = None, - valid_at_init: bool | None = None, - **kwds: Any, - ) -> None: - super().__init_subclass__(*args, **kwds) - if schema is None: - if hasattr(cls, "_schema"): - schema = cls._schema - else: - msg = ( - f"Cannot instantiate object of type {cls}: " - "_schema class attribute is not defined." - ) - raise TypeError(msg) - if rootschema is None: - if hasattr(cls, "_rootschema"): - rootschema = cls._rootschema - elif "$ref" not in schema: - rootschema = schema - else: - msg = "`rootschema` must be provided if `schema` contains a `'$ref'` and does not inherit one." - raise TypeError(msg) - if valid_at_init is None: - valid_at_init = cls._class_is_valid_at_instantiation - cls._schema = schema - cls._rootschema = rootschema - cls._class_is_valid_at_instantiation = valid_at_init - - @overload - def _get(self, attr: str, default: Optional = ...) -> Any | UndefinedType: ... - @overload - def _get(self, attr: str, default: T) -> Any | T: ... 
- def _get(self, attr: str, default: Optional[T] = Undefined) -> Any | T: - """Get an attribute, returning default if not present.""" - if (item := self._kwds.get(attr, Undefined)) is not Undefined: - return item - else: - return default - - def __dir__(self) -> list[str]: - return sorted(chain(super().__dir__(), self._kwds)) - - def __eq__(self, other: Any) -> bool: - return ( - type(self) is type(other) - and self._args == other._args - and self._kwds == other._kwds - ) - - def __getattr__(self, attr: str): - # reminder: getattr is called after the normal lookups - if attr == "_kwds": - raise AttributeError() - if attr in self._kwds: - return self._kwds[attr] - else: - return getattr(super(), "__getattr__", super().__getattribute__)(attr) - - def __getitem__(self, item: str) -> Any: - return self._kwds[item] - - def __setattr__(self, item: str, val: Any) -> None: - if item.startswith("_"): - # Setting an instances copy of a ClassVar modify that - # By default, this makes **another** copy and places in _kwds - object.__setattr__(self, item, val) - else: - self._kwds[item] = val - - def __setitem__(self, item: str, val: Any) -> None: - self._kwds[item] = val - - def __repr__(self) -> str: - name = type(self).__name__ - if kwds := self._kwds: - it = (f"{k}: {v!r}" for k, v in sorted(kwds.items()) if v is not Undefined) - args = ",\n".join(it).replace("\n", "\n ") - LB, RB = "{", "}" - return f"{name}({LB}\n {args}\n{RB})" - else: - return f"{name}({self._args[0]!r})" - - class SchemaBase: """ Base class for schema wrappers. 
diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index 1fce07009..3bc42a328 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -10,7 +10,6 @@ import warnings from collections import deque from functools import partial -from importlib.metadata import version as importlib_version from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Literal, Sequence import jsonschema @@ -19,7 +18,6 @@ import pandas as pd import polars as pl import pytest -from packaging.version import Version import altair as alt from altair import load_schema @@ -37,34 +35,6 @@ # try to use SchemaBase objects defined elsewhere as wrappers. -@pytest.fixture -def dummy_rootschema() -> dict[str, Any]: - return { - "$schema": _JSON_SCHEMA_DRAFT_URL, - "definitions": { - "StringMapping": { - "type": "object", - "additionalProperties": {"type": "string"}, - }, - "StringArray": {"type": "array", "items": {"type": "string"}}, - }, - "properties": { - "a": {"$ref": "#/definitions/StringMapping"}, - "a2": {"type": "object", "additionalProperties": {"type": "number"}}, - "b": {"$ref": "#/definitions/StringArray"}, - "b2": {"type": "array", "items": {"type": "number"}}, - "c": {"type": ["string", "number"]}, - "d": { - "anyOf": [ - {"$ref": "#/definitions/StringMapping"}, - {"$ref": "#/definitions/StringArray"}, - ] - }, - "e": {"items": [{"type": "string"}, {"type": "string"}]}, - }, - } - - def test_actual_json_schema_draft_is_same_as_hardcoded_default(): # See comments next to definition of `_DEFAULT_DIALECT_URI` # for details why we need this test @@ -75,130 +45,6 @@ def test_actual_json_schema_draft_is_same_as_hardcoded_default(): ) -def test_init_subclasses_hierarchy(dummy_rootschema) -> None: - if Version(importlib_version("jsonschema")) >= Version("4.18"): - from referencing.exceptions import Unresolvable - else: - from jsonschema.exceptions import ( # type: ignore[assignment] - RefResolutionError as Unresolvable, - ) - - from 
altair.expr.core import GetItemExpression, OperatorMixin - from altair.utils.schemapi import _SchemaBasePEP487 - - sch1 = _SchemaBasePEP487() - sch2 = _SchemaBasePEP487() - sch3 = _SchemaBasePEP487("blue") - sch4 = _SchemaBasePEP487("red") - sch5 = _SchemaBasePEP487(color="blue") - sch6 = _SchemaBasePEP487(color="red") - - with pytest.raises( - AssertionError, match=r"_SchemaBasePEP487\('blue', color='red'\)" - ): - _SchemaBasePEP487("blue", color="red") - - assert sch1 == sch2 - assert sch3 != sch4 - assert sch5 != sch6 - assert sch3 != sch5 - assert _SchemaBasePEP487("blue") == sch3 - assert _SchemaBasePEP487(color="red") == sch6 - with pytest.raises(AttributeError, match="_SchemaBasePEP487.+color"): - attempt = sch4.color is Undefined # noqa: F841 - - assert sch5.color == sch5["color"] == sch5._get("color") == "blue" - assert sch5._get("price") is Undefined - assert sch5._get("price", 999) == 999 - - assert _SchemaBasePEP487._class_is_valid_at_instantiation - sch6._class_is_valid_at_instantiation = False # type: ignore[misc] - assert ( - _SchemaBasePEP487._class_is_valid_at_instantiation - != sch6._class_is_valid_at_instantiation - ) - - with pytest.raises(TypeError, match="Test1PEP487.+ _schema"): - - class Test1PEP487(_SchemaBasePEP487): ... - - class Test2PEP487(_SchemaBasePEP487, schema={"type": "object"}): ... - - with pytest.raises( - TypeError, - match=r"`rootschema` must be provided if `schema` contains a `'\$ref'` and does not inherit one", - ): - - class Test3PEP487(_SchemaBasePEP487, schema={"$ref": "#/definitions/Bar"}): ... - - class RootParentPEP487(_SchemaBasePEP487, schema=dummy_rootschema): - @classmethod - def _default_wrapper_classes(cls) -> Iterator[type[Any]]: - return schemapi._subclasses(RootParentPEP487) - - class Root(RootParentPEP487): - """ - Root schema wrapper. - - A Vega-Lite top-level specification. This is the root class for all Vega-Lite - specifications. (The json schema is generated from this type.) 
- """ - - def __init__(self, *args, **kwds) -> None: - super().__init__(*args, **kwds) - - assert ( - Root._schema - == Root._rootschema - == RootParentPEP487._schema - == RootParentPEP487._rootschema - ) - - class StringMapping(Root, schema={"$ref": "#/definitions/StringMapping"}): ... - - class StringArray(Root, schema={"$ref": "#/definitions/StringArray"}): ... - - with pytest.raises( - jsonschema.ValidationError, - match=r"5 is not of type 'string'", - ): - schemapi.validate_jsonschema( - ["one", "two", 5], StringArray._schema, StringArray._rootschema - ) - - with pytest.raises(Unresolvable): - schemapi.validate_jsonschema(["one", "two", "three"], StringArray._schema) - - schemapi.validate_jsonschema( - ["one", "two", "three"], StringArray._schema, StringArray._rootschema - ) - - class Expression(OperatorMixin, _SchemaBasePEP487, schema={"type": "string"}): - def to_dict(self, *args, **kwargs): - return repr(self) - - def __setattr__(self, attr, val) -> None: - # We don't need the setattr magic defined in SchemaBase - return object.__setattr__(self, attr, val) - - def __getitem__(self, val): - return GetItemExpression(self, val) - - non_ref_mixin = Expression( - Expression("some").to_dict() + Expression("more").to_dict() - ) - schemapi.validate_jsonschema( - non_ref_mixin.to_dict(), non_ref_mixin._schema, non_ref_mixin._rootschema - ) - with pytest.raises( - jsonschema.ValidationError, - match=r"is not of type 'array'", - ): - schemapi.validate_jsonschema( - non_ref_mixin.to_dict(), StringArray._schema, StringArray._rootschema - ) - - class _TestSchema(SchemaBase): @classmethod def _default_wrapper_classes(cls): diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index ec5a6d21f..68a19b44a 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -1008,118 +1008,6 @@ def _deep_copy(obj: _CopyImpl | Any, by_ref: set[str]) -> _CopyImpl | Any: return obj -class _SchemaBasePEP487: - """Minimal demo for testing feasibility of 
`__init_subclass__`.""" - - _schema: ClassVar[dict[str, Any]] - _rootschema: ClassVar[dict[str, Any]] - _class_is_valid_at_instantiation: ClassVar[bool] = True - - def __init__(self, *args: Any, **kwds: Any) -> None: - if (kwds and args) or len(args) > 1: - name = type(self).__name__ - _args = ", ".join(f"{a!r}" for a in args) - _kwds = ", ".join(f"{k}={v!r}" for k, v in kwds.items()) - msg = ( - f"Expected either:\n" - f" - a single arg with no kwds, for, e.g. {{'type': 'string'}}\n" - f" - zero args with zero or more kwds for {{'type': 'object'}}\n\n" - f"but got: {name}({_args}, {_kwds})" - ) - raise AssertionError(msg) - # use object.__setattr__ because we override setattr below. - self._args: tuple[Any, ...] - self._kwds: dict[str, Any] - object.__setattr__(self, "_args", args) - object.__setattr__(self, "_kwds", kwds) - - def __init_subclass__( - cls, - *args: Any, - schema: dict[str, Any] | None = None, - rootschema: dict[str, Any] | None = None, - valid_at_init: bool | None = None, - **kwds: Any, - ) -> None: - super().__init_subclass__(*args, **kwds) - if schema is None: - if hasattr(cls, "_schema"): - schema = cls._schema - else: - msg = ( - f"Cannot instantiate object of type {cls}: " - "_schema class attribute is not defined." - ) - raise TypeError(msg) - if rootschema is None: - if hasattr(cls, "_rootschema"): - rootschema = cls._rootschema - elif "$ref" not in schema: - rootschema = schema - else: - msg = "`rootschema` must be provided if `schema` contains a `'$ref'` and does not inherit one." - raise TypeError(msg) - if valid_at_init is None: - valid_at_init = cls._class_is_valid_at_instantiation - cls._schema = schema - cls._rootschema = rootschema - cls._class_is_valid_at_instantiation = valid_at_init - - @overload - def _get(self, attr: str, default: Optional = ...) -> Any | UndefinedType: ... - @overload - def _get(self, attr: str, default: T) -> Any | T: ... 
- def _get(self, attr: str, default: Optional[T] = Undefined) -> Any | T: - """Get an attribute, returning default if not present.""" - if (item := self._kwds.get(attr, Undefined)) is not Undefined: - return item - else: - return default - - def __dir__(self) -> list[str]: - return sorted(chain(super().__dir__(), self._kwds)) - - def __eq__(self, other: Any) -> bool: - return ( - type(self) is type(other) - and self._args == other._args - and self._kwds == other._kwds - ) - - def __getattr__(self, attr: str): - # reminder: getattr is called after the normal lookups - if attr == "_kwds": - raise AttributeError() - if attr in self._kwds: - return self._kwds[attr] - else: - return getattr(super(), "__getattr__", super().__getattribute__)(attr) - - def __getitem__(self, item: str) -> Any: - return self._kwds[item] - - def __setattr__(self, item: str, val: Any) -> None: - if item.startswith("_"): - # Setting an instances copy of a ClassVar modify that - # By default, this makes **another** copy and places in _kwds - object.__setattr__(self, item, val) - else: - self._kwds[item] = val - - def __setitem__(self, item: str, val: Any) -> None: - self._kwds[item] = val - - def __repr__(self) -> str: - name = type(self).__name__ - if kwds := self._kwds: - it = (f"{k}: {v!r}" for k, v in sorted(kwds.items()) if v is not Undefined) - args = ",\n".join(it).replace("\n", "\n ") - LB, RB = "{", "}" - return f"{name}({LB}\n {args}\n{RB})" - else: - return f"{name}({self._args[0]!r})" - - class SchemaBase: """ Base class for schema wrappers. 
From 67642f06d7a1c4c28a05ac8c5dda78dd2f01a1d5 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:50:13 +0100 Subject: [PATCH 91/92] ci: Remove debugging `hatch` script --- pyproject.toml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 91fdb0218..bbfce770f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -138,13 +138,6 @@ test-slow = [ "pytest -p no:randomly -n logical --numprocesses=logical --doctest-modules tests altair -m \"slow\" {args}" ] -# Much more isolated, focused purely on a faster `schemapi.py` rebuild/test loop. -validation = [ - "mypy tools/schemapi/schemapi.py", - "python tools/generate_schema_wrapper.py", - "pytest -k test_schemapi tests {args}", -] - [tool.hatch.envs.hatch-test] # https://hatch.pypa.io/latest/tutorials/testing/overview/ features = ["all", "dev", "doc"] From 8002dab1f4ebea29183921ba9cd02a21e870f16c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:54:40 +0100 Subject: [PATCH 92/92] test: Remove `test_chart_validation_benchmark` Feeling I've squeezed out all the performance I can for now. 
Will add in a collpased comment on the PR for reference --- tests/utils/test_schemapi.py | 92 +----------------------------------- 1 file changed, 1 insertion(+), 91 deletions(-) diff --git a/tests/utils/test_schemapi.py b/tests/utils/test_schemapi.py index 3bc42a328..25f483753 100644 --- a/tests/utils/test_schemapi.py +++ b/tests/utils/test_schemapi.py @@ -10,7 +10,7 @@ import warnings from collections import deque from functools import partial -from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Literal, Sequence +from typing import TYPE_CHECKING, Any, Callable, Iterable, Sequence import jsonschema import jsonschema.exceptions @@ -882,96 +882,6 @@ def test_chart_validation_errors(chart_func, expected_error_message): chart.to_dict() -_SKIP_SLOW_BENCHMARKS: bool = True -_REPEAT_TIMES = 1000 - - -@pytest.mark.parametrize("to_or_from", ["to_dict-validate", "to_dict", "from_dict"]) -@pytest.mark.filterwarnings("ignore:.*:UserWarning") -@pytest.mark.skipif( - _SKIP_SLOW_BENCHMARKS, - reason="Should only be run in isolation to test single threaded performance.", -) -def test_chart_validation_benchmark( - to_or_from: Literal["to_dict-validate", "to_dict", "from_dict"], -) -> None: - """ - Intended to isolate `Chart.(to|from)_dict.` calls. - - Repeated ``_REPEAT_TIMES`` times, non-parametric: - - in an attempt to limit the potential overhead of ``pytest`` - - but enforce ``1`` thread, like a user-code would be. 
- - Results - ------- - ``` - _REPEAT_TIMES = 1000 - pytest -k test_chart_validation_benchmark --numprocesses=3 --durations=3 tests - - # Pre-`SchemaBase.from_dict` refactor (3.12.3) - 108.16s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] - 84.62s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] - 66.71s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] - - # Post-`SchemaBase.from_dict` refactor (3.12.3) - 107.84s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] - 50.43s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] - 67.07s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] - - # Post-`SchemaBase.__init_subclass__` addition (3.12.3) - 108.24s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] - 50.33s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] - 66.51s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] - - # Post-`dict` branch micro optimization in `_FromDict.from_dict` (3.12.3) - 107.90s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict-validate] - 49.63s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[from_dict] - 66.87s call tests/utils/test_schemapi.py::test_chart_validation_benchmark[to_dict] - ``` - """ - from itertools import chain, repeat - - if TYPE_CHECKING: - from altair.typing import ChartType - - def _iter_charts() -> Iterator[ChartType]: - """ - Ensures only len(chart_funcs_error_message) actual charts are constructed. - - The `to_dict` calls are what gets multiplied - """ - charts: list[ChartType] = [fn() for fn, _ in chart_funcs_error_message] - yield from chain.from_iterable(repeat(charts, times=_REPEAT_TIMES)) - - def _iter_chart_factory() -> Iterator[ChartType]: - """ - Validation not the bottleneck, but encode is. 
- - Ensures at least `times` * len(chart_funcs_error_message) .encode calls are made. - """ - chart_funcs: list[Callable[[], ChartType]] = [ - fn for fn, _ in chart_funcs_error_message - ] - for fn in chain.from_iterable(repeat(chart_funcs, times=_REPEAT_TIMES)): - yield fn() - - def _to_dict(validate: bool) -> None: - if validate: - for chart in _iter_charts(): - with pytest.raises(schemapi.SchemaValidationError): - chart.to_dict(validate=validate) - else: - for chart in _iter_charts(): - chart.to_dict(validate=validate) - - if to_or_from == "to_dict": - _to_dict(validate=False) - elif to_or_from == "to_dict-validate": - _to_dict(validate=True) - else: - assert list(_iter_chart_factory()) - - def test_multiple_field_strings_in_condition(): selection = alt.selection_point() expected_error_message = "A field cannot be used for both the `if_true` and `if_false` values of a condition. One of them has to specify a `value` or `datum` definition."