Skip to content

Commit

Permalink
Schema Update (#262)
Browse files Browse the repository at this point in the history
* implement schema validator migration

* port basic merge functionality

* small changes to resolve broken tests

* test in real schema

* give beta

* review changes, flake

* one more flake fix

* give real version

* small changes, additional test
  • Loading branch information
willronchetti authored Aug 4, 2023
1 parent 89ad813 commit 9df20a9
Show file tree
Hide file tree
Showing 43 changed files with 1,980 additions and 756 deletions.
13 changes: 12 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,29 @@ snovault
Change Log
----------

9.1.1
10.0.0
======

* Updates ``jsonschema`` version, removing dependency on ``jsonschema-serialize-fork`` and allowing
us to use ``$merge`` refs.
* Breaking Change: dependencies --> dependentRequired in schema
* Breaking Change: object serialization in schema no longer valid


9.1.1
=====

* Small fix for JWT Decode incompatible change

9.1.0
=====

* Fix for MIME type ordering in renderers.py (differs between cgap and fourfront).


9.0.0
=====

* Merge/unify ingestion and other code from cgap-portal and fourfront.


Expand Down
1,658 changes: 948 additions & 710 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicsnovault"
version = "9.1.1"
version = "10.0.0"
description = "Storage support for 4DN Data Portals."
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down Expand Up @@ -46,7 +46,6 @@ dcicutils = "^7.5.0"
future = ">=0.15.2,<1"
html5lib = ">=1.1" # experimental, should be OK now that we're not using moto server
humanfriendly = "^1.44.9"
jsonschema_serialize_fork = "^2.1.1"
netaddr = ">=0.8.0,<1"
passlib = "^1.7.4"
pillow = "^9.5.0"
Expand Down Expand Up @@ -83,6 +82,7 @@ xlrd = "^1.0.0"
"zope.deprecation" = "^4.4.0"
"zope.interface" = ">=4.7.2,<6"
"zope.sqlalchemy" = "1.6"
jsonschema = "^4.18.4"

[tool.poetry.dev-dependencies]
botocore-stubs = ">=1.29.119" # no particular version required, but this speeds up search
Expand Down
5 changes: 5 additions & 0 deletions snovault/elasticsearch/create_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
get_uuids_for_types,
SCAN_PAGE_SIZE,
)
from ..schema_utils import load_schema
from .interfaces import ELASTIC_SEARCH, INDEXER_QUEUE
from ..settings import Settings

Expand Down Expand Up @@ -104,6 +105,8 @@ def schema_mapping(field, schema, top_level=False, from_array=False):
TODO: rename 'lower_case_sort' to 'lowercase' and adjust search code
"""
ignored(top_level) # TODO: maybe wants to be used below, but isn't yet?
if '$merge' in schema:
schema = load_schema(schema)
type_ = schema['type']

# Elasticsearch handles multiple values for a field
Expand Down Expand Up @@ -715,6 +718,8 @@ def type_mapping(types, item_type, embed=True):
# to relevant fields so that they are not mapped into full_text, for example.
properties = schema['properties']
for _, sub_mapping in properties.items():
if '$merge' in sub_mapping:
sub_mapping = load_schema(sub_mapping)
if sub_mapping['type'] == 'text':
sub_mapping['copy_to'] = ['full_text']
return mapping
Expand Down
4 changes: 2 additions & 2 deletions snovault/elasticsearch/indexer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,8 +414,8 @@ def compute_invalidation_scope(context, request):
# Note 'type' is desired here because concrete types have literal type TypeInfo
# vs. abstract types have literal type AbstractTypeInfo
# isinstance() will return True (wrong) since TypeInfo inherits from AbstractTypeInfo
if type(request.registry[TYPES][source_type]) == AbstractTypeInfo or \
type(request.registry[TYPES][target_type]) == AbstractTypeInfo:
if type(request.registry[TYPES][source_type]) is AbstractTypeInfo or \
type(request.registry[TYPES][target_type]) is AbstractTypeInfo:
raise HTTPBadRequest('One or more of your types is abstract! %s/%s' % (source_type, target_type))
source_type_schema = request.registry[TYPES][source_type].schema
result = {
Expand Down
2 changes: 1 addition & 1 deletion snovault/schema_formats.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re

from jsonschema_serialize_fork import FormatChecker
from jsonschema import FormatChecker
from .server_defaults import (
ACCESSION_PREFIX,
ACCESSION_TEST_PREFIX,
Expand Down
106 changes: 88 additions & 18 deletions snovault/schema_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@

from datetime import datetime
from dcicutils.misc_utils import ignored
from jsonschema_serialize_fork import (
Draft4Validator,
FormatChecker,
RefResolver,
)
from jsonschema_serialize_fork.exceptions import ValidationError
from snovault.schema_validation import SerializingSchemaValidator
from jsonschema import FormatChecker
from jsonschema import RefResolver
from jsonschema.exceptions import ValidationError
import os
from pyramid.path import AssetResolver, caller_package
from pyramid.threadlocal import get_current_request
Expand Down Expand Up @@ -42,7 +40,13 @@ def server_default(func):

class NoRemoteResolver(RefResolver):
    """ RefResolver that refuses network lookups but still resolves file-based
        refs, so we can cross reference across our own repos, which now
        contain base schemas we may want to use.
    """
    def resolve_remote(self, uri):
        # Check the URL *scheme* rather than a bare substring: 'http' in uri
        # would also reject local paths that merely contain "http" in a
        # filename, and 'http' already matches every 'https' uri.
        if uri.startswith(('http://', 'https://', 'ftp://', 'sftp://')):
            raise ValueError(f'Resolution disallowed for: {uri}')
        # Local (file) refs are allowed and loaded as schemas.
        return load_schema(uri)


def favor_app_specific_schema(schema: str) -> str:
Expand Down Expand Up @@ -82,13 +86,13 @@ def favor_app_specific_schema_ref(schema_ref: str) -> str:
def json_file_contains_element(json_filename: str, json_element: str) -> bool:
"""
If the given JSON file exists and contains the given JSON element name then
returns True, otherwise returnes False. The given JSON element may or may
returns True; otherwise returns False. The given JSON element may or may
not begin with a slash. Currently only looks at one single top-level element.
"""
if json_filename and json_element:
try:
with io.open(json_filename, "r") as json_f:
json_content = json.load(json_f)
json_content = json.load(json_f)
json_element = json_element.strip("/")
if json_element:
if json_content.get(json_element):
Expand All @@ -111,6 +115,64 @@ def json_file_contains_element(json_filename: str, json_element: str) -> bool:
return schema_ref


def resolve_merge_ref(ref, resolver):
    """ Dereference a single $merge ref through *resolver*.

    :param ref: the ref string to resolve
    :param resolver: a jsonschema RefResolver (or compatible object)
    :returns: the resolved target, which must be a dict
    :raises ValueError: if the ref resolves to anything other than a dict
    """
    with resolver.resolving(ref) as resolved:
        if isinstance(resolved, dict):
            result = resolved
        else:
            raise ValueError(
                f'Schema ref {ref} must resolve dict, not {type(resolved)}'
            )
    return result


def _update_resolved_data(resolved_data, value, resolver):
    """ Merge the target of one $merge ref (*value*) into *resolved_data*.

    The ref target is required to be a dict (enforced by resolve_merge_ref);
    nested refs inside it are expanded first, so the merged content is fully
    resolved.
    """
    target = resolve_merge_ref(value, resolver)
    expanded = resolve_merge_refs(target, resolver)
    resolved_data.update(expanded)


def _handle_list_or_string_value(resolved_data, value, resolver):
    """ Fold one or many $merge refs into *resolved_data*.

    *value* may be a single ref string or a list of ref strings; each is
    resolved and merged in order, so later refs win on key conflicts.
    """
    refs = value if isinstance(value, list) else [value]
    for ref in refs:
        _update_resolved_data(resolved_data, ref, resolver)


def resolve_merge_refs(data, resolver):
    """ Recursively expand $merge refs in *data*, returning a copy.

    Dicts and lists are rebuilt (so callers get fresh containers); scalar
    JSON values (string, number, boolean, null) are shared as-is, since
    they are immutable — tuples, sets, functions, classes, etc. would need
    a deep copy and are assumed not to occur.
    """
    if isinstance(data, dict):
        expanded = {}
        for key, val in data.items():
            if key == '$merge':
                # Splice the ref target(s) into this mapping in place of
                # the '$merge' key itself.
                _handle_list_or_string_value(expanded, val, resolver)
            else:
                expanded[key] = resolve_merge_refs(val, resolver)
        return expanded
    if isinstance(data, list):
        return [resolve_merge_refs(item, resolver) for item in data]
    return data


def fill_in_schema_merge_refs(schema, resolver):
    """ Expand all $merge properties in *schema* (custom $ref-like
        implementation from IGVF SNO2-6) and return the resolved copy.
    """
    return resolve_merge_refs(schema, resolver)


def mixinSchemas(schema, resolver, key_name='properties'):
mixinKeyName = 'mixin' + key_name.capitalize()
mixins = schema.get(mixinKeyName)
Expand Down Expand Up @@ -210,10 +272,6 @@ def linkTo(validator, linkTo, instance, schema):
yield ValidationError(error)
return

# And normalize the value to a uuid
if validator._serialize:
validator._validated[-1] = str(item.uuid)


class IgnoreUnchanged(ValidationError):
pass
Expand Down Expand Up @@ -287,8 +345,8 @@ def schema_is_array_of_objects(schema):
yield ValidationError('submission of calculatedProperty disallowed')


class SchemaValidator(Draft4Validator):
VALIDATORS = Draft4Validator.VALIDATORS.copy()
class SchemaValidator(SerializingSchemaValidator):
VALIDATORS = SerializingSchemaValidator.VALIDATORS.copy()
VALIDATORS['calculatedProperty'] = calculatedProperty
VALIDATORS['linkTo'] = linkTo
VALIDATORS['permission'] = permission
Expand Down Expand Up @@ -320,9 +378,10 @@ def load_schema(filename):
),
resolver, 'columns'
)
schema = fill_in_schema_merge_refs(schema, resolver)

# SchemaValidator is not thread safe for now
SchemaValidator(schema, resolver=resolver, serialize=True)
SchemaValidator(schema, resolver=resolver)
return schema


Expand All @@ -344,7 +403,7 @@ def validate(schema, data, current=None, validate_current=False):
dict validated contents, list of errors
"""
resolver = NoRemoteResolver.from_schema(schema)
sv = SchemaValidator(schema, resolver=resolver, serialize=True, format_checker=format_checker)
sv = SchemaValidator(schema, resolver=resolver, format_checker=format_checker)
validated, errors = sv.serialize(data)
# validate against current contents if validate_current is set
if current and validate_current:
Expand Down Expand Up @@ -381,6 +440,18 @@ def validate(schema, data, current=None, validate_current=False):
# Right now those other arms set seemingly-unused variables. -kmp 7-Aug-2022
if validated_value == current_value:
continue # value is unchanged between data/current; ignore
# Also ignore requestMethod and permission errors from defaults.
if isinstance(error, IgnoreUnchanged):
current_value = data
try:
for key in error.path:
# If it's in original data then either user passed it in
# or it's from PATCH object with unchanged data. If it's
# unchanged then it's already been skipped above.
current_value = current_value[key]
except KeyError:
# If it's not in original data then it's filled in by defaults.
continue
filtered_errors.append(error)

return validated, filtered_errors
Expand Down Expand Up @@ -452,7 +523,6 @@ def combine_schemas(a, b):

# for integrated tests
def utc_now_str():
    """Return the current UTC time as an ISO 8601 string with a '+00:00' offset.

    jsonschema's 'date-time' format check requires an explicit timezone
    offset, so we build a timezone-aware datetime rather than the naive
    (and deprecated since Python 3.12) ``datetime.utcnow()``; the output
    format is unchanged: ``YYYY-MM-DDTHH:MM:SS.ffffff+00:00``.
    """
    # Local import: the module only does `from datetime import datetime`.
    from datetime import timezone
    return datetime.now(timezone.utc).isoformat()


Expand Down
Loading

0 comments on commit 9df20a9

Please sign in to comment.