Skip to content

Commit

Permalink
Schema Update (#262)
Browse files Browse the repository at this point in the history
* implement schema validator migration

* port basic merge functionality

* small changes to resolve broken tests

* test in real schema

* give beta

* review changes, flake

* one more flake fix

* give real version

* small changes, additional test
  • Loading branch information
willronchetti authored Aug 4, 2023
1 parent 89ad813 commit 9df20a9
Show file tree
Hide file tree
Showing 43 changed files with 1,980 additions and 756 deletions.
13 changes: 12 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,29 @@ snovault
Change Log
----------

9.1.1
10.0.0
======

* Updates ``jsonschema`` version, removing dependency on ``jsonschema-serialize-fork`` and allowing
us to use ``$merge`` refs.
* Breaking Change: dependencies --> dependentRequired in schema
* Breaking Change: object serialization in schema no longer valid


9.1.1
=====

* Small fix for JWT Decode incompatible change

9.1.0
=====

* Fix for MIME type ordering in renderers.py (differs between cgap and fourfront).


9.0.0
=====

* Merge/unify ingestion and other code from cgap-portal and fourfront.


Expand Down
1,658 changes: 948 additions & 710 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicsnovault"
version = "9.1.1"
version = "10.0.0"
description = "Storage support for 4DN Data Portals."
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down Expand Up @@ -46,7 +46,6 @@ dcicutils = "^7.5.0"
future = ">=0.15.2,<1"
html5lib = ">=1.1" # experimental, should be OK now that we're not using moto server
humanfriendly = "^1.44.9"
jsonschema_serialize_fork = "^2.1.1"
netaddr = ">=0.8.0,<1"
passlib = "^1.7.4"
pillow = "^9.5.0"
Expand Down Expand Up @@ -83,6 +82,7 @@ xlrd = "^1.0.0"
"zope.deprecation" = "^4.4.0"
"zope.interface" = ">=4.7.2,<6"
"zope.sqlalchemy" = "1.6"
jsonschema = "^4.18.4"

[tool.poetry.dev-dependencies]
botocore-stubs = ">=1.29.119" # no particular version required, but this speeds up search
Expand Down
5 changes: 5 additions & 0 deletions snovault/elasticsearch/create_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
get_uuids_for_types,
SCAN_PAGE_SIZE,
)
from ..schema_utils import load_schema
from .interfaces import ELASTIC_SEARCH, INDEXER_QUEUE
from ..settings import Settings

Expand Down Expand Up @@ -104,6 +105,8 @@ def schema_mapping(field, schema, top_level=False, from_array=False):
TODO: rename 'lower_case_sort' to 'lowercase' and adjust search code
"""
ignored(top_level) # TODO: maybe wants to be used below, but isn't yet?
if '$merge' in schema:
schema = load_schema(schema)
type_ = schema['type']

# Elasticsearch handles multiple values for a field
Expand Down Expand Up @@ -715,6 +718,8 @@ def type_mapping(types, item_type, embed=True):
# to relevant fields so that they are not mapped into full_text, for example.
properties = schema['properties']
for _, sub_mapping in properties.items():
if '$merge' in sub_mapping:
sub_mapping = load_schema(sub_mapping)
if sub_mapping['type'] == 'text':
sub_mapping['copy_to'] = ['full_text']
return mapping
Expand Down
4 changes: 2 additions & 2 deletions snovault/elasticsearch/indexer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,8 +414,8 @@ def compute_invalidation_scope(context, request):
# Note 'type' is desired here because concrete types have literal type TypeInfo
# vs. abstract types have literal type AbstractTypeInfo
# isinstance() will return True (wrong) since TypeInfo inherits from AbstractTypeInfo
if type(request.registry[TYPES][source_type]) == AbstractTypeInfo or \
type(request.registry[TYPES][target_type]) == AbstractTypeInfo:
if type(request.registry[TYPES][source_type]) is AbstractTypeInfo or \
type(request.registry[TYPES][target_type]) is AbstractTypeInfo:
raise HTTPBadRequest('One or more of your types is abstract! %s/%s' % (source_type, target_type))
source_type_schema = request.registry[TYPES][source_type].schema
result = {
Expand Down
2 changes: 1 addition & 1 deletion snovault/schema_formats.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re

from jsonschema_serialize_fork import FormatChecker
from jsonschema import FormatChecker
from .server_defaults import (
ACCESSION_PREFIX,
ACCESSION_TEST_PREFIX,
Expand Down
106 changes: 88 additions & 18 deletions snovault/schema_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@

from datetime import datetime
from dcicutils.misc_utils import ignored
from jsonschema_serialize_fork import (
Draft4Validator,
FormatChecker,
RefResolver,
)
from jsonschema_serialize_fork.exceptions import ValidationError
from snovault.schema_validation import SerializingSchemaValidator
from jsonschema import FormatChecker
from jsonschema import RefResolver
from jsonschema.exceptions import ValidationError
import os
from pyramid.path import AssetResolver, caller_package
from pyramid.threadlocal import get_current_request
Expand Down Expand Up @@ -42,7 +40,13 @@ def server_default(func):

class NoRemoteResolver(RefResolver):
    """ RefResolver that refuses network lookups but still resolves file-based
        refs, so we can cross reference across our own repos, which now
        contain base schemas we may want to use.
    """
    def resolve_remote(self, uri):
        # Check the URL *scheme* rather than a bare substring: 'http' in uri
        # would also reject local paths that merely contain "http" in a
        # filename, and 'http' already matches every 'https' uri.
        if uri.startswith(('http://', 'https://', 'ftp://', 'sftp://')):
            raise ValueError(f'Resolution disallowed for: {uri}')
        # Local (file) refs are allowed and loaded as schemas.
        return load_schema(uri)


def favor_app_specific_schema(schema: str) -> str:
Expand Down Expand Up @@ -82,13 +86,13 @@ def favor_app_specific_schema_ref(schema_ref: str) -> str:
def json_file_contains_element(json_filename: str, json_element: str) -> bool:
"""
If the given JSON file exists and contains the given JSON element name then
returns True, otherwise returnes False. The given JSON element may or may
returns True; otherwise returns False. The given JSON element may or may
not begin with a slash. Currently only looks at one single top-level element.
"""
if json_filename and json_element:
try:
with io.open(json_filename, "r") as json_f:
json_content = json.load(json_f)
json_content = json.load(json_f)
json_element = json_element.strip("/")
if json_element:
if json_content.get(json_element):
Expand All @@ -111,6 +115,64 @@ def json_file_contains_element(json_filename: str, json_element: str) -> bool:
return schema_ref


def resolve_merge_ref(ref, resolver):
    """ Dereference a single $merge ref through *resolver*.

    :param ref: the ref string to resolve
    :param resolver: a jsonschema RefResolver (or compatible object)
    :returns: the resolved target, which must be a dict
    :raises ValueError: if the ref resolves to anything other than a dict
    """
    with resolver.resolving(ref) as resolved:
        if isinstance(resolved, dict):
            result = resolved
        else:
            raise ValueError(
                f'Schema ref {ref} must resolve dict, not {type(resolved)}'
            )
    return result


def _update_resolved_data(resolved_data, value, resolver):
    """ Merge the target of one $merge ref (*value*) into *resolved_data*.

    The ref target is required to be a dict (enforced by resolve_merge_ref);
    nested refs inside it are expanded first, so the merged content is fully
    resolved.
    """
    target = resolve_merge_ref(value, resolver)
    expanded = resolve_merge_refs(target, resolver)
    resolved_data.update(expanded)


def _handle_list_or_string_value(resolved_data, value, resolver):
    """ Fold one or many $merge refs into *resolved_data*.

    *value* may be a single ref string or a list of ref strings; each is
    resolved and merged in order, so later refs win on key conflicts.
    """
    refs = value if isinstance(value, list) else [value]
    for ref in refs:
        _update_resolved_data(resolved_data, ref, resolver)


def resolve_merge_refs(data, resolver):
    """ Recursively expand $merge refs in *data*, returning a copy.

    Dicts and lists are rebuilt (so callers get fresh containers); scalar
    JSON values (string, number, boolean, null) are shared as-is, since
    they are immutable — tuples, sets, functions, classes, etc. would need
    a deep copy and are assumed not to occur.
    """
    if isinstance(data, dict):
        expanded = {}
        for key, val in data.items():
            if key == '$merge':
                # Splice the ref target(s) into this mapping in place of
                # the '$merge' key itself.
                _handle_list_or_string_value(expanded, val, resolver)
            else:
                expanded[key] = resolve_merge_refs(val, resolver)
        return expanded
    if isinstance(data, list):
        return [resolve_merge_refs(item, resolver) for item in data]
    return data


def fill_in_schema_merge_refs(schema, resolver):
    """ Expand all $merge properties in *schema* (custom $ref-like
        implementation from IGVF SNO2-6) and return the resolved copy.
    """
    return resolve_merge_refs(schema, resolver)


def mixinSchemas(schema, resolver, key_name='properties'):
mixinKeyName = 'mixin' + key_name.capitalize()
mixins = schema.get(mixinKeyName)
Expand Down Expand Up @@ -210,10 +272,6 @@ def linkTo(validator, linkTo, instance, schema):
yield ValidationError(error)
return

# And normalize the value to a uuid
if validator._serialize:
validator._validated[-1] = str(item.uuid)


class IgnoreUnchanged(ValidationError):
pass
Expand Down Expand Up @@ -287,8 +345,8 @@ def schema_is_array_of_objects(schema):
yield ValidationError('submission of calculatedProperty disallowed')


class SchemaValidator(Draft4Validator):
VALIDATORS = Draft4Validator.VALIDATORS.copy()
class SchemaValidator(SerializingSchemaValidator):
VALIDATORS = SerializingSchemaValidator.VALIDATORS.copy()
VALIDATORS['calculatedProperty'] = calculatedProperty
VALIDATORS['linkTo'] = linkTo
VALIDATORS['permission'] = permission
Expand Down Expand Up @@ -320,9 +378,10 @@ def load_schema(filename):
),
resolver, 'columns'
)
schema = fill_in_schema_merge_refs(schema, resolver)

# SchemaValidator is not thread safe for now
SchemaValidator(schema, resolver=resolver, serialize=True)
SchemaValidator(schema, resolver=resolver)
return schema


Expand All @@ -344,7 +403,7 @@ def validate(schema, data, current=None, validate_current=False):
dict validated contents, list of errors
"""
resolver = NoRemoteResolver.from_schema(schema)
sv = SchemaValidator(schema, resolver=resolver, serialize=True, format_checker=format_checker)
sv = SchemaValidator(schema, resolver=resolver, format_checker=format_checker)
validated, errors = sv.serialize(data)
# validate against current contents if validate_current is set
if current and validate_current:
Expand Down Expand Up @@ -381,6 +440,18 @@ def validate(schema, data, current=None, validate_current=False):
# Right now those other arms set seemingly-unused variables. -kmp 7-Aug-2022
if validated_value == current_value:
continue # value is unchanged between data/current; ignore
# Also ignore requestMethod and permission errors from defaults.
if isinstance(error, IgnoreUnchanged):
current_value = data
try:
for key in error.path:
# If it's in original data then either user passed it in
# or it's from PATCH object with unchanged data. If it's
# unchanged then it's already been skipped above.
current_value = current_value[key]
except KeyError:
# If it's not in original data then it's filled in by defaults.
continue
filtered_errors.append(error)

return validated, filtered_errors
Expand Down Expand Up @@ -452,7 +523,6 @@ def combine_schemas(a, b):

# for integrated tests
def utc_now_str():
    """Return the current UTC time as an ISO 8601 string with a '+00:00' offset.

    jsonschema's 'date-time' format check requires an explicit timezone
    offset, so we build a timezone-aware datetime rather than the naive
    (and deprecated since Python 3.12) ``datetime.utcnow()``; the output
    format is unchanged: ``YYYY-MM-DDTHH:MM:SS.ffffff+00:00``.
    """
    # Local import: the module only does `from datetime import datetime`.
    from datetime import timezone
    return datetime.now(timezone.utc).isoformat()


Expand Down
Loading

0 comments on commit 9df20a9

Please sign in to comment.