diff --git a/.devcontainer/.dev_config.yaml b/.devcontainer/.dev_config.yaml index 5536f01..52a3d97 100644 --- a/.devcontainer/.dev_config.yaml +++ b/.devcontainer/.dev_config.yaml @@ -3,13 +3,19 @@ db_name: metadata-store searchable_classes: Dataset: description: Dataset grouping files under controlled access. - facetable_properties: - - key: type # a property directly part of the dataset + facetable_fields: + - key: type # a field directly part of the dataset name: Type - - key: "study.type" # a property that is part of study that is embedded into this dataset + - key: "study.type" # a field that is part of study that is embedded into this dataset name: Study Type - - key: "study.project.alias" # a property part of a deeply embedded resource + - key: "study.project.alias" # a field part of a deeply embedded resource name: Project Alias + selected_fields: + - key: accession + name: Dataset ID + - key: title + name: Title + resource_change_event_topic: searchable_resources resource_deletion_event_type: searchable_resource_deleted resource_upsertion_event_type: searchable_resource_upserted diff --git a/.devcontainer/dev_launcher b/.devcontainer/dev_launcher index b5bbc35..b4b22e1 100755 --- a/.devcontainer/dev_launcher +++ b/.devcontainer/dev_launcher @@ -1,3 +1,3 @@ #!/bin/bash -mass +mass run-rest diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 47fbf10..7a7afc4 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -1,18 +1,10 @@ -version: '3' - services: app: build: context: . dockerfile: ./Dockerfile args: - # [Choice] Python version: 3, 3.8, 3.7, 3.6 - VARIANT: 3.9 - # [Choice] Install Node.js - INSTALL_NODE: "true" - NODE_VERSION: "lts/*" - # Please adapt to package name: - PACKAGE_NAME: "mass" + PACKAGE_NAME: mass # On Linux, you may need to update USER_UID and USER_GID below if not your local UID is not 1000. 
USER_UID: 1000 USER_GID: 1000 @@ -33,18 +25,26 @@ services: environment: # Please adapt to package name: MASS_CONFIG_YAML: /workspace/.devcontainer/.dev_config.yaml - # Used by db migration: - DB_URL: postgresql://postgres:postgres@postgresql/postgres + # Use "forwardPorts" in **devcontainer.json** to forward an app port locally. # (Adding the "ports" property to this file will not forward from a Codespace.) - - # Please remove service dependencies that are not needed: mongodb: image: mongo:latest restart: unless-stopped volumes: - mongo_fs:/data/db + mongo-express: + image: mongo-express:latest + restart: unless-stopped + ports: + - 8088:8081 + environment: + ME_CONFIG_MONGODB_URL: mongodb://mongodb:27017/ + ME_CONFIG_BASICAUTH_USERNAME: dev + ME_CONFIG_BASICAUTH_PASSWORD: dev + ME_CONFIG_MONGODB_ENABLE_ADMIN: "true" + volumes: mongo_fs: {} diff --git a/.pyproject_generation/pyproject_custom.toml b/.pyproject_generation/pyproject_custom.toml index 85d0d20..3fad6dc 100644 --- a/.pyproject_generation/pyproject_custom.toml +++ b/.pyproject_generation/pyproject_custom.toml @@ -1,11 +1,11 @@ [project] name = "mass" -version = "2.1.0" -description = "Metadata Artifact Search Service - A service for searching metadata artifacts and filtering results." +version = "3.0.0" +description = "Metadata Artifact Search Service - A service for searching metadata artifacts and filtering results." dependencies = [ "typer>=0.12", - "ghga-service-commons[api]>=3.0.0", - "ghga-event-schemas>=2.0.0", + "ghga-service-commons[api]>=3.1.5", + "ghga-event-schemas>=3.1.1", "hexkit[mongodb,akafka]>=3.5.0", ] diff --git a/.readme_generation/description.md b/.readme_generation/description.md index 38f41f1..8cd1d98 100644 --- a/.readme_generation/description.md +++ b/.readme_generation/description.md @@ -1,9 +1,13 @@ The Metadata Artifact Search Service uses search parameters to look for metadata. 
### Quick Overview of API -There are two available API endpoints that follow the RPC pattern (not REST): -One endpoint ("GET /rpc/search-options") will return an overview of all metadata classes that can be targeted -by a search. The actual search endpoint ("POST /rpc/search") can be used to search for these target classes using keywords. Hits will be reported in the context of the selected target class. +The API provides two not strictly RESTful endpoints: + +One endpoint ("GET /search-options") will return an overview of all metadata classes +that can be targeted by a search. + +The actual search endpoint ("GET /search") can be used to search for these target classes +using keywords. Hits will be reported in the context of the selected target class. This means that target classes will be reported that match the specified search query, however, the target class might contain embedded other classes and the match might occur in these embedded classes, too. diff --git a/.readme_generation/design.md b/.readme_generation/design.md index a0c7e34..1527591 100644 --- a/.readme_generation/design.md +++ b/.readme_generation/design.md @@ -5,14 +5,17 @@ It uses protocol/provider pairs and dependency injection mechanisms provided by This service is currently designed to work with MongoDB and uses an aggregation pipeline to produce search results. Typical sequence of events is as follows: + 1. Requests are received by the API, then directed to the QueryHandler in the core. -2. From there, the configuration is consulted to retrieve any facetable properties for the searched resource class. +2. From there, the configuration is consulted to retrieve any facetable and selected fields for the searched resource class. 3. The search parameters and facet fields are passed to the Aggregator, which builds and runs the aggregation pipeline on the appropriate collection. 
The aggregation pipeline is a series of stages run in sequence: - - The first stage runs a text match using the query string. - - The second stage applies a sort based on the IDs. - - The third stage applies any filters supplied in the search parameters. - - The fourth stage extract facets. - - The fifth/final stage transforms the results structure into {facets, hits, hit count}. -4. Once retrieved in the Aggregator, the results are passed back to the QueryHandler where they are shoved into a QueryResults pydantic model for validation before finally being sent back to the API. + 1. Run a text match using the query string. + 2. Apply a sort based on the IDs. + 3. Apply any filters supplied in the search parameters. + 4. Extract the facets. + 5. Keep only selected fields if some have been specified. + 6. Transform the results structure into {facets, hits, hit count}. + +4. Once retrieved in the Aggregator, the results are passed back to the QueryHandler where they are loaded into a QueryResults Pydantic model for validation before finally being sent back to the API. diff --git a/README.md b/README.md index 31db950..992f5dd 100644 --- a/README.md +++ b/README.md @@ -3,16 +3,20 @@ # Mass -Metadata Artifact Search Service - A service for searching metadata artifacts and filtering results. +Metadata Artifact Search Service - A service for searching metadata artifacts and filtering results. ## Description The Metadata Artifact Search Service uses search parameters to look for metadata. ### Quick Overview of API -There are two available API endpoints that follow the RPC pattern (not REST): -One endpoint ("GET /rpc/search-options") will return an overview of all metadata classes that can be targeted -by a search. The actual search endpoint ("POST /rpc/search") can be used to search for these target classes using keywords. Hits will be reported in the context of the selected target class. 
+The API provides two not strictly RESTful endpoints: + +One endpoint ("GET /search-options") will return an overview of all metadata classes +that can be targeted by a search. + +The actual search endpoint ("GET /search") can be used to search for these target classes +using keywords. Hits will be reported in the context of the selected target class. This means that target classes will be reported that match the specified search query, however, the target class might contain embedded other classes and the match might occur in these embedded classes, too. @@ -32,13 +36,13 @@ We recommend using the provided Docker container. A pre-build version is available at [docker hub](https://hub.docker.com/repository/docker/ghga/mass): ```bash -docker pull ghga/mass:2.1.0 +docker pull ghga/mass:3.0.0 ``` Or you can build the container yourself from the [`./Dockerfile`](./Dockerfile): ```bash # Execute in the repo's root dir: -docker build -t ghga/mass:2.1.0 . +docker build -t ghga/mass:3.0.0 . ``` For production-ready deployment, we recommend using Kubernetes, however, @@ -46,7 +50,7 @@ for simple use cases, you could execute the service using docker on a single server: ```bash # The entrypoint is preconfigured: -docker run -p 8080:8080 ghga/mass:2.1.0 --help +docker run -p 8080:8080 ghga/mass:3.0.0 --help ``` If you prefer not to use containers, you may install the service from source: @@ -100,7 +104,7 @@ The service requires the following configuration parameters: - **`log_traceback`** *(boolean)*: Whether to include exception tracebacks in log messages. Default: `true`. -- **`searchable_classes`** *(object)*: A collection of searchable_classes with facetable properties. Can contain additional properties. +- **`searchable_classes`** *(object)*: A collection of searchable_classes with facetable and selected fields. Can contain additional properties. - **Additional properties**: Refer to *[#/$defs/SearchableClass](#%24defs/SearchableClass)*. 
@@ -303,19 +307,23 @@ The service requires the following configuration parameters: ## Definitions -- **`FacetLabel`** *(object)*: Contains the key and corresponding user-friendly name for a facet. +- **`FieldLabel`** *(object)*: Contains the field name and corresponding user-friendly name. - - **`key`** *(string, required)*: The raw facet key, such as study.type. + - **`key`** *(string, required)*: The raw field name, such as study.type. - - **`name`** *(string)*: The user-friendly name for the facet. Default: `""`. + - **`name`** *(string)*: A user-friendly name for the field (leave empty to use the key). Default: `""`. - **`SearchableClass`** *(object)*: Represents a searchable artifact or resource type. - **`description`** *(string, required)*: A brief description of the resource type. - - **`facetable_properties`** *(array, required)*: A list of of the facetable properties for the resource type. + - **`facetable_fields`** *(array)*: A list of the facetable fields for the resource type (leave empty to not use faceting). Default: `[]`. + + - **Items**: Refer to *[#/$defs/FieldLabel](#%24defs/FieldLabel)*. - - **Items**: Refer to *[#/$defs/FacetLabel](#%24defs/FacetLabel)*. + - **`selected_fields`** *(array)*: A list of the returned fields for the resource type (leave empty to return all). Default: `[]`. + + - **Items**: Refer to *[#/$defs/FieldLabel](#%24defs/FieldLabel)*. ### Usage: @@ -353,17 +361,20 @@ It uses protocol/provider pairs and dependency injection mechanisms provided by This service is currently designed to work with MongoDB and uses an aggregation pipeline to produce search results. Typical sequence of events is as follows: + 1. Requests are received by the API, then directed to the QueryHandler in the core. -2. From there, the configuration is consulted to retrieve any facetable properties for the searched resource class. +2. From there, the configuration is consulted to retrieve any facetable and selected fields for the searched resource class. 
3. The search parameters and facet fields are passed to the Aggregator, which builds and runs the aggregation pipeline on the appropriate collection. The aggregation pipeline is a series of stages run in sequence: - - The first stage runs a text match using the query string. - - The second stage applies a sort based on the IDs. - - The third stage applies any filters supplied in the search parameters. - - The fourth stage extract facets. - - The fifth/final stage transforms the results structure into {facets, hits, hit count}. -4. Once retrieved in the Aggregator, the results are passed back to the QueryHandler where they are shoved into a QueryResults pydantic model for validation before finally being sent back to the API. + 1. Run a text match using the query string. + 2. Apply a sort based on the IDs. + 3. Apply any filters supplied in the search parameters. + 4. Extract the facets. + 5. Keep only selected fields if some have been specified. + 6. Transform the results structure into {facets, hits, hit count}. + +4. Once retrieved in the Aggregator, the results are passed back to the QueryHandler where they are loaded into a QueryResults Pydantic model for validation before finally being sent back to the API. 
## Development diff --git a/config_schema.json b/config_schema.json index 643fb10..ce6fbfc 100644 --- a/config_schema.json +++ b/config_schema.json @@ -1,16 +1,16 @@ { "$defs": { - "FacetLabel": { - "description": "Contains the key and corresponding user-friendly name for a facet", + "FieldLabel": { + "description": "Contains the field name and corresponding user-friendly name", "properties": { "key": { - "description": "The raw facet key, such as study.type", + "description": "The raw field name, such as study.type", "title": "Key", "type": "string" }, "name": { "default": "", - "description": "The user-friendly name for the facet", + "description": "A user-friendly name for the field (leave empty to use the key)", "title": "Name", "type": "string" } @@ -18,7 +18,7 @@ "required": [ "key" ], - "title": "FacetLabel", + "title": "FieldLabel", "type": "object" }, "SearchableClass": { @@ -29,18 +29,27 @@ "title": "Description", "type": "string" }, - "facetable_properties": { - "description": "A list of of the facetable properties for the resource type", + "facetable_fields": { + "default": [], + "description": "A list of the facetable fields for the resource type (leave empty to not use faceting)", "items": { - "$ref": "#/$defs/FacetLabel" + "$ref": "#/$defs/FieldLabel" }, - "title": "Facetable Properties", + "title": "Facetable Fields", + "type": "array" + }, + "selected_fields": { + "default": [], + "description": "A list of the returned fields for the resource type (leave empty to return all)", + "items": { + "$ref": "#/$defs/FieldLabel" + }, + "title": "Selected Fields", "type": "array" } }, "required": [ - "description", - "facetable_properties" + "description" ], "title": "SearchableClass", "type": "object" @@ -103,7 +112,7 @@ "additionalProperties": { "$ref": "#/$defs/SearchableClass" }, - "description": "A collection of searchable_classes with facetable properties", + "description": "A collection of searchable_classes with facetable and selected fields", 
"title": "Searchable Classes", "type": "object" }, diff --git a/example_config.yaml b/example_config.yaml index 2da9642..dba80ba 100644 --- a/example_config.yaml +++ b/example_config.yaml @@ -28,13 +28,18 @@ resource_upsertion_event_type: searchable_resource_upserted searchable_classes: Dataset: description: Dataset grouping files under controlled access. - facetable_properties: + facetable_fields: - key: type name: Type - key: study.type name: Study Type - key: study.project.alias name: Project Alias + selected_fields: + - key: accession + name: Dataset ID + - key: title + name: Title service_instance_id: '001' service_name: mass workers: 1 diff --git a/openapi.yaml b/openapi.yaml index 2266866..dd29c48 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -5,12 +5,13 @@ components: the facet properties: key: - description: The raw facet key, such as study.type + description: The raw field name, such as study.type title: Key type: string name: default: '' - description: The user-friendly name for the facet + description: A user-friendly name for the field (leave empty to use the + key) title: Name type: string options: @@ -24,22 +25,6 @@ components: - options title: Facet type: object - FacetLabel: - description: Contains the key and corresponding user-friendly name for a facet - properties: - key: - description: The raw facet key, such as study.type - title: Key - type: string - name: - default: '' - description: The user-friendly name for the facet - title: Name - type: string - required: - - key - title: FacetLabel - type: object FacetOption: description: Represents the format for an option for a facet properties: @@ -56,21 +41,22 @@ components: - count title: FacetOption type: object - Filter: - description: Represents a filter used to refine results + FieldLabel: + description: Contains the field name and corresponding user-friendly name properties: key: - description: The field to filter + description: The raw field name, such as study.type title: Key type: string 
- value: - description: The value the field must match - title: Value + name: + default: '' + description: A user-friendly name for the field (leave empty to use the + key) + title: Name type: string required: - key - - value - title: Filter + title: FieldLabel type: object HTTPValidationError: properties: @@ -132,47 +118,6 @@ components: - content title: Resource type: object - SearchParameters: - description: Represents the data submitted in a search query - properties: - class_name: - description: The name of the resource class, e.g. Dataset - title: Class Name - type: string - filters: - default: [] - description: The filters to apply to the search - items: - $ref: '#/components/schemas/Filter' - title: Filters - type: array - limit: - anyOf: - - type: integer - - type: 'null' - description: Limit the results to this number - title: Limit - query: - default: '' - description: The keyword search for the query - title: Query - type: string - skip: - default: 0 - description: The number of results to skip for pagination - title: Skip - type: integer - sorting_parameters: - default: [] - description: Collection of sorting parameters used to refine search results - items: - $ref: '#/components/schemas/SortingParameter' - title: Sorting Parameters - type: array - required: - - class_name - title: SearchParameters - type: object SearchableClass: description: Represents a searchable artifact or resource type properties: @@ -180,15 +125,24 @@ components: description: A brief description of the resource type title: Description type: string - facetable_properties: - description: A list of of the facetable properties for the resource type + facetable_fields: + default: [] + description: A list of the facetable fields for the resource type (leave + empty to not use faceting) + items: + $ref: '#/components/schemas/FieldLabel' + title: Facetable Fields + type: array + selected_fields: + default: [] + description: A list of the returned fields for the resource type (leave + 
empty to return all) items: - $ref: '#/components/schemas/FacetLabel' - title: Facetable Properties + $ref: '#/components/schemas/FieldLabel' + title: Selected Fields type: array required: - description - - facetable_properties title: SearchableClass type: object SortOrder: @@ -199,22 +153,6 @@ components: - relevance title: SortOrder type: string - SortingParameter: - description: Represents a combination of a field to sort and the sort order - properties: - field: - description: Which field to sort results by. - title: Field - type: string - order: - allOf: - - $ref: '#/components/schemas/SortOrder' - default: ascending - description: Sort order to apply to sort_field - required: - - field - title: SortingParameter - type: object ValidationError: properties: loc: @@ -237,8 +175,13 @@ components: title: ValidationError type: object info: - title: FastAPI - version: 0.1.0 + contact: + email: contact@ghga.de + license: + name: Apache 2.0 + summary: A service for searching metadata artifacts and filtering results. 
+ title: Metadata Artifact Search Service + version: 3.0.0 openapi: 3.1.0 paths: /health: @@ -252,16 +195,95 @@ paths: schema: {} description: Successful Response summary: health - /rpc/search: - post: + /search: + get: description: Perform search query - operationId: search_rpc_search_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/SearchParameters' + operationId: search_search_get + parameters: + - description: The class name to search + in: query + name: class_name required: true + schema: + description: The class name to search + title: Class Name + type: string + - description: The keyword search for the query + in: query + name: query + required: false + schema: + default: '' + description: The keyword search for the query + title: Query + type: string + - description: Field(s) that shall be used for filtering results + in: query + name: filter_by + required: false + schema: + anyOf: + - items: + type: string + type: array + - type: 'null' + description: Field(s) that shall be used for filtering results + title: Filter By + - description: Values(s) that shall be used for filtering results + in: query + name: value + required: false + schema: + anyOf: + - items: + type: string + type: array + - type: 'null' + description: Values(s) that shall be used for filtering results + title: Value + - description: The number of results to skip for pagination + in: query + name: skip + required: false + schema: + default: 0 + description: The number of results to skip for pagination + title: Skip + type: integer + - description: Limit the results to this number + in: query + name: limit + required: false + schema: + anyOf: + - type: integer + - type: 'null' + description: Limit the results to this number + title: Limit + - description: Field(s) that shall be used for sorting results + in: query + name: order_by + required: false + schema: + anyOf: + - items: + type: string + type: array + - type: 'null' + description: Field(s) 
that shall be used for sorting results + title: Order By + - description: Sort order(s) that shall be used when sorting results + in: query + name: sort + required: false + schema: + anyOf: + - items: + $ref: '#/components/schemas/SortOrder' + type: array + - type: 'null' + description: Sort order(s) that shall be used when sorting results + title: Sort responses: '200': content: @@ -276,22 +298,26 @@ paths: $ref: '#/components/schemas/HTTPValidationError' description: Validation Error summary: Perform a search using query string and filter parameters - /rpc/search-options: + /search-options: get: - description: 'Returns the configured searchable classes. This describes which - resource classes + description: 'Return the configured searchable classes. + + + The returned object describes which resource classes are accounted for in + the system, + + as well as their facetable and selected fields. - are accounted for in the system, as well as their facetable properties. The - facetable + The facetable fields represent specific data fields that will be aggregated + alongside - properties represent specific data properties that will be aggregated alongside - the + the search hits for further search refinement. - search hits for further search refinement. They contain a key, which is used - by the + The selected fields are those that will appear in the search results. - system, and a name, which is more user-friendly.' - operationId: search_options_rpc_search_options_get + They contain a key, which is used by the system, and a name, which is more + user-friendly.' 
+ operationId: search_options_search_options_get responses: '200': content: @@ -299,7 +325,8 @@ paths: schema: additionalProperties: $ref: '#/components/schemas/SearchableClass' - title: Response Search Options Rpc Search Options Get + title: Response Search Options Search Options Get type: object description: Successful Response - summary: Retrieve all configured resource classes and facetable properties + summary: Retrieve all configured resource classes with their facetable and selected + fields diff --git a/pyproject.toml b/pyproject.toml index 83d5cde..b3a565f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,12 +21,12 @@ classifiers = [ "Intended Audience :: Developers", ] name = "mass" -version = "2.1.0" -description = "Metadata Artifact Search Service - A service for searching metadata artifacts and filtering results." +version = "3.0.0" +description = "Metadata Artifact Search Service - A service for searching metadata artifacts and filtering results." dependencies = [ "typer>=0.12", - "ghga-service-commons[api]>=3.0.0", - "ghga-event-schemas>=2.0.0", + "ghga-service-commons[api]>=3.1.5", + "ghga-event-schemas>=3.1.1", "hexkit[mongodb,akafka]>=3.5.0", ] diff --git a/scripts/script_utils/fastapi_app_location.py b/scripts/script_utils/fastapi_app_location.py index bd63ec6..6431c63 100644 --- a/scripts/script_utils/fastapi_app_location.py +++ b/scripts/script_utils/fastapi_app_location.py @@ -16,9 +16,7 @@ """Used to define the location of the main FastAPI app object.""" -from fastapi import FastAPI +from mass.adapters.inbound.fastapi_.configure import Config, get_configured_app -from mass.adapters.inbound.fastapi_.routes import router - -app = FastAPI() -app.include_router(router) +config = Config() # type: ignore +app = get_configured_app(config=config) diff --git a/src/mass/__init__.py b/src/mass/__init__.py index 87c9ad2..3d6b4b8 100644 --- a/src/mass/__init__.py +++ b/src/mass/__init__.py @@ -15,6 +15,9 @@ """A service for searching metadata 
artifacts and filtering results.""" -from importlib.metadata import version +from importlib.metadata import distribution -__version__ = version(__package__) +dist = distribution(__package__) +metadata = dist.metadata + +__version__ = dist.version diff --git a/src/mass/adapters/inbound/fastapi_/configure.py b/src/mass/adapters/inbound/fastapi_/configure.py index 3596824..c6edbe9 100644 --- a/src/mass/adapters/inbound/fastapi_/configure.py +++ b/src/mass/adapters/inbound/fastapi_/configure.py @@ -18,13 +18,34 @@ from fastapi import FastAPI from ghga_service_commons.api import configure_app +from mass import metadata from mass.adapters.inbound.fastapi_.routes import router from mass.config import Config def get_configured_app(*, config: Config) -> FastAPI: """Create and configure a REST API application.""" - app = FastAPI() + summary = metadata["Summary"] + author = metadata["Author"] + email = metadata["Author-email"] + license = metadata["License"] + title, summary = summary.split(" - ", 1) + contact = { + "name": author, + "email": email, + } + license_info = { + "name": license, + } + version = metadata["Version"] + + app = FastAPI( + contact=contact, + license_info=license_info, + summary=summary, + title=title, + version=version, + ) app.include_router(router) configure_app(app, config=config) diff --git a/src/mass/adapters/inbound/fastapi_/models.py b/src/mass/adapters/inbound/fastapi_/models.py deleted file mode 100644 index fd032f2..0000000 --- a/src/mass/adapters/inbound/fastapi_/models.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln -# for the German Human Genome-Phenome Archive (GHGA) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Models only used by the API""" - -from pydantic import BaseModel, Field, field_validator - -from mass.core.models import Filter, SortingParameter - - -class SearchParameters(BaseModel): - """Represents the data submitted in a search query""" - - class_name: str = Field( - ..., description="The name of the resource class, e.g. Dataset" - ) - query: str = Field(default="", description="The keyword search for the query") - filters: list[Filter] = Field( - default=[], description="The filters to apply to the search" - ) - skip: int = Field( - default=0, description="The number of results to skip for pagination" - ) - limit: int | None = Field( - default=None, description="Limit the results to this number" - ) - sorting_parameters: list[SortingParameter] = Field( - default=[], - description=("Collection of sorting parameters used to refine search results"), - ) - - @field_validator("sorting_parameters") - @classmethod - def no_duplicate_fields( - cls, parameters: list[SortingParameter] - ) -> list[SortingParameter]: - """Check for duplicate fields in sorting parameters""" - all_sort_fields = [param.field for param in parameters] - if len(set(all_sort_fields)) < len(all_sort_fields): - raise ValueError("Sorting parameters cannot contain duplicate fields") - return parameters diff --git a/src/mass/adapters/inbound/fastapi_/routes.py b/src/mass/adapters/inbound/fastapi_/routes.py index 9ee78ea..2b4de6e 100644 --- a/src/mass/adapters/inbound/fastapi_/routes.py +++ b/src/mass/adapters/inbound/fastapi_/routes.py @@ -16,10 +16,11 @@ """API endpoints""" -from 
fastapi import APIRouter, status +from typing import Annotated + +from fastapi import APIRouter, Query, status from fastapi.exceptions import HTTPException -from mass.adapters.inbound.fastapi_ import models as api_models from mass.adapters.inbound.fastapi_.dummies import ConfigDummy, QueryHandlerDummy from mass.core import models @@ -37,46 +38,93 @@ async def health(): @router.get( - path="/rpc/search-options", - summary="Retrieve all configured resource classes and facetable properties", + path="/search-options", + summary="Retrieve all configured resource classes with their facetable and selected fields", response_model=dict[str, models.SearchableClass], ) async def search_options( config: ConfigDummy, ) -> dict[str, models.SearchableClass]: - """Returns the configured searchable classes. This describes which resource classes - are accounted for in the system, as well as their facetable properties. The facetable - properties represent specific data properties that will be aggregated alongside the - search hits for further search refinement. They contain a key, which is used by the - system, and a name, which is more user-friendly. + """Return the configured searchable classes. + + The returned object describes which resource classes are accounted for in the system, + as well as their facetable and selected fields. + The facetable fields represent specific data fields that will be aggregated alongside + the search hits for further search refinement. + The selected fields are those that will appear in the search results. + They contain a key, which is used by the system, and a name, which is more user-friendly. 
""" return config.searchable_classes -@router.post( - path="/rpc/search", +@router.get( + path="/search", summary="Perform a search using query string and filter parameters", response_model=models.QueryResults, ) -async def search( - parameters: api_models.SearchParameters, +async def search( # noqa: PLR0913 query_handler: QueryHandlerDummy, + class_name: Annotated[str, Query(description="The class name to search")], + query: Annotated[str, Query(description="The keyword search for the query")] = "", + filter_by: Annotated[ + list[str] | None, + Query(description="Field(s) that shall be used for filtering results"), + ] = None, + value: Annotated[ + list[str] | None, + Query(description="Values(s) that shall be used for filtering results"), + ] = None, + skip: Annotated[ + int, Query(description="The number of results to skip for pagination") + ] = 0, + limit: Annotated[ + int | None, Query(description="Limit the results to this number") + ] = None, + order_by: Annotated[ + list[str] | None, + Query(description="Field(s) that shall be used for sorting results"), + ] = None, + sort: Annotated[ + list[models.SortOrder] | None, + Query(description="Sort order(s) that shall be used when sorting results"), + ] = None, ) -> models.QueryResults | None: """Perform search query""" + if not class_name: + raise HTTPException(status_code=422, detail="A class name must be specified") + try: + filters = [ + models.Filter(key=field, value=value) + for field, value in zip(filter_by or [], value or [], strict=True) + ] + except ValueError as err: + detail = "Number of fields to filter by must match number of values" + raise HTTPException(status_code=422, detail=detail) from err + if order_by and len(set(order_by)) < len(order_by): + detail = "Fields to order by must be unique" + raise HTTPException(status_code=422, detail=detail) + try: + sorting_parameters = [ + models.SortingParameter(field=field, order=order) + for field, order in zip(order_by or [], sort or [], strict=True) + ] 
+ except ValueError as err: + detail = "Number of fields to order by must match number of sort options" + raise HTTPException(status_code=422, detail=detail) from err try: results = await query_handler.handle_query( - class_name=parameters.class_name, - query=parameters.query, - filters=parameters.filters, - skip=parameters.skip, - limit=parameters.limit, - sorting_parameters=parameters.sorting_parameters, + class_name=class_name, + query=query, + filters=filters, + skip=skip, + limit=limit, + sorting_parameters=sorting_parameters, ) except query_handler.ClassNotConfiguredError as err: raise HTTPException( status_code=422, detail="The specified class name is invalid." - + " See /rpc/search-options for a list of valid class names.", + + " See /search-options for a list of valid class names.", ) from err except (query_handler.SearchError, query_handler.ValidationError) as err: raise HTTPException( diff --git a/src/mass/adapters/outbound/aggregator.py b/src/mass/adapters/outbound/aggregator.py index a98f399..323671c 100644 --- a/src/mass/adapters/outbound/aggregator.py +++ b/src/mass/adapters/outbound/aggregator.py @@ -42,7 +42,8 @@ async def aggregate( # noqa: PLR0913, D102 *, query: str, filters: list[models.Filter], - facet_fields: list[models.FacetLabel], + facet_fields: list[models.FieldLabel], + selected_fields: list[models.FieldLabel], skip: int = 0, limit: int | None = None, sorting_parameters: list[models.SortingParameter], @@ -56,6 +57,7 @@ async def aggregate( # noqa: PLR0913, D102 query=query, filters=filters, facet_fields=facet_fields, + selected_fields=selected_fields, skip=skip, limit=limit, sorting_parameters=sorting_parameters, diff --git a/src/mass/adapters/outbound/utils.py b/src/mass/adapters/outbound/utils.py index a4baeed..3bede27 100644 --- a/src/mass/adapters/outbound/utils.py +++ b/src/mass/adapters/outbound/utils.py @@ -16,7 +16,7 @@ """Utility functions for building the aggregation pipeline used by query handler""" -from collections import 
OrderedDict, defaultdict +from collections import defaultdict from typing import Any from hexkit.custom_types import JsonObject @@ -61,12 +61,13 @@ def pipeline_match_filters_stage(*, filters: list[models.Filter]) -> JsonObject: def pipeline_facet_sort_and_paginate( *, - facet_fields: list[models.FacetLabel], - skip: int, + facet_fields: list[models.FieldLabel], + skip: int = 0, limit: int | None = None, - sorts: OrderedDict, -): - """Uses a list of facetable property names to build the subquery for faceting""" + project: dict[str, Any] | None = None, + sort: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Uses a list of facetable fields to build the subquery for faceting""" segment: dict[str, list[JsonObject]] = {} for facet in facet_fields: @@ -88,24 +89,27 @@ def pipeline_facet_sort_and_paginate( # this is the total number of hits, but pagination can mean only a few are returned segment["count"] = [{"$count": "total"}] - # sort by ID, then rename the ID field to id_ to match our model - segment["hits"] = [ - {"$addFields": {"id_": "$_id"}}, - {"$unset": "_id"}, - {"$sort": sorts}, - ] + # rename the ID field to id_ to match our model + segment["hits"] = [{"$addFields": {"id_": "$_id"}}, {"$unset": "_id"}] + + # apply sorting parameters (maybe some of them are unselected fields) + if sort: + segment["hits"].append({"$sort": sort}) + + # pick only the selected fields + if project: + segment["hits"].append({"$project": project}) # apply skip and limit for pagination if skip > 0: segment["hits"].append({"$skip": skip}) - if limit: segment["hits"].append({"$limit": limit}) return {"$facet": segment} -def pipeline_project(*, facet_fields: list[models.FacetLabel]) -> JsonObject: +def pipeline_project(*, facet_fields: list[models.FieldLabel]) -> JsonObject: """Reshape the query so the facets are contained in a top level object""" segment: dict[str, Any] = {"hits": 1, "facets": []} segment["count"] = {"$arrayElemAt": ["$count.total", 0]} @@ -122,7 +126,8 @@ def 
build_pipeline( # noqa: PLR0913 *, query: str, filters: list[models.Filter], - facet_fields: list[models.FacetLabel], + facet_fields: list[models.FieldLabel], + selected_fields: list[models.FieldLabel], skip: int = 0, limit: int | None = None, sorting_parameters: list[models.SortingParameter], @@ -139,11 +144,22 @@ def build_pipeline( # noqa: PLR0913 if filters: pipeline.append(pipeline_match_filters_stage(filters=filters)) + # turn the selected fields into a formatted pipeline $project + project: dict[str, int] = dict.fromkeys( + [ + field.key if field.key == "id_" else f"content.{field.key}" + for field in selected_fields + ], + 1, + ) + # turn the sorting parameters into a formatted pipeline $sort - sorts = OrderedDict() - for param in sorting_parameters: - sort_order = SORT_ORDER_CONVERSION[param.order.value] - sorts[param.field] = sort_order + sort: dict[str, Any] = { + param.field + if param.field == "id_" + else f"content.{param.field}": SORT_ORDER_CONVERSION[param.order.value] + for param in sorting_parameters + } # define facets from preliminary results and reshape data pipeline.append( @@ -151,7 +167,8 @@ def build_pipeline( # noqa: PLR0913 facet_fields=facet_fields, skip=skip, limit=limit, - sorts=sorts, + project=project, + sort=sort, ) ) diff --git a/src/mass/config.py b/src/mass/config.py index 840952a..ffefdc9 100644 --- a/src/mass/config.py +++ b/src/mass/config.py @@ -31,7 +31,8 @@ class SearchableClassesConfig(BaseSettings): """Provides configuration validation for the searchable_classes""" searchable_classes: dict[str, SearchableClass] = Field( - ..., description="A collection of searchable_classes with facetable properties" + ..., + description="A collection of searchable_classes with facetable and selected fields", ) diff --git a/src/mass/core/models.py b/src/mass/core/models.py index 8014afe..5a64d83 100644 --- a/src/mass/core/models.py +++ b/src/mass/core/models.py @@ -21,11 +21,14 @@ from pydantic import BaseModel, Field -class 
FacetLabel(BaseModel): - """Contains the key and corresponding user-friendly name for a facet""" +class FieldLabel(BaseModel): + """Contains the field name and corresponding user-friendly name""" - key: str = Field(..., description="The raw facet key, such as study.type") - name: str = Field(default="", description="The user-friendly name for the facet") + key: str = Field(..., description="The raw field name, such as study.type") + name: str = Field( + default="", + description="A user-friendly name for the field (leave empty to use the key)", + ) class FacetOption(BaseModel): @@ -35,7 +38,7 @@ class FacetOption(BaseModel): count: int = Field(..., description="The number of results matching the facet") -class Facet(FacetLabel): +class Facet(FieldLabel): """Represents a facet's key, name, and the discovered options for the facet""" options: list[FacetOption] = Field( @@ -49,8 +52,15 @@ class SearchableClass(BaseModel): description: str = Field( ..., description="A brief description of the resource type" ) - facetable_properties: list[FacetLabel] = Field( - ..., description="A list of of the facetable properties for the resource type" + facetable_fields: list[FieldLabel] = Field( + [], + description="A list of the facetable fields for the resource type" + " (leave empty to not use faceting)", + ) + selected_fields: list[FieldLabel] = Field( + [], + description="A list of the returned fields for the resource type" + " (leave empty to return all)", ) diff --git a/src/mass/core/query_handler.py b/src/mass/core/query_handler.py index 03b09ac..0c30655 100644 --- a/src/mass/core/query_handler.py +++ b/src/mass/core/query_handler.py @@ -68,7 +68,7 @@ async def delete_resource(self, *, resource_id: str, class_name: str): # noqa: except ResourceNotFoundError as err: raise self.ResourceNotFoundError(resource_id=resource_id) from err - async def handle_query( # noqa: PLR0913, D102 + async def handle_query( # noqa: PLR0913, C901, D102 self, *, class_name: str, @@ -90,19 +90,23 
@@ async def handle_query( # noqa: PLR0913, D102 sorting_parameters = [] # if id_ is not in sorting_parameters, add to end - if "id_" not in [param.field for param in sorting_parameters]: + if not any(param.field == "id_" for param in sorting_parameters): sorting_parameters.append( models.SortingParameter(field="id_", order=models.SortOrder.ASCENDING) ) - # get configured facet fields for given resource class + # get configured facet and selected fields for given resource class try: - facet_fields: list[models.FacetLabel] = self._config.searchable_classes[ - class_name - ].facetable_properties + searchable_class = self._config.searchable_classes[class_name] + facet_fields: list[models.FieldLabel] = searchable_class.facetable_fields + selected_fields: list[models.FieldLabel] = searchable_class.selected_fields except KeyError as err: raise self.ClassNotConfiguredError(class_name=class_name) from err + # if id_ is not in selected_fields, add as first field + if selected_fields and not any(field.key == "id_" for field in selected_fields): + selected_fields.insert(0, models.FieldLabel(key="id_", name="ID")) + # run the aggregation. 
Results will have {facets, count, hits} format aggregator = self._aggregator_collection.get_aggregator(class_name=class_name) for attempt in range(2): @@ -111,6 +115,7 @@ async def handle_query( # noqa: PLR0913, D102 query=query, filters=filters, facet_fields=facet_fields, + selected_fields=selected_fields, skip=skip, limit=limit, sorting_parameters=sorting_parameters, diff --git a/src/mass/ports/outbound/aggregator.py b/src/mass/ports/outbound/aggregator.py index 58c6637..6713c7d 100644 --- a/src/mass/ports/outbound/aggregator.py +++ b/src/mass/ports/outbound/aggregator.py @@ -39,7 +39,8 @@ async def aggregate( # noqa: PLR0913 *, query: str, filters: list[models.Filter], - facet_fields: list[models.FacetLabel], + facet_fields: list[models.FieldLabel], + selected_fields: list[models.FieldLabel], skip: int = 0, limit: int | None = None, sorting_parameters: list[models.SortingParameter], diff --git a/tests/fixtures/joint.py b/tests/fixtures/joint.py index 3b502cb..7c19884 100644 --- a/tests/fixtures/joint.py +++ b/tests/fixtures/joint.py @@ -18,12 +18,12 @@ import glob import re -from collections.abc import AsyncGenerator +from collections.abc import AsyncGenerator, Mapping from dataclasses import dataclass +from typing import TypeAlias import pytest_asyncio from ghga_service_commons.api.testing import AsyncTestClient -from hexkit.custom_types import JsonObject from hexkit.providers.akafka import KafkaEventSubscriber from hexkit.providers.akafka.testutils import KafkaFixture from hexkit.providers.mongodb.testutils import MongoDbFixture @@ -35,6 +35,8 @@ from tests.fixtures.config import get_config from tests.fixtures.utils import get_resources_from_file +QueryParams: TypeAlias = Mapping[str, int | str | list[str]] + @dataclass class JointFixture: @@ -46,6 +48,7 @@ class JointFixture: kafka: KafkaFixture mongodb: MongoDbFixture rest_client: AsyncTestClient + resources: dict[str, list[models.Resource]] def remove_db_data(self) -> None: """Delete everything in the 
database to start from a clean slate""" @@ -60,20 +63,20 @@ async def load_test_data(self) -> None: if match_obj: collection_name = match_obj.group(1) resources = get_resources_from_file(filename) + self.resources[collection_name] = resources for resource in resources: await self.query_handler.load_resource( resource=resource, class_name=collection_name ) - async def call_search_endpoint( - self, search_parameters: JsonObject - ) -> models.QueryResults: - """Convenience function to call the /rpc/search endpoint""" - response = await self.rest_client.post( - url="/rpc/search", json=search_parameters - ) + async def call_search_endpoint(self, params: QueryParams) -> models.QueryResults: + """Convenience function to call the /search endpoint""" + response = await self.rest_client.get(url="/search", params=params) + result = response.json() + assert result is not None, result + assert "detail" in result or "hits" in result, result response.raise_for_status() - return models.QueryResults(**response.json()) + return models.QueryResults(**result) @pytest_asyncio.fixture @@ -99,6 +102,7 @@ async def joint_fixture( kafka=kafka, mongodb=mongodb, rest_client=rest_client, + resources={}, ) await joint_fixture.load_test_data() yield joint_fixture diff --git a/tests/fixtures/test_config.yaml b/tests/fixtures/test_config.yaml index 9660978..ab8fc53 100644 --- a/tests/fixtures/test_config.yaml +++ b/tests/fixtures/test_config.yaml @@ -19,30 +19,40 @@ db_name: metadata-store searchable_classes: DatasetEmbedded: description: Dataset with embedded references. - facetable_properties: + facetable_fields: - key: category name: Category - key: field1 name: Field 1 - key: "has_object.type" name: Object Type + selected_fields: + - key: id_ + name: ID + - key: type + name: Location Type + - key: "has_object.type" + name: Object Type EmptyCollection: description: An empty collection to test the index creation. 
- facetable_properties: + facetable_fields: - key: fun_fact name: Fun Fact + selected_fields: [] SortingTests: description: Data for testing sorting functionality. - facetable_properties: + facetable_fields: - key: field name: Field + selected_fields: [] RelevanceTests: description: Data for testing sorting by relevance. - facetable_properties: + facetable_fields: - key: field name: Field - key: data name: Data + selected_fields: [] resource_change_event_topic: searchable_resources resource_deletion_event_type: searchable_resource_deleted resource_upsertion_event_type: searchable_resource_upserted diff --git a/tests/fixtures/test_data/SortingTests.json b/tests/fixtures/test_data/SortingTests.json index 4206851..538a4a1 100644 --- a/tests/fixtures/test_data/SortingTests.json +++ b/tests/fixtures/test_data/SortingTests.json @@ -1,27 +1,27 @@ { "items": [ { - "field": "some data", + "field": "alpha", "id_": "i2" }, { - "field": "some data", + "field": "bravo", "id_": "i1" }, { - "field": "some data", + "field": "charlie", "id_": "i3" }, { - "field": "some data", + "field": "delta", "id_": "i5" }, { - "field": "some data", + "field": "echo", "id_": "i6" }, { - "field": "some data", + "field": "foxtrot", "id_": "i4" } ] diff --git a/tests/test_api.py b/tests/test_api.py index 63eeff3..98c532f 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -20,12 +20,11 @@ import httpx import pytest -from hexkit.custom_types import JsonObject from pymongo import MongoClient from mass.core import models from tests.fixtures.config import get_config -from tests.fixtures.joint import JointFixture +from tests.fixtures.joint import JointFixture, QueryParams pytestmark = pytest.mark.asyncio() @@ -42,17 +41,15 @@ def compare( assert results.count == count assert len(results.hits) == hit_length - if not facets: + if facets: + assert results.facets == facets + else: config = get_config() - dataset_embedded_class = config.searchable_classes["DatasetEmbedded"] assert 
dataset_embedded_class is not None - - configured_facets = dataset_embedded_class.facetable_properties + configured_facets = dataset_embedded_class.facetable_fields assert len(results.facets) == len(configured_facets) assert {x.key for x in results.facets} == {x.key for x in configured_facets} - else: - assert results.facets == facets if hits: assert results.hits == hits @@ -68,7 +65,7 @@ async def test_health_check(joint_fixture: JointFixture): async def test_search_options(joint_fixture: JointFixture): """Verify that we can request the configured resource class information correctly""" - response = await joint_fixture.rest_client.get(url="/rpc/search-options") + response = await joint_fixture.rest_client.get(url="/search-options") assert response.json() == joint_fixture.config.model_dump()["searchable_classes"] @@ -92,18 +89,13 @@ async def test_malformed_document( await joint_fixture.query_handler.load_resource( resource=resource, class_name="DatasetEmbedded" ) - search_parameters: JsonObject = { - "class_name": "DatasetEmbedded", - "query": "", - "filters": [], - "skip": 0, - } + params: QueryParams = {"class_name": "DatasetEmbedded"} with caplog.at_level(logging.WARNING): with pytest.raises( httpx.HTTPStatusError, match="500 Internal Server Error" ) as exc_info: - await joint_fixture.call_search_endpoint(search_parameters) + await joint_fixture.call_search_endpoint(params) assert ( exc_info.value.response.json().get("detail") == "An error occurred during the search operation" @@ -116,39 +108,22 @@ async def test_malformed_document( async def test_search(joint_fixture: JointFixture): """Basic query to pull back all documents for class name""" - search_parameters: JsonObject = { - "class_name": "DatasetEmbedded", - "query": "", - "filters": [], - "skip": 0, - } + params: QueryParams = {"class_name": "DatasetEmbedded"} - results = await joint_fixture.call_search_endpoint(search_parameters) + results = await joint_fixture.call_search_endpoint(params) 
compare(results=results, count=3, hit_length=3) async def test_search_with_limit(joint_fixture: JointFixture): """Make sure we get a count of 3 but only 1 hit""" - search_parameters: JsonObject = { - "class_name": "DatasetEmbedded", - "query": "", - "filters": [], - "skip": 0, - "limit": 1, - } + params: QueryParams = {"class_name": "DatasetEmbedded", "limit": 1} - results = await joint_fixture.call_search_endpoint(search_parameters) + results = await joint_fixture.call_search_endpoint(params) hit = { "id_": "1HotelAlpha-id", "content": { - "category": "hotel", - "field1": "Miami", - "has_object": {"id_": "HotelAlphaObject", "type": "piano"}, - "has_rooms": [ - {"id_": "HotelAlphaLarge", "type": "large room"}, - {"id_": "HotelAlphaPoolside", "type": "poolside room"}, - ], "type": "resort", + "has_object": {"type": "piano"}, }, } hits = [models.Resource(**hit)] # type: ignore[arg-type] @@ -157,58 +132,41 @@ async def test_search_with_limit(joint_fixture: JointFixture): async def test_search_keywords(joint_fixture: JointFixture): """Make sure the query string is passed through intact""" - search_parameters: JsonObject = { - "class_name": "DatasetEmbedded", - "query": "hotel", - "filters": [], - "skip": 0, - } + params: QueryParams = {"class_name": "DatasetEmbedded", "query": "hotel"} - results = await joint_fixture.call_search_endpoint(search_parameters) + results = await joint_fixture.call_search_endpoint(params) compare(results=results, count=2, hit_length=2) async def test_search_filters(joint_fixture: JointFixture): """Make sure filters work""" - search_parameters: JsonObject = { + params: QueryParams = { "class_name": "DatasetEmbedded", - "query": "", - "filters": [{"key": "has_object.type", "value": "piano"}], - "skip": 0, + "filter_by": ["has_object.type"], + "value": ["piano"], } - results = await joint_fixture.call_search_endpoint(search_parameters) + results = await joint_fixture.call_search_endpoint(params) compare(results=results, count=1, hit_length=1) 
async def test_search_invalid_class(joint_fixture: JointFixture): """Verify that searching with a bad class name results in a 422""" - search_parameters: JsonObject = { - "class_name": "InvalidClassName", - "query": "", - "filters": [], - "skip": 0, - "limit": 1, - } + params: QueryParams = {"class_name": "InvalidClassName", "limit": 1} with pytest.raises(httpx.HTTPStatusError, match="422 Unprocessable Entity"): - await joint_fixture.call_search_endpoint(search_parameters) + await joint_fixture.call_search_endpoint(params) async def test_auto_recreation_of_indexes( joint_fixture: JointFixture, caplog: pytest.LogCaptureFixture ): """Make sure the indexes are recreated on the fly when they were deleted""" - search_parameters: JsonObject = { - "class_name": "DatasetEmbedded", - "query": "hotel", - "filters": [], - "skip": 0, - } + params: QueryParams = {"class_name": "DatasetEmbedded", "query": "hotel"} # should not give a warning when indexes are present with caplog.at_level(logging.WARNING): - await joint_fixture.call_search_endpoint(search_parameters) + await joint_fixture.call_search_endpoint(params) assert not caplog.records # drop all text indexes @@ -224,7 +182,7 @@ async def test_auto_recreation_of_indexes( # should work, but give a warning when indexes are recreated with caplog.at_level(logging.WARNING): - results = await joint_fixture.call_search_endpoint(search_parameters) + results = await joint_fixture.call_search_endpoint(params) compare(results=results, count=2, hit_length=2) assert len(caplog.records) == 1 diff --git a/tests/test_consumer.py b/tests/test_consumer.py index 8f37d79..8be1c21 100644 --- a/tests/test_consumer.py +++ b/tests/test_consumer.py @@ -42,17 +42,14 @@ async def test_resource_upsert( assert results_all.count > 0 # define content of resource - content = { + content: dict = { "has_object": {"type": "added-resource-object", "id": "98u44-f4jo4"}, "field1": "something", "category": "test object", } # define a resource to be upserted - 
resource = models.Resource( - id_=resource_id, - content=content, # type: ignore - ) + resource = models.Resource(id_=resource_id, content=content) # put together event payload payload = event_schemas.SearchableResource( @@ -81,6 +78,12 @@ async def test_resource_upsert( else: assert updated_resources.count == results_all.count + # remove unselected fields + content = resource.content # type: ignore + del content["field1"] + del content["category"] + del content["has_object"]["id"] + assert resource in updated_resources.hits assert resource not in results_all.hits diff --git a/tests/test_index_creation.py b/tests/test_index_creation.py index bbfd4e6..c4bedf4 100644 --- a/tests/test_index_creation.py +++ b/tests/test_index_creation.py @@ -30,7 +30,7 @@ QUERY_STRING = "Backrub" -@pytest.mark.parametrize("create_index_manually", (False, True)) +@pytest.mark.parametrize("create_index_manually", [False, True], ids=["auto", "manual"]) @pytest.mark.asyncio async def test_index_creation(joint_fixture: JointFixture, create_index_manually: bool): """Test the index creation function.""" diff --git a/tests/test_relevance.py b/tests/test_relevance.py index 5bb5e0b..da5b8b1 100644 --- a/tests/test_relevance.py +++ b/tests/test_relevance.py @@ -15,12 +15,10 @@ # """Tests for relevance sorting""" -from typing import Any - import pytest from mass.core import models -from tests.fixtures.joint import JointFixture +from tests.fixtures.joint import JointFixture, QueryParams CLASS_NAME: str = "RelevanceTests" RELEVANCE_SORT = models.SortingParameter( @@ -77,7 +75,7 @@ def sorted_reference_results( *, query: str, sorts: list[models.SortingParameter] | None = None, - filters: list[dict[str, Any]] | None = None, + filters: list[models.Filter] | None = None, ) -> list[str]: """Used to independently retrieve and sort results by relevance and then id""" if not sorts: @@ -88,16 +86,12 @@ def sorted_reference_results( ].find({"$text": {"$search": query}}, {"score": {"$meta": "textScore"}}) 
results = [x for x in results] # type: ignore - if filters: - for f in filters: - field = f["key"] - value = f["value"] - - # the only top-level fields are "_id" and "score" -- all else is in "content" - if field in ("_id", "score"): - results = [x for x in results if x[field] == value] # type: ignore - else: - results = [x for x in results if x["content"][field] == value] # type: ignore + for f in filters or []: + # the only top-level fields are "_id" and "score" -- all else is in "content" + if f.key in ("_id", "score"): + results = [x for x in results if x[f.key] == f.value] # type: ignore + else: + results = [x for x in results if x["content"][f.key] == f.value] # type: ignore sorted_results = multi_column_sort(results, sorts) # type: ignore @@ -108,15 +102,9 @@ def sorted_reference_results( async def test_happy_relevance(joint_fixture: JointFixture): """Make sure default works as expected""" query = "test" - search_parameters = { - "class_name": CLASS_NAME, - "query": query, - "filters": [], - } + params: QueryParams = {"class_name": CLASS_NAME, "query": query} - results = await joint_fixture.call_search_endpoint( - search_parameters=search_parameters - ) + results = await joint_fixture.call_search_endpoint(params) assert results.count == 5 reference_ids = sorted_reference_results( @@ -131,19 +119,14 @@ async def test_happy_relevance(joint_fixture: JointFixture): async def test_happy_relevance_descending_id(joint_fixture: JointFixture): """Make sure default Pydantic model parameter works as expected""" query = "test" - search_parameters = { + params: QueryParams = { "class_name": CLASS_NAME, "query": query, - "filters": [], - "sorting_parameters": [ - {"field": "query", "order": "relevance"}, - {"field": "id_", "order": "descending"}, - ], + "order_by": ["query", "id_"], + "sort": ["relevance", "descending"], } - results = await joint_fixture.call_search_endpoint( - search_parameters=search_parameters - ) + results = await 
joint_fixture.call_search_endpoint(params) assert results.count == 5 reference_ids = sorted_reference_results( @@ -156,15 +139,9 @@ async def test_happy_relevance_descending_id(joint_fixture: JointFixture): @pytest.mark.asyncio async def test_with_absent_term(joint_fixture: JointFixture): """Make sure nothing is pulled back with an absent term (sanity check)""" - search_parameters = { - "class_name": CLASS_NAME, - "query": "doesnotexistinourtests", - "filters": [], - } + params: QueryParams = {"class_name": CLASS_NAME, "query": "doesnotexistinourtests"} - results = await joint_fixture.call_search_endpoint( - search_parameters=search_parameters - ) + results = await joint_fixture.call_search_endpoint(params) assert results.count == 0 @@ -173,15 +150,9 @@ async def test_with_absent_term(joint_fixture: JointFixture): async def test_limited_term(joint_fixture: JointFixture): """Make sure only results with the term are retrieved""" query = "alternative" - search_parameters = { - "class_name": CLASS_NAME, - "query": query, - "filters": [], - } + params: QueryParams = {"class_name": CLASS_NAME, "query": query} - results = await joint_fixture.call_search_endpoint( - search_parameters=search_parameters - ) + results = await joint_fixture.call_search_endpoint(params) assert results.count == 2 reference_ids = sorted_reference_results(joint_fixture, query=query) @@ -193,15 +164,9 @@ async def test_limited_term(joint_fixture: JointFixture): async def test_two_words(joint_fixture: JointFixture): """Test with two different terms that appear in different fields""" query = "alternative test" - search_parameters = { - "class_name": CLASS_NAME, - "query": query, - "filters": [], - } + params: QueryParams = {"class_name": CLASS_NAME, "query": query} - results = await joint_fixture.call_search_endpoint( - search_parameters=search_parameters - ) + results = await joint_fixture.call_search_endpoint(params) assert results.count == 5 reference_ids = sorted_reference_results(joint_fixture, 
query=query) @@ -213,22 +178,19 @@ async def test_two_words(joint_fixture: JointFixture): async def test_with_filters(joint_fixture: JointFixture): """Test with filters applied but no sorting parameters""" query = "test" - filters = [{"key": "field", "value": "some data"}] - search_parameters = { + filters = [models.Filter(key="field", value="some data")] + params: QueryParams = { "class_name": CLASS_NAME, "query": query, - "filters": filters, + "filter_by": [f.key for f in filters], + "value": [f.value for f in filters], } - results = await joint_fixture.call_search_endpoint( - search_parameters=search_parameters - ) + results = await joint_fixture.call_search_endpoint(params) assert results.count == 1 reference_ids = sorted_reference_results( - joint_fixture, - query=query, - filters=filters, + joint_fixture, query=query, filters=filters ) assert [hit.id_ for hit in results.hits] == reference_ids @@ -238,20 +200,17 @@ async def test_with_filters(joint_fixture: JointFixture): async def test_with_filters_and_sorts(joint_fixture: JointFixture): """Test with filters applied and at least one sorting parameter (not relevance)""" query = "test" - filters = [{"key": "data", "value": "test test test test test"}] - search_parameters = { + filters = [models.Filter(key="data", value="test test test test test")] + params: QueryParams = { "class_name": CLASS_NAME, "query": query, - "filters": filters, - "sorting_parameters": [ - {"field": "field", "order": "ascending"}, - {"field": "id_", "order": "descending"}, - ], + "filter_by": [f.key for f in filters], + "value": [f.value for f in filters], + "order_by": ["field", "id_"], + "sort": ["ascending", "descending"], } - results = await joint_fixture.call_search_endpoint( - search_parameters=search_parameters - ) + results = await joint_fixture.call_search_endpoint(params) assert results.count == 2 reference_ids = sorted_reference_results( diff --git a/tests/test_resources.py b/tests/test_resources.py index 6dce50d..eb44b80 
100644 --- a/tests/test_resources.py +++ b/tests/test_resources.py @@ -77,9 +77,9 @@ async def test_facets_returned(joint_fixture: JointFixture): ) config = get_config() - facets: list[models.FacetLabel] = config.searchable_classes[ + facets: list[models.FieldLabel] = config.searchable_classes[ "DatasetEmbedded" - ].facetable_properties + ].facetable_fields facet_key_to_name = {x.key: x.name for x in facets} for facet in results_faceted.facets: @@ -147,15 +147,14 @@ async def test_resource_load(joint_fixture: JointFixture): class_name="DatasetEmbedded", query="", filters=[] ) + content: dict = { + "has_object": {"type": "added-resource-object", "id": "98u44-f4jo4"}, + "field1": "something", + "category": "test object", + } + # define and load a new resource - resource = models.Resource( - id_="added-resource", - content={ - "has_object": {"type": "added-resource-object", "id": "98u44-f4jo4"}, - "field1": "something", - "category": "test object", - }, - ) + resource = models.Resource(id_="added-resource", content=content) await joint_fixture.query_handler.load_resource( resource=resource, class_name="DatasetEmbedded" @@ -177,7 +176,14 @@ async def test_resource_load(joint_fixture: JointFixture): assert len(target_search.hits) == 1 validated_resource = target_search.hits[0] assert validated_resource.id_ == resource.id_ - assert validated_resource.content == resource.content + + # remove unselected fields + content = resource.content # type: ignore + del content["field1"] + del content["category"] + del content["has_object"]["id"] + + assert validated_resource.content == content async def test_loading_non_configured_resource(joint_fixture: JointFixture): diff --git a/tests/test_sorting.py b/tests/test_sorting.py index e722872..97c4970 100644 --- a/tests/test_sorting.py +++ b/tests/test_sorting.py @@ -16,108 +16,112 @@ """Tests concerning the sorting functionality""" import pytest -from hexkit.custom_types import JsonObject from mass.core import models -from 
tests.fixtures.joint import JointFixture +from tests.fixtures.joint import JointFixture, QueryParams CLASS_NAME = "SortingTests" -BASIC_SORT_PARAMETERS = [ - models.SortingParameter(field="id_", order=models.SortOrder.ASCENDING) -] -def multi_column_sort( - resources: list[models.Resource], sorts: list[models.SortingParameter] +def sorted_resources( # noqa: C901 + resources: list[models.Resource], + order_by: list[str] | None = None, + sort: list[str] | None = None, + complete_resources: list[models.Resource] | None = None, ) -> list[models.Resource]: - """This is equivalent to nested sorted() calls. - - This uses the same approach as the sorting function in test_relevance, but the - difference is that this function uses Resource models and doesn't work with the - relevance sorting parameter. There's no spot for a top-level text score parameter in - the resource model, which is why the relevance tests use a slightly different version - of this function. - - The sorting parameters are supplied in order of most significant to least significant, - so we take them off the front and apply sorted(). If there are more parameters to - apply (more sorts), we recurse until we apply the final parameter. The sorted lists - are passed back up the call chain. + """Sort resources by all specified fields. + + This function simulates the sorting that is expected to be done by the database. + Since there's no spot for a top-level text score parameter in the resource model, + the relevance tests need to use a slightly different version of this function. + + In the case that some of the sorted fields are not part of the resources, the + complete resources which contain these missing fields must be passed as well. 
""" - sorted_list = resources.copy() - sorts = sorts.copy() - - parameter = sorts[0] - del sorts[0] - - # sort descending for DESCENDING and RELEVANCE - reverse = parameter.order != models.SortOrder.ASCENDING - - if len(sorts) > 0: - # if there are more sorting parameters, recurse to nest the sorts - sorted_list = multi_column_sort(sorted_list, sorts) - - if parameter.field == "id_": - return sorted( - sorted_list, - key=lambda result: result.model_dump()[parameter.field], - reverse=reverse, - ) - else: - # the only top-level fields is "_id" -- all else is in "content" - return sorted( - sorted_list, - key=lambda result: result.model_dump()["content"][parameter.field], - reverse=reverse, - ) + if order_by is None: + order_by = [] + if sort is None: + sort = [] + assert len(order_by) == len(sort) + if "id_" not in order_by: + # implicitly add id_ at the end since we also do it in the query handler + order_by.append("id_") + sort.append("ascending") + + def sort_key(resource: models.Resource) -> tuple: + """Create a tuple that can be used as key for sorting the resource.""" + if complete_resources: + for complete_resource in complete_resources: + if complete_resource.id_ == resource.id_: + resource = complete_resource + break + else: + assert False, f"{resource.id_} not found in complete resources" + key = [] + for field, field_sort in zip(order_by, sort, strict=True): + resource_dict = resource.model_dump() + if field != "id_": + # the only top-level fields is "_id" -- all else is in "content" + resource_dict = resource_dict["content"] + # support dotted access + sub_fields = field.split(".") + sub_fields, field = sub_fields[:-1], sub_fields[-1] + for sub_field in sub_fields: + resource_dict = resource_dict.get(sub_field, {}) + value = resource_dict.get(field) + # MongoDB returns nulls first, help Python to sort it properly + key_for_null = value is not None + if field_sort == "descending": + key_for_null = not key_for_null + if isinstance(value, str): + value = 
tuple(-ord(c) for c in value) + elif isinstance(value, int | float): + value = -value + key.append((key_for_null, value)) + return tuple(key) + + # sort the reversed resources to not rely on the already given order + return sorted(reversed(resources), key=sort_key) @pytest.mark.asyncio async def test_api_without_sort_parameters(joint_fixture: JointFixture): """Make sure default Pydantic model parameter works as expected""" - search_parameters: JsonObject = { - "class_name": CLASS_NAME, - "query": "", - "filters": [], - } + params: QueryParams = {"class_name": CLASS_NAME} - results = await joint_fixture.call_search_endpoint( - search_parameters=search_parameters - ) + results = await joint_fixture.call_search_endpoint(params) assert results.count > 0 - expected = multi_column_sort(results.hits, BASIC_SORT_PARAMETERS) + expected = sorted_resources(results.hits) assert results.hits == expected +@pytest.mark.parametrize("reverse", [False, True], ids=["normal", "reversed"]) @pytest.mark.asyncio -async def test_sort_with_id_not_last(joint_fixture: JointFixture): - """Test sorting parameters that contain id_, but id_ is not final sorting field. +async def test_sort_with_id_not_last(joint_fixture: JointFixture, reverse: bool): + """Test sorting parameters that contain id_, but not as the final sorting field. Since we modify sorting parameters based on presence of id_, make sure there aren't any bugs that will break the sort or query process. 
""" - sorts = [ - {"field": "id_", "order": "ascending"}, - {"field": "field", "order": "descending"}, - ] - search_parameters: JsonObject = { + order_by = ["id_", "field"] + sort = ["ascending", "descending"] + if reverse: + sort.reverse() + params: QueryParams = { "class_name": CLASS_NAME, "query": "", "filters": [], - "sorting_parameters": sorts, + "order_by": order_by, + "sort": sort, } - sorts_in_model_form = [ - models.SortingParameter( - field=param["field"], order=models.SortOrder(param["order"]) - ) - for param in sorts - ] - results = await joint_fixture.call_search_endpoint(search_parameters) - assert results.hits == multi_column_sort(results.hits, sorts_in_model_form) + results = await joint_fixture.call_search_endpoint(params) + assert results.hits == sorted_resources(results.hits, order_by, sort) +@pytest.mark.parametrize("reverse", [False, True], ids=["normal", "reversed"]) @pytest.mark.asyncio -async def test_sort_with_params_but_not_id(joint_fixture: JointFixture): +async def test_sort_with_params_but_not_id(joint_fixture: JointFixture, reverse: bool): """Test supplying sorting parameters but omitting id_. In order to provide consistent sorting, id_ should always be included. If it's not @@ -125,17 +129,16 @@ async def test_sort_with_params_but_not_id(joint_fixture: JointFixture): any tie between otherwise equivalent keys. If it is included but is not the final field, then we should not modify the parameters. 
""" - search_parameters: JsonObject = { + order_by = ["field"] + sort = ["descending" if reverse else "ascending"] + params: QueryParams = { "class_name": CLASS_NAME, - "query": "", - "filters": [], - "sorting_parameters": [ - {"field": "field", "order": models.SortOrder.ASCENDING.value} - ], + "order_by": order_by, + "sort": sort, } - results = await joint_fixture.call_search_endpoint(search_parameters) - assert results.hits == multi_column_sort(results.hits, BASIC_SORT_PARAMETERS) + results = await joint_fixture.call_search_endpoint(params) + assert results.hits == sorted_resources(results.hits, order_by, sort) @pytest.mark.asyncio @@ -146,52 +149,44 @@ async def test_sort_with_invalid_field(joint_fixture: JointFixture): value for it. If we sort with a truly invalid field, it should have no impact on the resulting sort order. """ - search_parameters: JsonObject = { + params: QueryParams = { "class_name": CLASS_NAME, - "query": "", - "filters": [], - "sorting_parameters": [ - { - "field": "some_bogus_field", - "order": models.SortOrder.ASCENDING.value, - } - ], + "order_by": ["some_bogus_field"], + "sort": ["ascending"], } - results = await joint_fixture.call_search_endpoint(search_parameters) - assert results.hits == multi_column_sort(results.hits, BASIC_SORT_PARAMETERS) + results = await joint_fixture.call_search_endpoint(params) + assert results.hits == sorted_resources(results.hits) @pytest.mark.parametrize("order", [-7, 17, "some_string"]) @pytest.mark.asyncio -async def test_sort_with_invalid_sort_order(joint_fixture: JointFixture, order): +async def test_sort_with_invalid_sort_order( + joint_fixture: JointFixture, order: str | int +): """Test supplying an invalid value for the sort order""" - search_parameters: JsonObject = { + params: QueryParams = { "class_name": CLASS_NAME, - "query": "", - "filters": [], - "sorting_parameters": [{"field": "field", "order": order}], + "order_by": ["field"], + "sort": [order], # type: ignore } - response = await 
joint_fixture.rest_client.post( - url="/rpc/search", json=search_parameters - ) + response = await joint_fixture.rest_client.get(url="/search", params=params) assert response.status_code == 422 + detail = response.json()["detail"] + assert "Input should be 'ascending', 'descending' or 'relevance'" in str(detail) @pytest.mark.asyncio async def test_sort_with_invalid_field_and_sort_order(joint_fixture: JointFixture): """Test with both invalid field name and invalid sort order.""" - search_parameters: JsonObject = { + params: QueryParams = { "class_name": CLASS_NAME, - "query": "", - "filters": [], - "sorting_parameters": [{"field": "some_bogus_field", "order": -7}], + "order_by": ["some_bogus_field"], + "sort": ["also_bogus"], } - response = await joint_fixture.rest_client.post( - url="/rpc/search", json=search_parameters - ) + response = await joint_fixture.rest_client.get(url="/search", params=params) assert response.status_code == 422 @@ -201,16 +196,105 @@ async def test_sort_with_duplicate_field(joint_fixture: JointFixture): This should be prevented by the pydantic model validator and raise an HTTP error. """ - search_parameters: JsonObject = { + params = { "class_name": CLASS_NAME, - "query": "", - "filters": [], - "sorting_parameters": [ - {"field": "field", "order": models.SortOrder.ASCENDING.value}, - {"field": "field", "order": models.SortOrder.DESCENDING.value}, - ], + "order_by": ["field", "field"], + "sort": [models.SortOrder.ASCENDING.value, models.SortOrder.DESCENDING.value], } - response = await joint_fixture.rest_client.post( - url="/rpc/search", json=search_parameters - ) + + response = await joint_fixture.rest_client.get(url="/search", params=params) + assert response.status_code == 422 + assert response.json()["detail"] == "Fields to order by must be unique" + + +@pytest.mark.asyncio +async def test_sort_with_missing_sort(joint_fixture: JointFixture): + """Supply sorting parameters with missing sort option. 
+ + This should be prevented by the pydantic model validator and raise an HTTP error. + """ + params = { + "class_name": CLASS_NAME, + "order_by": ["field"], + } + + response = await joint_fixture.rest_client.get(url="/search", params=params) + assert response.status_code == 422 + details = response.json()["detail"] + assert details == "Number of fields to order by must match number of sort options" + + +@pytest.mark.asyncio +async def test_sort_with_superfluous_sort(joint_fixture: JointFixture): + """Supply sorting parameters with superfluous sort option. + + This should be prevented by the pydantic model validator and raise an HTTP error. + """ + params = { + "class_name": CLASS_NAME, + "order_by": ["field"], + "sort": [models.SortOrder.ASCENDING.value, models.SortOrder.DESCENDING.value], + } + + response = await joint_fixture.rest_client.get(url="/search", params=params) assert response.status_code == 422 + details = response.json()["detail"] + assert details == "Number of fields to order by must match number of sort options" + + +@pytest.mark.parametrize("reverse", [False, True], ids=["normal", "reversed"]) +@pytest.mark.parametrize("field", ["type", "has_object.type"]) +@pytest.mark.asyncio +async def test_sort_with_one_of_the_selected_fields( + joint_fixture: JointFixture, reverse: bool, field: str +): + """Test sorting when fields are selected and one of them is used for sorting.""" + class_name = "DatasetEmbedded" + selected = joint_fixture.config.searchable_classes[class_name].selected_fields + assert selected # this resource has selected fields + assert any(f.key == field for f in selected) # field is selected + + order_by = [field] + sort = ["descending" if reverse else "ascending"] + params: QueryParams = { + "class_name": class_name, + "order_by": order_by, + "sort": sort, + } + + results = await joint_fixture.call_search_endpoint(params) + assert results.hits == sorted_resources(results.hits, order_by, sort) + + +@pytest.mark.parametrize("reverse", 
[False, True], ids=["normal", "reversed"]) +@pytest.mark.parametrize("field", ["category", "field1"]) +@pytest.mark.asyncio +async def test_sort_with_one_of_the_unselected_fields( + joint_fixture: JointFixture, reverse: bool, field: str +): + """Test sorting when fields are selected but sorted by an unselected field.""" + class_name = "DatasetEmbedded" + selected = joint_fixture.config.searchable_classes[class_name].selected_fields + assert selected # this resource has selected fields + assert not any(f.key == field for f in selected) # field is unselected + + order_by = [field] + sort = ["descending" if reverse else "ascending"] + params: QueryParams = { + "class_name": class_name, + "order_by": order_by, + "sort": sort, + } + + results = await joint_fixture.call_search_endpoint(params) + + # make sure the field is not returned in the results + for resource in results.hits: + assert field not in resource.content + + # therefore, we cannot just sort the results, + # but we need to fetch the field from the complete original resources + complete_resources = joint_fixture.resources[class_name] + assert results.hits == sorted_resources( + results.hits, order_by, sort, complete_resources + )