From 27251e8a036275569f7ebb8c7c0aafc93a336992 Mon Sep 17 00:00:00 2001 From: Javier Torres Date: Thu, 1 Aug 2024 16:17:02 +0200 Subject: [PATCH] Revert "Predict is compatible with vectorsets (#2344)" (#2369) This reverts commit 2110811d85c1cb79234d10010d6e9c501e54ace7. --- .../common/datamanagers/vectorsets.py | 32 ++++--- nucliadb/src/nucliadb/search/predict.py | 42 ++-------- nucliadb/src/nucliadb/search/search/query.py | 84 ++++--------------- nucliadb/tests/ingest/fixtures.py | 4 +- .../integration/common/test_locking.py | 2 +- .../integration/search/test_autofilters.py | 19 +---- .../nucliadb/integration/test_vectorsets.py | 57 ++++--------- .../tests/search/unit/search/test_query.py | 3 +- .../src/nucliadb_models/internal/__init__.py | 19 ----- .../src/nucliadb_models/internal/predict.py | 68 --------------- nucliadb_models/src/nucliadb_models/search.py | 31 ++++++- nucliadb_utils/src/nucliadb_utils/const.py | 1 - .../src/nucliadb_utils/featureflagging.py | 4 - 13 files changed, 98 insertions(+), 268 deletions(-) delete mode 100644 nucliadb_models/src/nucliadb_models/internal/__init__.py delete mode 100644 nucliadb_models/src/nucliadb_models/internal/predict.py diff --git a/nucliadb/src/nucliadb/common/datamanagers/vectorsets.py b/nucliadb/src/nucliadb/common/datamanagers/vectorsets.py index 26bff4c842..73642b75cd 100644 --- a/nucliadb/src/nucliadb/common/datamanagers/vectorsets.py +++ b/nucliadb/src/nucliadb/common/datamanagers/vectorsets.py @@ -21,15 +21,11 @@ from nucliadb.common.datamanagers.utils import get_kv_pb from nucliadb.common.maindb.driver import Transaction -from nucliadb_protos import knowledgebox_pb2 +from nucliadb_protos import knowledgebox_pb2, nodewriter_pb2 KB_VECTORSETS = "/kbs/{kbid}/vectorsets" -class BrokenInvariant(Exception): - pass - - async def initialize(txn: Transaction, *, kbid: str): key = KB_VECTORSETS.format(kbid=kbid) await txn.set(key, knowledgebox_pb2.KnowledgeBoxVectorSetsConfig().SerializeToString()) @@ -50,12 +46,26 @@ async def get_default_vectorset( *, kbid: str, ) -> knowledgebox_pb2.VectorSetConfig: - """XXX: For now, default vectorset is the first on the list, we should - implement an API to let users decide which is their default though - """ - async for _, vectorset in iter(txn, kbid=kbid): - return vectorset - raise BrokenInvariant("KB without vectorsets this shouldn't be possible!") + from . 
import kb + + vectorset_id = "__default__" + semantic_model = await kb.get_model_metadata(txn, kbid=kbid) + vector_dimension = semantic_model.vector_dimension + similarity = semantic_model.similarity_function + matryoshka_dimensions = semantic_model.matryoshka_dimensions + normalize_vectors = len(matryoshka_dimensions) > 0 + + return knowledgebox_pb2.VectorSetConfig( + vectorset_id=vectorset_id, + vectorset_index_config=nodewriter_pb2.VectorIndexConfig( + vector_dimension=vector_dimension, + similarity=similarity, + # we only support this for now + vector_type=nodewriter_pb2.VectorType.DENSE_F32, + normalize_vectors=normalize_vectors, + ), + matryoshka_dimensions=matryoshka_dimensions, + ) async def exists(txn, *, kbid: str, vectorset_id: str) -> bool: diff --git a/nucliadb/src/nucliadb/search/predict.py b/nucliadb/src/nucliadb/search/predict.py index 83a55e0bc6..b6e7e5e233 100644 --- a/nucliadb/src/nucliadb/search/predict.py +++ b/nucliadb/src/nucliadb/search/predict.py @@ -29,18 +29,21 @@ from nucliadb.search import logger from nucliadb.tests.vectors import Q, Qm2023 -from nucliadb_models.internal.predict import Ner, QueryInfo, SentenceSearch, TokenSearch from nucliadb_models.search import ( ChatModel, FeedbackRequest, + Ner, + QueryInfo, RephraseModel, + SentenceSearch, SummarizedResource, SummarizedResponse, SummarizeModel, + TokenSearch, ) from nucliadb_protos.utils_pb2 import RelationNode from nucliadb_telemetry import errors, metrics -from nucliadb_utils.const import Features +from nucliadb_utils import const from nucliadb_utils.exceptions import LimitsExceededError from nucliadb_utils.settings import nuclia_settings from nucliadb_utils.utilities import Utility, has_feature, set_utility @@ -226,7 +229,7 @@ def get_predict_url(self, endpoint: str, kbid: str) -> str: # /api/v1/predict/rephrase/{kbid} return f"{self.public_url}{PUBLIC_PREDICT}{endpoint}/{kbid}" else: - if has_feature(Features.VERSIONED_PRIVATE_PREDICT): + if has_feature(const.Features.VERSIONED_PRIVATE_PREDICT): return f"{self.cluster_url}{VERSIONED_PRIVATE_PREDICT}{endpoint}" else: return f"{self.cluster_url}{PRIVATE_PREDICT}{endpoint}" @@ -375,7 +378,6 @@ async def query( self, kbid: str, sentence: str, - semantic_model: Optional[str] = None, generative_model: Optional[str] = None, rephrase: Optional[bool] = False, ) -> QueryInfo: @@ -386,13 +388,10 @@ async def query( logger.warning(error) raise SendToPredictError(error) - params: dict[str, Any] = { + params = { "text": sentence, "rephrase": str(rephrase), } - if has_feature(Features.VECTORSETS_V0, context={"kbid": kbid}): - if semantic_model is not None: - params["semantic_models"] = [semantic_model] if generative_model is not None: params["generative_model"] = generative_model @@ -520,7 +519,6 @@ async def query( self, kbid: str, sentence: str, - semantic_model: Optional[str] = None, generative_model: Optional[str] = None, rephrase: Optional[bool] = False, ) -> QueryInfo: @@ -530,20 +528,10 @@ async def query( language="en", stop_words=[], semantic_threshold=0.7, - semantic_thresholds={semantic_model or "": 0.7}, visual_llm=True, max_context=self.max_context, entities=TokenSearch(tokens=[Ner(text="text", ner="PERSON", start=0, end=2)], time=0.0), - sentence=SentenceSearch( - data=Qm2023, - vectors={ - semantic_model or "": Qm2023, - }, - time=0.0, - timings={ - semantic_model or "": 0.0, - }, - ), + sentence=SentenceSearch(data=Qm2023, time=0.0), query=sentence, ) else: @@ -551,22 +539,10 @@ async def query( language="en", stop_words=[], semantic_threshold=0.7, - 
semantic_thresholds={ - semantic_model or "": 0.7, - }, visual_llm=True, max_context=self.max_context, entities=TokenSearch(tokens=[Ner(text="text", ner="PERSON", start=0, end=2)], time=0.0), - sentence=SentenceSearch( - data=Q, - vectors={ - semantic_model or "": Q, - }, - time=0.0, - timings={ - semantic_model or "": 0.0, - }, - ), + sentence=SentenceSearch(data=Q, time=0.0), query=sentence, ) diff --git a/nucliadb/src/nucliadb/search/search/query.py b/nucliadb/src/nucliadb/search/search/query.py index d04aa76004..cb1d78d3f2 100644 --- a/nucliadb/src/nucliadb/search/search/query.py +++ b/nucliadb/src/nucliadb/search/search/query.py @@ -25,7 +25,6 @@ from async_lru import alru_cache from nucliadb.common import datamanagers -from nucliadb.common.datamanagers.vectorsets import BrokenInvariant from nucliadb.common.maindb.utils import get_driver from nucliadb.search import logger from nucliadb.search.predict import SendToPredictError, convert_relations @@ -42,13 +41,13 @@ query_parse_dependency_observer, ) from nucliadb.search.utilities import get_predict -from nucliadb_models.internal.predict import QueryInfo from nucliadb_models.labels import translate_system_to_alias_label from nucliadb_models.metadata import ResourceProcessingStatus from nucliadb_models.search import ( Filter, MaxTokens, MinScore, + QueryInfo, SearchOptions, SortField, SortFieldMap, @@ -60,8 +59,6 @@ from nucliadb_models.security import RequestSecurity from nucliadb_protos import knowledgebox_pb2, nodereader_pb2, utils_pb2 from nucliadb_protos.noderesources_pb2 import Resource -from nucliadb_utils.const import Features -from nucliadb_utils.utilities import has_feature from .exceptions import InvalidQueryError @@ -137,12 +134,7 @@ def __init__( self.range_modification_end = range_modification_end self.fields = fields or [] self.user_vector = user_vector - # until vectorsets is properly implemented, we'll have this parameter - # under FF and always set None for anyone else - if has_feature(Features.VECTORSETS_V0, context={"kbid": kbid}): - self.vectorset = vectorset - else: - self.vectorset = None + self.vectorset = vectorset self.with_duplicates = with_duplicates self.with_status = with_status self.with_synonyms = with_synonyms @@ -167,15 +159,11 @@ def has_relations_search(self) -> bool: def _get_query_information(self) -> Awaitable[QueryInfo]: if self._query_information_task is None: # pragma: no cover - self._query_information_task = asyncio.create_task(self._query_information()) + self._query_information_task = asyncio.create_task( + query_information(self.kbid, self.query, self.generative_model, self.rephrase) + ) return self._query_information_task - async def _query_information(self) -> QueryInfo: - vectorset = await self.select_vectorset() - return await query_information( - self.kbid, self.query, vectorset, self.generative_model, self.rephrase - ) - def _get_matryoshka_dimension(self) -> Awaitable[Optional[int]]: if self._get_matryoshka_dimension_task is None: self._get_matryoshka_dimension_task = asyncio.create_task( @@ -379,41 +367,6 @@ def parse_paragraph_search(self, request: nodereader_pb2.SearchRequest) -> None: request.paragraph = True node_features.inc({"type": "paragraphs"}) - @alru_cache(maxsize=1) - async def select_vectorset(self) -> Optional[str]: - """Validate the vectorset parameter and override it with a default if - needed. 
- """ - if not has_feature(Features.VECTORSETS_V0, context={"kbid": self.kbid}): - return None - if self.vectorset: - # validate vectorset - async with datamanagers.with_ro_transaction() as txn: - if not await datamanagers.vectorsets.exists( - txn, kbid=self.kbid, vectorset_id=self.vectorset - ): - raise InvalidQueryError( - "vectorset", - f"Vectorset {self.vectorset} doesn't exist in you Knowledge Box", - ) - return self.vectorset - else: - # no vectorset specified, get the default one - async with datamanagers.with_ro_transaction() as txn: - try: - default_vectorset = await datamanagers.vectorsets.get_default_vectorset( - txn, kbid=self.kbid - ) - except BrokenInvariant: - # XXX: fix to avoid tests complaining too much, it should be - # an error at some point though - return None - # logger.exception("KB has no default vectorset", extra={"kbid": self.kbid}) - # raise InvalidQueryError("vectorset", f"KB has no default vectorset") from exc - else: - return default_vectorset.vectorset_id - return None - async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bool: if not self.has_vector_search: return False @@ -421,11 +374,6 @@ async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bo node_features.inc({"type": "vectors"}) incomplete = False - - vectorset = await self.select_vectorset() - if vectorset is not None: - request.vectorset = vectorset - query_vector = None if self.user_vector is None: try: @@ -435,18 +383,23 @@ async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bo incomplete = True else: if query_info and query_info.sentence: - if vectorset and has_feature(Features.VECTORSETS_V0, context={"kbid": self.kbid}): - if vectorset in query_info.sentence.vectors: - query_vector = query_info.sentence.vectors[vectorset] - else: - incomplete = True - else: - query_vector = query_info.sentence.data + query_vector = query_info.sentence.data else: incomplete = True else: query_vector = self.user_vector + if self.vectorset: + async with get_driver().transaction(read_only=True) as txn: + if not await datamanagers.vectorsets.exists( + txn, kbid=self.kbid, vectorset_id=self.vectorset + ): + raise InvalidQueryError( + "vectorset", + f"Vectorset {self.vectorset} doesn't exist in you Knowledge Box", + ) + request.vectorset = self.vectorset + if query_vector is not None: matryoshka_dimension = await self._get_matryoshka_dimension() if matryoshka_dimension is not None: @@ -601,12 +554,11 @@ async def paragraph_query_to_pb( async def query_information( kbid: str, query: str, - semantic_model: Optional[str], generative_model: Optional[str] = None, rephrase: bool = False, ) -> QueryInfo: predict = get_predict() - return await predict.query(kbid, query, semantic_model, generative_model, rephrase) + return await predict.query(kbid, query, generative_model, rephrase) @query_parse_dependency_observer.wrap({"type": "detect_entities"}) diff --git a/nucliadb/tests/ingest/fixtures.py b/nucliadb/tests/ingest/fixtures.py index 411f550f0e..0f6564523b 100644 --- a/nucliadb/tests/ingest/fixtures.py +++ b/nucliadb/tests/ingest/fixtures.py @@ -190,9 +190,7 @@ async def knowledgebox_ingest(storage, maindb_driver: Driver, shard_manager, lea model = SemanticModelMetadata( similarity_function=upb.VectorSimilarity.COSINE, vector_dimension=len(V1) ) - await KnowledgeBox.create( - maindb_driver, kbid=kbid, slug=kbslug, semantic_models={"my-semantic-model": model} - ) + await KnowledgeBox.create(maindb_driver, kbid=kbid, slug=kbslug, semantic_model=model) 
yield kbid diff --git a/nucliadb/tests/nucliadb/integration/common/test_locking.py b/nucliadb/tests/nucliadb/integration/common/test_locking.py index f4b544dc66..53c8bf40e5 100644 --- a/nucliadb/tests/nucliadb/integration/common/test_locking.py +++ b/nucliadb/tests/nucliadb/integration/common/test_locking.py @@ -72,7 +72,7 @@ async def test_lock(for_seconds: float): tasks = [] for _ in range(5): - tasks.append(asyncio.create_task(test_lock(random.uniform(0.1, 0.2)))) + tasks.append(asyncio.create_task(test_lock(random.uniform(0, 0.2)))) results = await asyncio.gather(*tasks, return_exceptions=True) # Check that 4 out of 5 tasks returned ResourceLocked error diff --git a/nucliadb/tests/nucliadb/integration/search/test_autofilters.py b/nucliadb/tests/nucliadb/integration/search/test_autofilters.py index 477a583718..73684ac45b 100644 --- a/nucliadb/tests/nucliadb/integration/search/test_autofilters.py +++ b/nucliadb/tests/nucliadb/integration/search/test_autofilters.py @@ -23,7 +23,7 @@ from httpx import AsyncClient from nucliadb.tests.vectors import Q -from nucliadb_models.internal.predict import Ner, QueryInfo, SentenceSearch, TokenSearch +from nucliadb_models.search import Ner, QueryInfo, SentenceSearch, TokenSearch from nucliadb_utils.utilities import Utility, set_utility @@ -40,12 +40,6 @@ async def test_autofilters_are_returned( predict_mock.query = AsyncMock( return_value=QueryInfo( - language="en", - stop_words=[], - semantic_threshold=0.7, - semantic_thresholds={ - "my-semantic-model": 0.7, - }, entities=TokenSearch( tokens=[ Ner(text="Newton", ner="scientist", start=0, end=1), @@ -53,16 +47,7 @@ async def test_autofilters_are_returned( ], time=0.1, ), - sentence=SentenceSearch( - data=Q, - vectors={ - "my-semantic-model": Q, - }, - time=0.1, - timings={ - "my-semantic-model": 0.1, - }, - ), + sentence=SentenceSearch(data=Q, time=0.1), visual_llm=False, max_context=10000, query="What relates Newton and Becquer?", diff --git a/nucliadb/tests/nucliadb/integration/test_vectorsets.py b/nucliadb/tests/nucliadb/integration/test_vectorsets.py index a7c8e2e7a3..d9767cecbd 100644 --- a/nucliadb/tests/nucliadb/integration/test_vectorsets.py +++ b/nucliadb/tests/nucliadb/integration/test_vectorsets.py @@ -201,13 +201,11 @@ async def query_shard_wrapper( query = (spy, result, None) return result - def predict_query_wrapper(original, dimension: int, vectorset_dimensions: dict[str, int]): + def predict_query_wrapper(original, dimension): @functools.wraps(original) async def inner(*args, **kwargs): query_info = await original(*args, **kwargs) query_info.sentence.data = [1.0] * dimension - for vectorset_id, vectorset_dimension in vectorset_dimensions.items(): - query_info.sentence.vectors[vectorset_id] = [1.0] * vectorset_dimension return query_info return inner @@ -236,9 +234,7 @@ async def inner(*args, **kwargs): ): with ( patch.object( - dummy_predict, - "query", - side_effect=predict_query_wrapper(dummy_predict.query, 768, {"model": 768}), + dummy_predict, "query", side_effect=predict_query_wrapper(dummy_predict.query, 768) ), ): resp = await nucliadb_reader.post( @@ -250,12 +246,10 @@ async def inner(*args, **kwargs): assert resp.status_code == 200 node_search_spy, result, error = query - assert result is not None assert error is None request = node_search_spy.call_args[0][0] - # there's only one model and we get it as the default - assert request.vectorset == "model" + assert request.vectorset == "" assert len(request.vector) == 768 resp = await nucliadb_reader.post( @@ -268,7 +262,6 @@ async 
def inner(*args, **kwargs): assert resp.status_code == 200 node_search_spy, result, error = query - assert result is not None assert error is None request = node_search_spy.call_args[0][0] @@ -304,9 +297,7 @@ async def inner(*args, **kwargs): ): with ( patch.object( - dummy_predict, - "query", - side_effect=predict_query_wrapper(dummy_predict.query, 500, {"model-A": 768}), + dummy_predict, "query", side_effect=predict_query_wrapper(dummy_predict.query, 768) ), ): resp = await nucliadb_reader.post( @@ -319,7 +310,6 @@ async def inner(*args, **kwargs): assert resp.status_code == 200 node_search_spy, result, error = query - assert result is not None assert error is None request = node_search_spy.call_args[0][0] @@ -328,9 +318,7 @@ async def inner(*args, **kwargs): with ( patch.object( - dummy_predict, - "query", - side_effect=predict_query_wrapper(dummy_predict.query, 500, {"model-B": 1024}), + dummy_predict, "query", side_effect=predict_query_wrapper(dummy_predict.query, 1024) ), ): resp = await nucliadb_reader.post( @@ -343,35 +331,24 @@ async def inner(*args, **kwargs): assert resp.status_code == 200 node_search_spy, result, error = query - assert result is not None assert error is None request = node_search_spy.call_args[0][0] assert request.vectorset == "model-B" assert len(request.vector) == 1024 - with ( - patch.object( - dummy_predict, - "query", - side_effect=predict_query_wrapper( - dummy_predict.query, 500, {"model-A": 768, "model-B": 1024} - ), - ), - ): - resp = await nucliadb_reader.get( - f"/kb/{kbid}/find", - params={ - "query": "foo", - }, - ) - assert resp.status_code == 200 - node_search_spy, result, error = query - request = node_search_spy.call_args[0][0] - assert result is not None - assert error is None - # with more than one vectorset, we get the first one - assert request.vectorset == "model-A" + resp = await nucliadb_reader.get( + f"/kb/{kbid}/find", + params={ + "query": "foo", + }, + ) + assert resp.status_code == 500 + node_search_spy, result, error = query + request = node_search_spy.call_args[0][0] + assert result is None + assert request.vectorset == "" + assert "Query without vectorset but shard has multiple vector indexes" in str(error) @pytest.fixture(scope="function") diff --git a/nucliadb/tests/search/unit/search/test_query.py b/nucliadb/tests/search/unit/search/test_query.py index 5df01bf0cb..d562fe650a 100644 --- a/nucliadb/tests/search/unit/search/test_query.py +++ b/nucliadb/tests/search/unit/search/test_query.py @@ -184,8 +184,7 @@ async def test_query_without_vectorset_nor_matryoshka( "nucliadb.search.search.query.get_matryoshka_dimension_cached", new=AsyncMock(return_value=matryoshka_dimension), ), - patch("nucliadb.common.datamanagers.utils.get_driver"), - patch("nucliadb.common.datamanagers.vectorsets.get_kv_pb"), + patch("nucliadb.search.search.query.get_driver"), ): request, incomplete, _ = await parser.parse() assert not incomplete diff --git a/nucliadb_models/src/nucliadb_models/internal/__init__.py b/nucliadb_models/src/nucliadb_models/internal/__init__.py deleted file mode 100644 index 3b734776ac..0000000000 --- a/nucliadb_models/src/nucliadb_models/internal/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (C) 2021 Bosutech XXI S.L. -# -# nucliadb is offered under the AGPL v3.0 and as commercial software. -# For commercial licensing, contact us at info@nuclia.com. 
-# -# AGPL: -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -# diff --git a/nucliadb_models/src/nucliadb_models/internal/predict.py b/nucliadb_models/src/nucliadb_models/internal/predict.py deleted file mode 100644 index 6a1f312536..0000000000 --- a/nucliadb_models/src/nucliadb_models/internal/predict.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (C) 2021 Bosutech XXI S.L. -# -# nucliadb is offered under the AGPL v3.0 and as commercial software. -# For commercial licensing, contact us at info@nuclia.com. -# -# AGPL: -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -# - -""" -Models for Predict API v1. - -ATENTION! 
Keep these models in sync with models on Predict API -""" - -from typing import List, Optional - -from pydantic import BaseModel, Field - - -class SentenceSearch(BaseModel): - data: List[float] = Field(deprecated=True, default_factory=list) - vectors: dict[str, List[float]] = Field( - default_factory=dict, description="Sentence vectors for each semantic model" - ) - time: float = Field(deprecated=True) - timings: dict[str, float] = Field( - default_factory=dict, - description="Time taken to compute the sentence vector for each semantic model", - ) - - -class Ner(BaseModel): - text: str - ner: str - start: int - end: int - - -class TokenSearch(BaseModel): - tokens: List[Ner] = [] - time: float - input_tokens: int = 0 - - -class QueryInfo(BaseModel): - language: Optional[str] - stop_words: List[str] = Field(default_factory=list) - semantic_threshold: float = Field(deprecated=True) - semantic_thresholds: dict[str, float] = Field( - default_factory=dict, description="Semantic threshold for each semantic model" - ) - visual_llm: bool - max_context: int - entities: Optional[TokenSearch] - sentence: Optional[SentenceSearch] - query: str diff --git a/nucliadb_models/src/nucliadb_models/search.py b/nucliadb_models/src/nucliadb_models/search.py index 2a0f9ac1ce..479f4f4701 100644 --- a/nucliadb_models/src/nucliadb_models/search.py +++ b/nucliadb_models/src/nucliadb_models/search.py @@ -39,9 +39,6 @@ from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject from nucliadb_protos.writer_pb2 import Shards as PBShards -# Bw/c import to avoid breaking users -from nucliadb_models.internal.predict import Ner, QueryInfo, SentenceSearch, TokenSearch # noqa isort: skip - _T = TypeVar("_T") ANSWER_JSON_SCHEMA_EXAMPLE = { @@ -287,6 +284,34 @@ class EntitySubgraph(BaseModel): # path: List[DirectionalRelation] +class SentenceSearch(BaseModel): + data: List[float] = [] + time: float + + +class Ner(BaseModel): + text: str + ner: str + start: int + end: int + + +class TokenSearch(BaseModel): + tokens: List[Ner] = [] + time: float + + +class QueryInfo(BaseModel): + language: Optional[str] = None + stop_words: List[str] = [] + semantic_threshold: Optional[float] = None + visual_llm: bool + max_context: int + entities: TokenSearch + sentence: SentenceSearch + query: str + + class Relations(BaseModel): entities: Dict[str, EntitySubgraph] # TODO: implement in the next iteration of knowledge graph search diff --git a/nucliadb_utils/src/nucliadb_utils/const.py b/nucliadb_utils/src/nucliadb_utils/const.py index 89863e2ead..3bfaf79442 100644 --- a/nucliadb_utils/src/nucliadb_utils/const.py +++ b/nucliadb_utils/src/nucliadb_utils/const.py @@ -81,4 +81,3 @@ class Features: FIND_MERGE_ORDER_FIX = "nucliadb_find_merge_order_fix" PG_CATALOG_READ = "nucliadb_pg_catalog_read" PG_CATALOG_WRITE = "nucliadb_pg_catalog_write" - VECTORSETS_V0 = "vectorsets_v0_new_kbs_with_multiple_vectorsets" diff --git a/nucliadb_utils/src/nucliadb_utils/featureflagging.py b/nucliadb_utils/src/nucliadb_utils/featureflagging.py index 0eab147c2b..9bbb2aac62 100644 --- a/nucliadb_utils/src/nucliadb_utils/featureflagging.py +++ b/nucliadb_utils/src/nucliadb_utils/featureflagging.py @@ -73,10 +73,6 @@ class Settings(pydantic_settings.BaseSettings): "rollout": 0, "variants": {"environment": ["local"]}, }, - const.Features.VECTORSETS_V0: { - "rollout": 0, - "variants": {"environment": ["local"]}, - }, }
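-- 
After this revert, the Predict client and the query parser are back on the
single-model API: `predict.query()` no longer takes a `semantic_model`
argument, and `QueryInfo`/`SentenceSearch` expose a flat `semantic_threshold`
and `data` field instead of the per-vectorset dicts. A minimal standalone
sketch of the reverted response shape follows; the class fields are copied
from the "+" hunks in nucliadb_models/src/nucliadb_models/search.py above,
while the sample payload values are illustrative only, not real Predict API
output:

    from typing import List, Optional

    from pydantic import BaseModel


    class SentenceSearch(BaseModel):
        data: List[float] = []  # single query vector again, no per-model "vectors" map
        time: float


    class Ner(BaseModel):
        text: str
        ner: str
        start: int
        end: int


    class TokenSearch(BaseModel):
        tokens: List[Ner] = []
        time: float


    class QueryInfo(BaseModel):
        language: Optional[str] = None
        stop_words: List[str] = []
        semantic_threshold: Optional[float] = None  # one threshold, no per-model dict
        visual_llm: bool
        max_context: int
        entities: TokenSearch
        sentence: SentenceSearch
        query: str


    # Illustrative payload in the reverted shape (values made up):
    info = QueryInfo(
        language="en",
        semantic_threshold=0.7,
        visual_llm=False,
        max_context=10000,
        entities=TokenSearch(
            tokens=[Ner(text="Newton", ner="scientist", start=0, end=6)],
            time=0.1,
        ),
        sentence=SentenceSearch(data=[0.1, 0.2, 0.3], time=0.1),
        query="What relates Newton and Becquer?",
    )
    assert info.sentence.data  # the vector is read from .data, not .vectors[vectorset]

This matches what parse_vector_search() consumes after the revert: it reads
query_info.sentence.data directly and only forwards request.vectorset when
the caller passed one explicitly, instead of resolving a default vectorset.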