Skip to content

Commit

Permalink
Refactor shards to be vectorset compatible (#2416)
Browse files Browse the repository at this point in the history
* Delete default vector index details from model

After vectorsets, these fields are deprecated and don't make sense
anymore

* Move KnowledgeboxShards model to internal models

/kb/{kbid}/shards endpoint is not exposed

* Add backward-compatible (bw/c) imports just in case
  • Loading branch information
jotare authored Aug 28, 2024
1 parent 5d37539 commit 88804b1
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 78 deletions.
2 changes: 1 addition & 1 deletion nucliadb/src/nucliadb/search/api/v1/knowledgebox.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
from nucliadb.search.api.v1.utils import fastapi_query
from nucliadb.search.search.shards import get_shard
from nucliadb.search.settings import settings
from nucliadb_models.internal.shards import KnowledgeboxShards
from nucliadb_models.resource import NucliaDBRoles
from nucliadb_models.search import (
KnowledgeboxCounters,
KnowledgeboxShards,
SearchParamDefaults,
)
from nucliadb_protos.noderesources_pb2 import Shard
Expand Down
95 changes: 95 additions & 0 deletions nucliadb_models/src/nucliadb_models/internal/shards.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright (C) 2021 Bosutech XXI S.L.
#
# nucliadb is offered under the AGPL v3.0 and as commercial software.
# For commercial licensing, contact us at info@nuclia.com.
#
# AGPL:
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from enum import Enum
from typing import List, Type, TypeVar

from google.protobuf.json_format import MessageToDict
from pydantic import BaseModel

from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
from nucliadb_protos.writer_pb2 import Shards as PBShards

# Type variable used by the `from_message` classmethods below so that their
# return annotation matches the class they are invoked on.
_T = TypeVar("_T")


class DocumentServiceEnum(str, Enum):
    # String-valued enum used as the type of `ShardCreated.document_service`.
    # Each member's value is identical to its name.
    DOCUMENT_V0 = "DOCUMENT_V0"
    DOCUMENT_V1 = "DOCUMENT_V1"
    DOCUMENT_V2 = "DOCUMENT_V2"


class ParagraphServiceEnum(str, Enum):
    # String-valued enum used as the type of `ShardCreated.paragraph_service`.
    # Each member's value is identical to its name.
    PARAGRAPH_V0 = "PARAGRAPH_V0"
    PARAGRAPH_V1 = "PARAGRAPH_V1"
    PARAGRAPH_V2 = "PARAGRAPH_V2"
    PARAGRAPH_V3 = "PARAGRAPH_V3"


class VectorServiceEnum(str, Enum):
    # String-valued enum used as the type of `ShardCreated.vector_service`.
    # Each member's value is identical to its name.
    VECTOR_V0 = "VECTOR_V0"
    VECTOR_V1 = "VECTOR_V1"


class RelationServiceEnum(str, Enum):
    # String-valued enum used as the type of `ShardCreated.relation_service`.
    # Each member's value is identical to its name.
    RELATION_V0 = "RELATION_V0"
    RELATION_V1 = "RELATION_V1"
    RELATION_V2 = "RELATION_V2"


class ShardCreated(BaseModel):
    # Pydantic model holding a shard id together with one enum value per
    # service kind (document, paragraph, vector, relation). All fields are
    # required.
    id: str
    document_service: DocumentServiceEnum
    paragraph_service: ParagraphServiceEnum
    vector_service: VectorServiceEnum
    relation_service: RelationServiceEnum


class ShardReplica(BaseModel):
    # Pydantic model pairing a node (given as a string) with the
    # `ShardCreated` description of the shard. Both fields are required.
    node: str
    shard: ShardCreated


class ShardObject(BaseModel):
    # Pydantic model built from the `ShardObject` protobuf message: a shard
    # (given as a string) plus the list of its replicas.
    shard: str
    replicas: List[ShardReplica]

    @classmethod
    def from_message(cls: Type[_T], message: PBShardObject) -> _T:
        """Build an instance of this model from a protobuf message.

        The message is first converted to a plain dict, keeping the original
        proto field names and including fields that hold default values. The
        dict entries are then passed to the constructor as keyword arguments.
        """
        fields = MessageToDict(
            message,
            preserving_proto_field_name=True,
            including_default_value_fields=True,
        )
        return cls(**fields)


class KnowledgeboxShards(BaseModel):
    # Pydantic model built from the `Shards` protobuf message: a `kbid`
    # string plus the list of `ShardObject` entries.
    kbid: str
    shards: List[ShardObject]

    @classmethod
    def from_message(cls: Type[_T], message: PBShards) -> _T:
        """Build an instance of this model from a protobuf message.

        The message is converted to a plain dict (original proto field names
        kept, default-valued fields included) and unpacked straight into the
        constructor as keyword arguments.
        """
        return cls(
            **MessageToDict(
                message,
                preserving_proto_field_name=True,
                including_default_value_fields=True,
            )
        )
89 changes: 12 additions & 77 deletions nucliadb_models/src/nucliadb_models/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@
#
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Set, Type, TypeVar, Union
from typing import Any, Dict, List, Literal, Optional, Set, TypeVar, Union

from google.protobuf.json_format import MessageToDict
from pydantic import BaseModel, Field, field_validator, model_validator
from pydantic.json_schema import SkipJsonSchema
from typing_extensions import Annotated, Self
Expand All @@ -31,16 +30,24 @@
from nucliadb_models.resource import ExtractedDataTypeName, Resource
from nucliadb_models.security import RequestSecurity
from nucliadb_models.utils import DateTime
from nucliadb_models.vectors import SemanticModelMetadata, VectorSimilarity
from nucliadb_protos.audit_pb2 import ClientType
from nucliadb_protos.nodereader_pb2 import DocumentScored, OrderBy
from nucliadb_protos.nodereader_pb2 import ParagraphResult as PBParagraphResult
from nucliadb_protos.utils_pb2 import RelationNode
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
from nucliadb_protos.writer_pb2 import Shards as PBShards

# Bw/c import to avoid breaking users
from nucliadb_models.internal.predict import Ner, QueryInfo, SentenceSearch, TokenSearch # noqa isort: skip
from nucliadb_models.internal.shards import ( # noqa isort: skip
DocumentServiceEnum,
ParagraphServiceEnum,
VectorServiceEnum,
RelationServiceEnum,
ShardCreated,
ShardObject,
ShardReplica,
KnowledgeboxShards,
)


_T = TypeVar("_T")

Expand Down Expand Up @@ -381,78 +388,6 @@ class KnowledgeBoxCount(BaseModel):
sentences: int


class DocumentServiceEnum(str, Enum):
    # String-valued enum used as the type of `ShardCreated.document_service`.
    # Each member's value is identical to its name.
    DOCUMENT_V0 = "DOCUMENT_V0"
    DOCUMENT_V1 = "DOCUMENT_V1"
    DOCUMENT_V2 = "DOCUMENT_V2"


class ParagraphServiceEnum(str, Enum):
    # String-valued enum used as the type of `ShardCreated.paragraph_service`.
    # Each member's value is identical to its name.
    PARAGRAPH_V0 = "PARAGRAPH_V0"
    PARAGRAPH_V1 = "PARAGRAPH_V1"
    PARAGRAPH_V2 = "PARAGRAPH_V2"
    PARAGRAPH_V3 = "PARAGRAPH_V3"


class VectorServiceEnum(str, Enum):
    # String-valued enum used as the type of `ShardCreated.vector_service`.
    # Each member's value is identical to its name.
    VECTOR_V0 = "VECTOR_V0"
    VECTOR_V1 = "VECTOR_V1"


class RelationServiceEnum(str, Enum):
    # String-valued enum used as the type of `ShardCreated.relation_service`.
    # Each member's value is identical to its name.
    RELATION_V0 = "RELATION_V0"
    RELATION_V1 = "RELATION_V1"
    RELATION_V2 = "RELATION_V2"


class ShardCreated(BaseModel):
    # Pydantic model holding a shard id together with one enum value per
    # service kind (document, paragraph, vector, relation). All fields are
    # required.
    id: str
    document_service: DocumentServiceEnum
    paragraph_service: ParagraphServiceEnum
    vector_service: VectorServiceEnum
    relation_service: RelationServiceEnum


class ShardReplica(BaseModel):
    # Pydantic model pairing a node (given as a string) with the
    # `ShardCreated` description of the shard. Both fields are required.
    node: str
    shard: ShardCreated


class ShardObject(BaseModel):
    # Pydantic model built from the `ShardObject` protobuf message: a shard
    # (given as a string) plus the list of its replicas.
    shard: str
    replicas: List[ShardReplica]

    @classmethod
    def from_message(cls: Type[_T], message: PBShardObject) -> _T:
        """Build an instance of this model from a protobuf message.

        The message is first converted to a plain dict, keeping the original
        proto field names and including fields that hold default values. The
        dict entries are then passed to the constructor as keyword arguments.
        """
        fields = MessageToDict(
            message,
            preserving_proto_field_name=True,
            including_default_value_fields=True,
        )
        return cls(**fields)


class KnowledgeboxShards(BaseModel):
    # Pydantic model built from the `Shards` protobuf message. `model` is the
    # only optional field and defaults to None.
    kbid: str
    actual: int
    similarity: VectorSimilarity
    shards: List[ShardObject]
    model: Optional[SemanticModelMetadata] = None

    @classmethod
    def from_message(cls: Type[_T], message: PBShards) -> _T:
        """Build an instance of this model from a protobuf message.

        The message is converted to a plain dict (original proto field names
        kept, default-valued fields included). The `similarity` entry is then
        replaced by the result of `VectorSimilarity.from_message`, and `model`
        is replaced by the result of `SemanticModelMetadata.from_message` only
        when the message reports the `model` field as set.
        """
        payload = MessageToDict(
            message,
            preserving_proto_field_name=True,
            including_default_value_fields=True,
        )

        # Overwrite the generic dict conversion with the dedicated model
        # conversion for the similarity value.
        payload["similarity"] = VectorSimilarity.from_message(message.similarity)

        # `model` is only converted when present on the message; otherwise
        # whatever MessageToDict produced (if anything) is left untouched.
        has_model = message.HasField("model")
        if has_model:
            payload["model"] = SemanticModelMetadata.from_message(message.model)

        return cls(**payload)


class SearchParamDefaults:
query = ParamDefault(default="", title="Query", description="The query to search for")
suggest_query = ParamDefault(
Expand Down

0 comments on commit 88804b1

Please sign in to comment.