Nidx search test (#2619)
* Migrate date filter test

* Migrate more search tests

* Copy remaining search tests: suggest pending implementation

* Port suggest to nidx

* Missing shard_suggest module

* Add basic tests for shards with/without vectorsets

* Implement add and list vectorsets

* Remove unneeded allow

* Remove rayon from relations and don't parallelize suggest

* Implement remove vectorsets

* cargo fmt

* Change error string

Co-authored-by: Javier Torres <[email protected]>

---------

Co-authored-by: Joan Antoni RE <[email protected]>
javitonino and jotare authored Nov 12, 2024
1 parent c29bd5f commit 201f6bd
Showing 24 changed files with 2,725 additions and 31 deletions.
1 change: 1 addition & 0 deletions nidx/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions nidx/Cargo.toml
@@ -36,5 +36,6 @@ tower = "0.5.1"
http = "1.1.0"

[dev-dependencies]
itertools = "0.13.0"
nidx_tests = { path = "nidx_tests" }
rand = "0.8.5"
2 changes: 1 addition & 1 deletion nidx/nidx_paragraph/src/lib.rs
@@ -87,7 +87,7 @@ impl ParagraphIndexer {
let mut indexer = TantivyIndexer::new(output_dir.to_path_buf(), field_schema.schema.clone())?;

index_paragraphs(&mut indexer, resource, field_schema)?;
Ok(Some(indexer.finalize()?))
indexer.finalize()
}

pub fn deletions_for_resource(&self, resource: &nidx_protos::Resource) -> Vec<String> {
48 changes: 44 additions & 4 deletions nidx/nidx_relation/src/lib.rs
@@ -23,16 +23,19 @@ mod reader;
mod resource_indexer;
mod schema;

use nidx_protos::{resource::ResourceStatus, RelationSearchRequest, RelationSearchResponse};
use nidx_protos::{
relation_node::NodeType, resource::ResourceStatus, RelationNode, RelationNodeFilter, RelationPrefixSearchRequest,
RelationSearchRequest, RelationSearchResponse,
};
use nidx_tantivy::{
index_reader::{open_index_with_deletions, DeletionQueryBuilder},
TantivyIndexer, TantivyMeta, TantivySegmentMetadata,
};
use nidx_types::OpenIndexMetadata;
use reader::RelationsReaderService;
use reader::{HashedRelationNode, RelationsReaderService};
use resource_indexer::index_relations;
pub use schema::Schema as RelationSchema;
use std::path::Path;
use std::{collections::HashSet, path::Path};
use tantivy::{
directory::MmapDirectory,
indexer::merge_indices,
@@ -41,6 +44,11 @@ use tantivy::{
Term,
};

/// Minimum length for a word to be accepted as an entity to search for
/// suggestions. Low values can introduce too much noise, while higher ones can
/// remove important words from suggestions
const MIN_SUGGEST_PREFIX_LENGTH: usize = 2;

pub struct RelationIndexer;

pub struct RelationDeletionQueryBuilder(Field);
@@ -69,7 +77,7 @@ impl RelationIndexer {
}

index_relations(&mut indexer, resource, field_schema)?;
Ok(Some(indexer.finalize()?))
indexer.finalize()
}

pub fn deletions_for_resource(&self, resource: &nidx_protos::Resource) -> Vec<String> {
@@ -120,4 +128,36 @@ impl RelationSearcher {
pub fn search(&self, request: &RelationSearchRequest) -> anyhow::Result<RelationSearchResponse> {
self.reader.search(request)
}

pub fn suggest(&self, prefixes: Vec<String>) -> Vec<RelationNode> {
let requests =
prefixes.iter().filter(|prefix| prefix.len() >= MIN_SUGGEST_PREFIX_LENGTH).cloned().map(|prefix| {
RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix,
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Entity.into(),
..Default::default()
}],
}),
..Default::default()
}
});

let responses: Vec<_> = requests.map(|request| self.search(&request)).collect();

// REVIEW: we are ignoring search errors; we may want to at least log something
let entities = responses
.into_iter()
.flatten() // unwrap errors and continue with successful results
.flat_map(|response| response.prefix)
.flat_map(|prefix_response| prefix_response.nodes.into_iter());

// remove duplicate entities
let mut seen: HashSet<HashedRelationNode> = HashSet::new();
let mut ent_result = entities.collect::<Vec<_>>();
ent_result.retain(|e| seen.insert(e.clone().into()));

ent_result
}
}
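The deduplication at the end of suggest leans on HashSet::insert returning false for keys it has already seen, so retain keeps only the first occurrence of each node while preserving response order. A self-contained sketch of the same idiom, with plain strings standing in for HashedRelationNode:

use std::collections::HashSet;

fn dedup_preserving_order(entities: Vec<String>) -> Vec<String> {
    let mut seen: HashSet<String> = HashSet::new();
    let mut result = entities;
    // `insert` returns false for duplicates, so `retain` drops every
    // occurrence after the first while keeping the original order.
    result.retain(|e| seen.insert(e.clone()));
    result
}

fn main() {
    let nodes = vec!["Anna".to_string(), "Barcelona".to_string(), "Anna".to_string()];
    assert_eq!(dedup_preserving_order(nodes), ["Anna", "Barcelona"]);
}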
12 changes: 9 additions & 3 deletions nidx/nidx_tantivy/src/lib.rs
@@ -43,6 +43,7 @@ pub type TantivySegmentMetadata = SegmentMetadata<TantivyMeta>;
pub struct TantivyIndexer {
writer: SingleSegmentIndexWriter,
output_path: PathBuf,
has_documents: bool,
}

impl TantivyIndexer {
@@ -52,25 +53,30 @@ impl TantivyIndexer {
Ok(Self {
writer,
output_path: output_dir,
has_documents: false,
})
}

pub fn add_document(&mut self, doc: TantivyDocument) -> tantivy::Result<()> {
self.has_documents = true;
self.writer.add_document(doc)
}

pub fn finalize(self) -> anyhow::Result<SegmentMetadata<TantivyMeta>> {
pub fn finalize(self) -> anyhow::Result<Option<SegmentMetadata<TantivyMeta>>> {
if !self.has_documents {
return Ok(None);
}
let index = self.writer.finalize()?;
let segments = index.searchable_segment_metas()?;
assert_eq!(segments.len(), 1);
let segment = &segments[0];

Ok(SegmentMetadata {
Ok(Some(SegmentMetadata {
path: self.output_path,
records: segment.max_doc() as usize,
index_metadata: TantivyMeta {
segment_id: segment.id().uuid_string(),
},
})
}))
}
}
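With has_documents tracked, finalize now reports an empty indexer as Ok(None) instead of producing (and asserting on) a zero-document segment, which is why the paragraph, relation, and text indexers above can return indexer.finalize() directly. A minimal sketch of that contract, where Indexer and SegmentMeta are stand-ins rather than the real nidx types:

struct SegmentMeta {
    records: usize,
}

struct Indexer {
    docs: usize,
}

impl Indexer {
    fn add_document(&mut self) {
        self.docs += 1;
    }

    // Mirrors TantivyIndexer::finalize: no documents means no segment.
    fn finalize(self) -> Option<SegmentMeta> {
        if self.docs == 0 {
            return None;
        }
        Some(SegmentMeta { records: self.docs })
    }
}

fn main() {
    let empty = Indexer { docs: 0 };
    assert!(empty.finalize().is_none()); // empty resources create no segment
}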
132 changes: 132 additions & 0 deletions nidx/nidx_tests/src/lib.rs
@@ -21,6 +21,8 @@ use std::{collections::HashMap, time::SystemTime};

use nidx_protos::prost_types::Timestamp;
use nidx_protos::*;
use relation::RelationType;
use relation_node::NodeType;
use uuid::Uuid;

pub fn minimal_resource(shard_id: String) -> Resource {
@@ -131,3 +133,133 @@ pub fn little_prince(shard_id: impl Into<String>, vectorsets: Option<&[&str]>) -

resource
}

pub fn people_and_places(shard_id: impl Into<String>) -> Resource {
let shard_id = shard_id.into();

let mut resource = minimal_resource(shard_id);
let rid = &resource.resource.as_ref().unwrap().uuid;

resource.texts.insert(
format!("{}/title", resource.resource.as_ref().unwrap().uuid),
TextInformation {
text: "People and places".to_string(),
..Default::default()
},
);
resource.texts.insert(
format!("{}/summary", resource.resource.as_ref().unwrap().uuid),
TextInformation {
text: "Test entities to validate suggest on relations index".to_string(),
..Default::default()
},
);

let resource_node = RelationNode {
value: rid.clone(),
ntype: NodeType::Resource as i32,
subtype: String::new(),
};

let collaborators = ["Anastasia", "Irene"].into_iter().map(|collaborator| RelationNode {
value: collaborator.to_string(),
ntype: NodeType::User as i32,
subtype: "".to_string(),
});

let people = ["Anna", "Anthony", "Bárcenas", "Ben", "John"].into_iter().map(|person| RelationNode {
value: person.to_string(),
ntype: NodeType::Entity as i32,
subtype: "person".to_string(),
});

let cities = ["Barcelona", "New York", "York"].into_iter().map(|city| RelationNode {
value: city.to_string(),
ntype: NodeType::Entity as i32,
subtype: "city".to_string(),
});

let countries = ["Israel", "Netherlands", "Solomon Islands"].into_iter().map(|country| RelationNode {
value: country.to_string(),
ntype: NodeType::Entity as i32,
subtype: "country".to_string(),
});

let entities = people.chain(cities).chain(countries);

let mut relations = vec![];
relations.extend(collaborators.map(|node| Relation {
relation: RelationType::Colab as i32,
source: Some(resource_node.clone()),
to: Some(node),
..Default::default()
}));
relations.extend(entities.map(|node| Relation {
relation: RelationType::Entity as i32,
source: Some(resource_node.clone()),
to: Some(node),
..Default::default()
}));

resource.relations.extend(relations);

resource
}

pub fn thus_spoke_zarathustra(shard_id: impl Into<String>) -> Resource {
let shard_id = shard_id.into();
let mut resource = minimal_resource(shard_id);
let rid = &resource.resource.as_ref().unwrap().uuid;

resource.labels.push("/s/p/de".to_string()); // language=de

resource.texts.insert(
"a/title".to_string(),
TextInformation {
text: "Thus Spoke Zarathustra".to_string(),
..Default::default()
},
);
let mut title_paragraphs = HashMap::new();
title_paragraphs.insert(
format!("{rid}/a/title/0-22"),
IndexParagraph {
start: 0,
end: 22,
field: "a/title".to_string(),
..Default::default()
},
);
resource.paragraphs.insert(
"a/title".to_string(),
IndexParagraphs {
paragraphs: title_paragraphs,
},
);

resource.texts.insert(
"a/summary".to_string(),
TextInformation {
text: "Philosophical book written by Frederich Nietzche".to_string(),
..Default::default()
},
);
let mut summary_paragraphs = HashMap::new();
summary_paragraphs.insert(
format!("{rid}/a/summary/0-48"),
IndexParagraph {
start: 0,
end: 48,
field: "a/summary".to_string(),
..Default::default()
},
);
resource.paragraphs.insert(
"a/summary".to_string(),
IndexParagraphs {
paragraphs: summary_paragraphs,
},
);

resource
}
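The byte ranges encoded in the paragraph keys ("0-22", "0-48") must agree with the start/end fields and the length of the indexed text. A standalone sanity check of that invariant for this fixture (not part of the test suite, just an illustration):

fn main() {
    let title = "Thus Spoke Zarathustra";
    let summary = "Philosophical book written by Frederich Nietzche";
    // Key "{rid}/a/title/0-22" and `end: 22` both encode the title length.
    assert_eq!(title.len(), 22);
    // Likewise "{rid}/a/summary/0-48" matches the summary length.
    assert_eq!(summary.len(), 48);
}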
2 changes: 1 addition & 1 deletion nidx/nidx_text/src/lib.rs
@@ -64,7 +64,7 @@ impl TextIndexer {
let mut indexer = TantivyIndexer::new(output_dir.to_path_buf(), field_schema.schema.clone())?;

index_document(&mut indexer, resource, field_schema)?;
Ok(Some(indexer.finalize()?))
indexer.finalize()
}

pub fn deletions_for_resource(&self, resource: &nidx_protos::Resource) -> Vec<String> {
66 changes: 59 additions & 7 deletions nidx/src/api/grpc.rs
@@ -23,7 +23,7 @@ use std::str::FromStr;

use crate::errors::NidxError;
use crate::grpc_server::RemappedGrpcService;
use crate::metadata::{IndexKind, Shard};
use crate::metadata::{Index, IndexKind, Shard};
use nidx_protos::nidx::nidx_api_server::*;
use nidx_protos::*;
use nidx_vector::config::VectorConfig;
@@ -74,6 +74,7 @@ impl NidxApi for ApiServer {
sentences: *index_stats.get(&IndexKind::Vector).unwrap_or(&0) as u64,
}))
}

async fn new_shard(&self, request: Request<NewShardRequest>) -> Result<Response<ShardCreated>> {
// TODO? analytics event
let request = request.into_inner();
@@ -117,16 +118,67 @@
}))
}

async fn add_vector_set(&self, _request: Request<NewVectorSetRequest>) -> Result<Response<OpStatus>> {
todo!()
async fn add_vector_set(&self, request: Request<NewVectorSetRequest>) -> Result<Response<OpStatus>> {
let request = request.into_inner();
let Some(VectorSetId {
shard: Some(ShardId {
id: ref shard_id,
}),
ref vectorset,
}) = request.id
else {
return Err(NidxError::invalid("Vectorset ID is required").into());
};
let shard_id = Uuid::from_str(shard_id).map_err(NidxError::from)?;
let config = if let Some(config) = request.config {
VectorConfig::try_from(config)
.map_err(|error| NidxError::invalid(&format!("Invalid vectorset configuration: {error:?}")))?
} else {
return Err(NidxError::invalid("Vectorset configuration is required").into());
};

Index::create(&self.meta.pool, shard_id, vectorset, config.into()).await.map_err(NidxError::from)?;

Ok(Response::new(OpStatus {
status: op_status::Status::Ok.into(),
detail: "Vectorset successfully created".to_string(),
..Default::default()
}))
}

async fn remove_vector_set(&self, _request: Request<VectorSetId>) -> Result<Response<OpStatus>> {
todo!()
async fn remove_vector_set(&self, request: Request<VectorSetId>) -> Result<Response<OpStatus>> {
let VectorSetId {
shard: Some(ShardId {
id: ref shard_id,
}),
ref vectorset,
} = request.into_inner()
else {
return Err(NidxError::invalid("Vectorset ID is required").into());
};
let shard_id = Uuid::from_str(shard_id).map_err(NidxError::from)?;

shards::delete_vectorset(&self.meta, shard_id, vectorset).await?;

Ok(Response::new(OpStatus {
status: op_status::Status::Ok.into(),
detail: "Vectorset successfully deleted".to_string(),
..Default::default()
}))
}

async fn list_vector_sets(&self, _request: Request<ShardId>) -> Result<Response<VectorSetList>> {
todo!()
async fn list_vector_sets(&self, request: Request<ShardId>) -> Result<Response<VectorSetList>> {
let request = request.into_inner();
let shard_id = Uuid::from_str(&request.id).map_err(NidxError::from)?;
// TODO: query only vector indexes
let indexes = Index::for_shard(&self.meta.pool, shard_id).await.map_err(NidxError::from)?;

let vectorsets =
indexes.into_iter().filter(|index| index.kind == IndexKind::Vector).map(|index| index.name).collect();
Ok(tonic::Response::new(VectorSetList {
shard: Some(request),
vectorsets,
}))
}

async fn get_metadata(&self, _request: Request<EmptyQuery>) -> Result<Response<NodeMetadata>> {
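For reference, the request shape add_vector_set destructures can be built client-side as below. This is a hedged sketch using only the fields matched in the handler above; the vectorset name is an arbitrary example, and config is deliberately left as None, which the handler rejects with "Vectorset configuration is required":

use nidx_protos::{NewVectorSetRequest, ShardId, VectorSetId};

fn example_request(shard_uuid: &str) -> NewVectorSetRequest {
    NewVectorSetRequest {
        id: Some(VectorSetId {
            shard: Some(ShardId {
                id: shard_uuid.to_string(),
            }),
            vectorset: "my-vectorset".to_string(), // hypothetical name
        }),
        // A real call must supply a vectorset configuration here.
        config: None,
        ..Default::default()
    }
}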