Nidx search test (#2619)
* Migrate date filter test

* Migrate more search tests

* Copy remaining search tests: suggest pending implementation

* Port suggest to nidx

* Missing shard_suggest module

* Add basic tests for shards with/without vectorsets

* Implement add and list vectorsets

* Remove unneeded allow

* Remove rayon from relations and don't parallelize suggest

* Implement remove vectorsets

* cargo fmt

* Change error string

Co-authored-by: Javier Torres <[email protected]>

---------

Co-authored-by: Joan Antoni RE <[email protected]>
javitonino and jotare authored Nov 12, 2024
1 parent c29bd5f commit 201f6bd
Showing 24 changed files with 2,725 additions and 31 deletions.
1 change: 1 addition & 0 deletions nidx/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions nidx/Cargo.toml
@@ -36,5 +36,6 @@ tower = "0.5.1"
http = "1.1.0"

[dev-dependencies]
itertools = "0.13.0"
nidx_tests = { path = "nidx_tests" }
rand = "0.8.5"
2 changes: 1 addition & 1 deletion nidx/nidx_paragraph/src/lib.rs
@@ -87,7 +87,7 @@ impl ParagraphIndexer {
let mut indexer = TantivyIndexer::new(output_dir.to_path_buf(), field_schema.schema.clone())?;

index_paragraphs(&mut indexer, resource, field_schema)?;
Ok(Some(indexer.finalize()?))
indexer.finalize()
}

pub fn deletions_for_resource(&self, resource: &nidx_protos::Resource) -> Vec<String> {
48 changes: 44 additions & 4 deletions nidx/nidx_relation/src/lib.rs
@@ -23,16 +23,19 @@ mod reader;
mod resource_indexer;
mod schema;

use nidx_protos::{resource::ResourceStatus, RelationSearchRequest, RelationSearchResponse};
use nidx_protos::{
relation_node::NodeType, resource::ResourceStatus, RelationNode, RelationNodeFilter, RelationPrefixSearchRequest,
RelationSearchRequest, RelationSearchResponse,
};
use nidx_tantivy::{
index_reader::{open_index_with_deletions, DeletionQueryBuilder},
TantivyIndexer, TantivyMeta, TantivySegmentMetadata,
};
use nidx_types::OpenIndexMetadata;
use reader::RelationsReaderService;
use reader::{HashedRelationNode, RelationsReaderService};
use resource_indexer::index_relations;
pub use schema::Schema as RelationSchema;
use std::path::Path;
use std::{collections::HashSet, path::Path};
use tantivy::{
directory::MmapDirectory,
indexer::merge_indices,
@@ -41,6 +44,11 @@ use tantivy::{
Term,
};

/// Minimum length for a word to be accepted as an entity to search for
/// suggestions. Low values can introduce too much noise, while higher ones can
/// remove important words from suggestions
const MIN_SUGGEST_PREFIX_LENGTH: usize = 2;

pub struct RelationIndexer;

pub struct RelationDeletionQueryBuilder(Field);
@@ -69,7 +77,7 @@ impl RelationIndexer {
}

index_relations(&mut indexer, resource, field_schema)?;
Ok(Some(indexer.finalize()?))
indexer.finalize()
}

pub fn deletions_for_resource(&self, resource: &nidx_protos::Resource) -> Vec<String> {
@@ -120,4 +128,36 @@ impl RelationSearcher {
pub fn search(&self, request: &RelationSearchRequest) -> anyhow::Result<RelationSearchResponse> {
self.reader.search(request)
}

pub fn suggest(&self, prefixes: Vec<String>) -> Vec<RelationNode> {
let requests =
prefixes.iter().filter(|prefix| prefix.len() >= MIN_SUGGEST_PREFIX_LENGTH).cloned().map(|prefix| {
RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix,
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Entity.into(),
..Default::default()
}],
}),
..Default::default()
}
});

let responses: Vec<_> = requests.map(|request| self.search(&request)).collect();

// REVIEW: we are ignoring search errors; we may want to at least log something
let entities = responses
.into_iter()
.flatten() // unwrap errors and continue with successful results
.flat_map(|response| response.prefix)
.flat_map(|prefix_response| prefix_response.nodes.into_iter());

// remove duplicate entities
let mut seen: HashSet<HashedRelationNode> = HashSet::new();
let mut ent_result = entities.collect::<Vec<_>>();
ent_result.retain(|e| seen.insert(e.clone().into()));

ent_result
}
}
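The deduplication at the end of suggest leans on HashSet::insert returning false for keys it has already seen, so retain keeps only the first occurrence of each node while preserving response order. A self-contained sketch of the same idiom, with plain strings standing in for HashedRelationNode:

use std::collections::HashSet;

fn dedup_preserving_order(entities: Vec<String>) -> Vec<String> {
    let mut seen: HashSet<String> = HashSet::new();
    let mut result = entities;
    // `insert` returns false for duplicates, so `retain` drops every
    // occurrence after the first while keeping the original order.
    result.retain(|e| seen.insert(e.clone()));
    result
}

fn main() {
    let nodes = vec!["Anna".to_string(), "Barcelona".to_string(), "Anna".to_string()];
    assert_eq!(dedup_preserving_order(nodes), ["Anna", "Barcelona"]);
}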
12 changes: 9 additions & 3 deletions nidx/nidx_tantivy/src/lib.rs
@@ -43,6 +43,7 @@ pub type TantivySegmentMetadata = SegmentMetadata<TantivyMeta>;
pub struct TantivyIndexer {
writer: SingleSegmentIndexWriter,
output_path: PathBuf,
has_documents: bool,
}

impl TantivyIndexer {
@@ -52,25 +53,30 @@ impl TantivyIndexer {
Ok(Self {
writer,
output_path: output_dir,
has_documents: false,
})
}

pub fn add_document(&mut self, doc: TantivyDocument) -> tantivy::Result<()> {
self.has_documents = true;
self.writer.add_document(doc)
}

pub fn finalize(self) -> anyhow::Result<SegmentMetadata<TantivyMeta>> {
pub fn finalize(self) -> anyhow::Result<Option<SegmentMetadata<TantivyMeta>>> {
if !self.has_documents {
return Ok(None);
}
let index = self.writer.finalize()?;
let segments = index.searchable_segment_metas()?;
assert_eq!(segments.len(), 1);
let segment = &segments[0];

Ok(SegmentMetadata {
Ok(Some(SegmentMetadata {
path: self.output_path,
records: segment.max_doc() as usize,
index_metadata: TantivyMeta {
segment_id: segment.id().uuid_string(),
},
})
}))
}
}
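With has_documents tracked, finalize now reports an empty indexer as Ok(None) instead of producing (and asserting on) a zero-document segment, which is why the paragraph, relation, and text indexers above can return indexer.finalize() directly. A minimal sketch of that contract, where Indexer and SegmentMeta are stand-ins rather than the real nidx types:

struct SegmentMeta {
    records: usize,
}

struct Indexer {
    docs: usize,
}

impl Indexer {
    fn add_document(&mut self) {
        self.docs += 1;
    }

    // Mirrors TantivyIndexer::finalize: no documents means no segment.
    fn finalize(self) -> Option<SegmentMeta> {
        if self.docs == 0 {
            return None;
        }
        Some(SegmentMeta { records: self.docs })
    }
}

fn main() {
    let empty = Indexer { docs: 0 };
    assert!(empty.finalize().is_none()); // empty resources create no segment
}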
132 changes: 132 additions & 0 deletions nidx/nidx_tests/src/lib.rs
@@ -21,6 +21,8 @@ use std::{collections::HashMap, time::SystemTime};

use nidx_protos::prost_types::Timestamp;
use nidx_protos::*;
use relation::RelationType;
use relation_node::NodeType;
use uuid::Uuid;

pub fn minimal_resource(shard_id: String) -> Resource {
@@ -131,3 +133,133 @@ pub fn little_prince(shard_id: impl Into<String>, vectorsets: Option<&[&str]>) -

resource
}

pub fn people_and_places(shard_id: impl Into<String>) -> Resource {
let shard_id = shard_id.into();

let mut resource = minimal_resource(shard_id);
let rid = &resource.resource.as_ref().unwrap().uuid;

resource.texts.insert(
format!("{}/title", resource.resource.as_ref().unwrap().uuid),
TextInformation {
text: "People and places".to_string(),
..Default::default()
},
);
resource.texts.insert(
format!("{}/summary", resource.resource.as_ref().unwrap().uuid),
TextInformation {
text: "Test entities to validate suggest on relations index".to_string(),
..Default::default()
},
);

let resource_node = RelationNode {
value: rid.clone(),
ntype: NodeType::Resource as i32,
subtype: String::new(),
};

let collaborators = ["Anastasia", "Irene"].into_iter().map(|collaborator| RelationNode {
value: collaborator.to_string(),
ntype: NodeType::User as i32,
subtype: "".to_string(),
});

let people = ["Anna", "Anthony", "Bárcenas", "Ben", "John"].into_iter().map(|person| RelationNode {
value: person.to_string(),
ntype: NodeType::Entity as i32,
subtype: "person".to_string(),
});

let cities = ["Barcelona", "New York", "York"].into_iter().map(|city| RelationNode {
value: city.to_string(),
ntype: NodeType::Entity as i32,
subtype: "city".to_string(),
});

let countries = ["Israel", "Netherlands", "Solomon Islands"].into_iter().map(|country| RelationNode {
value: country.to_string(),
ntype: NodeType::Entity as i32,
subtype: "country".to_string(),
});

let entities = people.chain(cities).chain(countries);

let mut relations = vec![];
relations.extend(collaborators.map(|node| Relation {
relation: RelationType::Colab as i32,
source: Some(resource_node.clone()),
to: Some(node),
..Default::default()
}));
relations.extend(entities.map(|node| Relation {
relation: RelationType::Entity as i32,
source: Some(resource_node.clone()),
to: Some(node),
..Default::default()
}));

resource.relations.extend(relations);

resource
}

pub fn thus_spoke_zarathustra(shard_id: impl Into<String>) -> Resource {
let shard_id = shard_id.into();
let mut resource = minimal_resource(shard_id);
let rid = &resource.resource.as_ref().unwrap().uuid;

resource.labels.push("/s/p/de".to_string()); // language=de

resource.texts.insert(
"a/title".to_string(),
TextInformation {
text: "Thus Spoke Zarathustra".to_string(),
..Default::default()
},
);
let mut title_paragraphs = HashMap::new();
title_paragraphs.insert(
format!("{rid}/a/title/0-22"),
IndexParagraph {
start: 0,
end: 22,
field: "a/title".to_string(),
..Default::default()
},
);
resource.paragraphs.insert(
"a/title".to_string(),
IndexParagraphs {
paragraphs: title_paragraphs,
},
);

resource.texts.insert(
"a/summary".to_string(),
TextInformation {
text: "Philosophical book written by Frederich Nietzche".to_string(),
..Default::default()
},
);
let mut summary_paragraphs = HashMap::new();
summary_paragraphs.insert(
format!("{rid}/a/summary/0-48"),
IndexParagraph {
start: 0,
end: 48,
field: "a/summary".to_string(),
..Default::default()
},
);
resource.paragraphs.insert(
"a/summary".to_string(),
IndexParagraphs {
paragraphs: summary_paragraphs,
},
);

resource
}
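The byte ranges encoded in the paragraph keys ("0-22", "0-48") must agree with the start/end fields and the length of the indexed text. A standalone sanity check of that invariant for this fixture (not part of the test suite, just an illustration):

fn main() {
    let title = "Thus Spoke Zarathustra";
    let summary = "Philosophical book written by Frederich Nietzche";
    // Key "{rid}/a/title/0-22" and `end: 22` both encode the title length.
    assert_eq!(title.len(), 22);
    // Likewise "{rid}/a/summary/0-48" matches the summary length.
    assert_eq!(summary.len(), 48);
}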
2 changes: 1 addition & 1 deletion nidx/nidx_text/src/lib.rs
@@ -64,7 +64,7 @@ impl TextIndexer {
let mut indexer = TantivyIndexer::new(output_dir.to_path_buf(), field_schema.schema.clone())?;

index_document(&mut indexer, resource, field_schema)?;
Ok(Some(indexer.finalize()?))
indexer.finalize()
}

pub fn deletions_for_resource(&self, resource: &nidx_protos::Resource) -> Vec<String> {
66 changes: 59 additions & 7 deletions nidx/src/api/grpc.rs
@@ -23,7 +23,7 @@ use std::str::FromStr;

use crate::errors::NidxError;
use crate::grpc_server::RemappedGrpcService;
use crate::metadata::{IndexKind, Shard};
use crate::metadata::{Index, IndexKind, Shard};
use nidx_protos::nidx::nidx_api_server::*;
use nidx_protos::*;
use nidx_vector::config::VectorConfig;
@@ -74,6 +74,7 @@ impl NidxApi for ApiServer {
sentences: *index_stats.get(&IndexKind::Vector).unwrap_or(&0) as u64,
}))
}

async fn new_shard(&self, request: Request<NewShardRequest>) -> Result<Response<ShardCreated>> {
// TODO? analytics event
let request = request.into_inner();
@@ -117,16 +118,67 @@
}))
}

async fn add_vector_set(&self, _request: Request<NewVectorSetRequest>) -> Result<Response<OpStatus>> {
todo!()
async fn add_vector_set(&self, request: Request<NewVectorSetRequest>) -> Result<Response<OpStatus>> {
let request = request.into_inner();
let Some(VectorSetId {
shard: Some(ShardId {
id: ref shard_id,
}),
ref vectorset,
}) = request.id
else {
return Err(NidxError::invalid("Vectorset ID is required").into());
};
let shard_id = Uuid::from_str(shard_id).map_err(NidxError::from)?;
let config = if let Some(config) = request.config {
VectorConfig::try_from(config)
.map_err(|error| NidxError::invalid(&format!("Invalid vectorset configuration: {error:?}")))?
} else {
return Err(NidxError::invalid("Vectorset configuration is required").into());
};

Index::create(&self.meta.pool, shard_id, vectorset, config.into()).await.map_err(NidxError::from)?;

Ok(Response::new(OpStatus {
status: op_status::Status::Ok.into(),
detail: "Vectorset successfully created".to_string(),
..Default::default()
}))
}

async fn remove_vector_set(&self, _request: Request<VectorSetId>) -> Result<Response<OpStatus>> {
todo!()
async fn remove_vector_set(&self, request: Request<VectorSetId>) -> Result<Response<OpStatus>> {
let VectorSetId {
shard: Some(ShardId {
id: ref shard_id,
}),
ref vectorset,
} = request.into_inner()
else {
return Err(NidxError::invalid("Vectorset ID is required").into());
};
let shard_id = Uuid::from_str(shard_id).map_err(NidxError::from)?;

shards::delete_vectorset(&self.meta, shard_id, vectorset).await?;

Ok(Response::new(OpStatus {
status: op_status::Status::Ok.into(),
detail: "Vectorset successfully deleted".to_string(),
..Default::default()
}))
}

async fn list_vector_sets(&self, _request: Request<ShardId>) -> Result<Response<VectorSetList>> {
todo!()
async fn list_vector_sets(&self, request: Request<ShardId>) -> Result<Response<VectorSetList>> {
let request = request.into_inner();
let shard_id = Uuid::from_str(&request.id).map_err(NidxError::from)?;
// TODO: query only vector indexes
let indexes = Index::for_shard(&self.meta.pool, shard_id).await.map_err(NidxError::from)?;

let vectorsets =
indexes.into_iter().filter(|index| index.kind == IndexKind::Vector).map(|index| index.name).collect();
Ok(tonic::Response::new(VectorSetList {
shard: Some(request),
vectorsets,
}))
}

async fn get_metadata(&self, _request: Request<EmptyQuery>) -> Result<Response<NodeMetadata>> {
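For reference, the request shape add_vector_set destructures can be built client-side as below. This is a hedged sketch using only the fields matched in the handler above; the vectorset name is an arbitrary example, and config is deliberately left as None, which the handler rejects with "Vectorset configuration is required":

use nidx_protos::{NewVectorSetRequest, ShardId, VectorSetId};

fn example_request(shard_uuid: &str) -> NewVectorSetRequest {
    NewVectorSetRequest {
        id: Some(VectorSetId {
            shard: Some(ShardId {
                id: shard_uuid.to_string(),
            }),
            vectorset: "my-vectorset".to_string(), // hypothetical name
        }),
        // A real call must supply a vectorset configuration here.
        config: None,
        ..Default::default()
    }
}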