From df3465914b3166b620e5342d807488d2ce809d5d Mon Sep 17 00:00:00 2001 From: Joan Antoni RE Date: Mon, 11 Nov 2024 16:52:01 +0100 Subject: [PATCH] Missing shard_suggest module --- nidx/src/searcher/shard_suggest.rs | 215 +++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 nidx/src/searcher/shard_suggest.rs diff --git a/nidx/src/searcher/shard_suggest.rs b/nidx/src/searcher/shard_suggest.rs new file mode 100644 index 0000000000..ab4ae9d79e --- /dev/null +++ b/nidx/src/searcher/shard_suggest.rs @@ -0,0 +1,215 @@ +// Copyright (C) 2021 Bosutech XXI S.L. +// +// nucliadb is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at info@nuclia.com. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . +// + +use std::{collections::HashSet, sync::Arc}; + +use nidx_paragraph::ParagraphSearcher; +use nidx_protos::{RelationPrefixSearchResponse, SuggestFeatures, SuggestRequest, SuggestResponse}; +use nidx_relation::RelationSearcher; +use nidx_text::{ + prefilter::{PreFilterRequest, ValidFieldCollector}, + TextSearcher, +}; +use nidx_types::query_language::{BooleanExpression, BooleanOperation, Operator, QueryContext}; + +use crate::{ + metadata::{Index, IndexKind}, + NidxMetadata, +}; + +use super::{index_cache::IndexCache, query_language}; + +/// Max number of words accepted as a suggest query. This is useful for +/// compounds with semantic meaning (like a name and a surname) but can add +/// irrelevant words to queries +const MAX_SUGGEST_COMPOUND_WORDS: usize = 3; + +/// Suggest gives possible strings to autocomplete a partial query that's been +/// written. To do so, it searches keyword and relation indexes to find good +/// suggestions. +/// +/// TODO: review implementation. Timestamps are not used and we are probably +/// filtering twice in the prefilter and paragraphs filter +pub async fn suggest( + meta: &NidxMetadata, + index_cache: Arc, + request: SuggestRequest, +) -> anyhow::Result { + let shard_id = uuid::Uuid::parse_str(&request.shard)?; + + // TODO: Avoid querying here, the information can be take from synced metadata + let text_index = Index::find(&meta.pool, shard_id, IndexKind::Text, "text").await?; + let text_searcher_arc = index_cache.get(&text_index.id).await?; + + let paragraph_index = Index::find(&meta.pool, shard_id, IndexKind::Paragraph, "paragraph").await?; + let paragraph_searcher_arc = index_cache.get(¶graph_index.id).await?; + + let relation_index = Index::find(&meta.pool, shard_id, IndexKind::Relation, "relation").await?; + let relation_searcher_arc = index_cache.get(&relation_index.id).await?; + + let suggest_results = tokio::task::spawn_blocking(move || { + blocking_suggest( + request, + text_searcher_arc.as_ref().into(), + paragraph_searcher_arc.as_ref().into(), + relation_searcher_arc.as_ref().into(), + ) + }) + .await??; + + Ok(suggest_results) +} + +fn blocking_suggest( + mut request: SuggestRequest, + text_searcher: &TextSearcher, + paragraph_searcher: &ParagraphSearcher, + relation_searcher: &RelationSearcher, +) -> anyhow::Result { + let mut suggest_paragraphs = request.features.contains(&(SuggestFeatures::Paragraphs as i32)); + let suggest_entities = request.features.contains(&(SuggestFeatures::Entities as i32)); + + let prefixes = split_suggest_query(&request.body, MAX_SUGGEST_COMPOUND_WORDS); + + if let Some(filter) = &mut request.filter { + if !filter.field_labels.is_empty() && suggest_paragraphs { + let labels_formula = if filter.labels_expression.is_empty() { + // Backwards compatibility, take all labels to be AND'ed together + let labels = std::mem::take(&mut filter.field_labels); + let operands = labels.into_iter().map(BooleanExpression::Literal).collect(); + let op = BooleanOperation { + operator: Operator::And, + operands, + }; + + Some(BooleanExpression::Operation(op)) + } else { + // Parse the formula for labels, suggest only supports resource labels + let context = QueryContext { + field_labels: filter.field_labels.iter().cloned().collect(), + paragraph_labels: HashSet::new(), + }; + let analysis = query_language::translate(Some(&filter.labels_expression), None, &context)?; + analysis.labels_prefilter_query + }; + + let prefilter = PreFilterRequest { + timestamp_filters: vec![], + security: None, + labels_formula, + keywords_formula: None, + }; + + let prefiltered = text_searcher.prefilter(&prefilter)?; + + // Apply prefilter to paragraphs query and clear filters + match prefiltered.valid_fields { + ValidFieldCollector::All => {} + ValidFieldCollector::Some(keys) => { + request.key_filters = keys.iter().map(|v| format!("{}{}", v.resource_id, v.field_id)).collect() + } + ValidFieldCollector::None => suggest_paragraphs = false, + } + filter.labels_expression.clear(); + filter.field_labels.clear(); + } + } + + let paragraph_task = suggest_paragraphs.then_some(move || paragraph_searcher.suggest(&request)); + + let relation_task = suggest_entities.then_some(move || relation_searcher.suggest(prefixes)); + + let mut rparagraph = None; + let mut rrelation = None; + + std::thread::scope(|scope| { + if let Some(task) = paragraph_task { + let rparagraph = &mut rparagraph; + scope.spawn(move || *rparagraph = Some(task())); + } + + if let Some(task) = relation_task { + let rrelation = &mut rrelation; + scope.spawn(move || *rrelation = Some(task())); + } + }); + + // Build suggest response from paragraph and relation results + + let mut response = SuggestResponse::default(); + + if let Some(paragraph_response) = rparagraph { + let paragraph_response = paragraph_response?; + response.query = paragraph_response.query; + response.total = paragraph_response.total; + response.results = paragraph_response.results; + response.ematches = paragraph_response.ematches; + } + + if let Some(entities) = rrelation { + response.entity_results = Some(RelationPrefixSearchResponse { + nodes: entities, + }); + } + + Ok(response) +} + +/// Given a query, return a list of derived queries using word(s) from the end +/// of the original query. +/// +/// The longer query, i.e., the one with more words, will come first. That's the +/// one with more probability to get a meaningful suggestion. +/// +/// `max_group` defines the limit of words a query can have. +fn split_suggest_query(query: &str, max_group: usize) -> Vec { + // Paying the price of allocating the vector to not have to + // prepend to the partial strings. + let relevant_words: Vec<_> = query.split(' ').rev().take(max_group).collect(); + let mut prefixes = vec![String::new(); max_group]; + for (index, word) in relevant_words.into_iter().rev().enumerate() { + // The inner loop is upper-bounded by max_group + for prefix in prefixes.iter_mut().take(index + 1) { + if !prefix.is_empty() { + prefix.push(' '); + } + prefix.push_str(word); + } + } + prefixes +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_suggest_split() { + let query = "what are the best use cases for Apache Cassandra".to_string(); + + let expected = vec!["for Apache Cassandra", "Apache Cassandra", "Cassandra"]; + let got = split_suggest_query(&query, 3); + assert_eq!(expected, got); + + let expected = vec!["Apache Cassandra", "Cassandra"]; + let got = split_suggest_query(&query, 2); + assert_eq!(expected, got); + } +}