Missing shard_suggest module
jotare committed Nov 11, 2024
1 parent ce71e01 commit df34659
Showing 1 changed file with 215 additions and 0 deletions.
215 changes: 215 additions & 0 deletions nidx/src/searcher/shard_suggest.rs
@@ -0,0 +1,215 @@
// Copyright (C) 2021 Bosutech XXI S.L.
//
// nucliadb is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at [email protected].
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//

use std::{collections::HashSet, sync::Arc};

use nidx_paragraph::ParagraphSearcher;
use nidx_protos::{RelationPrefixSearchResponse, SuggestFeatures, SuggestRequest, SuggestResponse};
use nidx_relation::RelationSearcher;
use nidx_text::{
    prefilter::{PreFilterRequest, ValidFieldCollector},
    TextSearcher,
};
use nidx_types::query_language::{BooleanExpression, BooleanOperation, Operator, QueryContext};

use crate::{
    metadata::{Index, IndexKind},
    NidxMetadata,
};

use super::{index_cache::IndexCache, query_language};

/// Max number of words accepted as a suggest query. Multi-word suggestions are
/// useful for compounds with semantic meaning (like a name and a surname), but
/// too many words can add irrelevant terms to the query.
const MAX_SUGGEST_COMPOUND_WORDS: usize = 3;

/// Suggest returns candidate strings to autocomplete a partially written
/// query. To do so, it searches the keyword and relation indexes for good
/// suggestions.
///
/// TODO: review implementation. Timestamps are not used and we are probably
/// filtering twice, in the prefilter and in the paragraphs filter.
pub async fn suggest(
    meta: &NidxMetadata,
    index_cache: Arc<IndexCache>,
    request: SuggestRequest,
) -> anyhow::Result<SuggestResponse> {
    let shard_id = uuid::Uuid::parse_str(&request.shard)?;

    // TODO: Avoid querying here, the information can be taken from synced metadata
    let text_index = Index::find(&meta.pool, shard_id, IndexKind::Text, "text").await?;
    let text_searcher_arc = index_cache.get(&text_index.id).await?;

    let paragraph_index = Index::find(&meta.pool, shard_id, IndexKind::Paragraph, "paragraph").await?;
    let paragraph_searcher_arc = index_cache.get(&paragraph_index.id).await?;

    let relation_index = Index::find(&meta.pool, shard_id, IndexKind::Relation, "relation").await?;
    let relation_searcher_arc = index_cache.get(&relation_index.id).await?;

    let suggest_results = tokio::task::spawn_blocking(move || {
        blocking_suggest(
            request,
            text_searcher_arc.as_ref().into(),
            paragraph_searcher_arc.as_ref().into(),
            relation_searcher_arc.as_ref().into(),
        )
    })
    .await??;

    Ok(suggest_results)
}

fn blocking_suggest(
    mut request: SuggestRequest,
    text_searcher: &TextSearcher,
    paragraph_searcher: &ParagraphSearcher,
    relation_searcher: &RelationSearcher,
) -> anyhow::Result<SuggestResponse> {
    let mut suggest_paragraphs = request.features.contains(&(SuggestFeatures::Paragraphs as i32));
    let suggest_entities = request.features.contains(&(SuggestFeatures::Entities as i32));

    let prefixes = split_suggest_query(&request.body, MAX_SUGGEST_COMPOUND_WORDS);

    if let Some(filter) = &mut request.filter {
        if !filter.field_labels.is_empty() && suggest_paragraphs {
            let labels_formula = if filter.labels_expression.is_empty() {
                // Backwards compatibility: take all labels to be AND'ed together
                let labels = std::mem::take(&mut filter.field_labels);
                let operands = labels.into_iter().map(BooleanExpression::Literal).collect();
                let op = BooleanOperation {
                    operator: Operator::And,
                    operands,
                };

                Some(BooleanExpression::Operation(op))
            } else {
                // Parse the formula for labels; suggest only supports resource labels
                let context = QueryContext {
                    field_labels: filter.field_labels.iter().cloned().collect(),
                    paragraph_labels: HashSet::new(),
                };
                let analysis = query_language::translate(Some(&filter.labels_expression), None, &context)?;
                analysis.labels_prefilter_query
            };

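            // Only labels are prefiltered at suggest time; timestamp and keyword
            // filters are left empty (see the TODO above)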
            let prefilter = PreFilterRequest {
                timestamp_filters: vec![],
                security: None,
                labels_formula,
                keywords_formula: None,
            };

            let prefiltered = text_searcher.prefilter(&prefilter)?;

            // Apply prefilter to paragraphs query and clear filters
            match prefiltered.valid_fields {
                ValidFieldCollector::All => {}
                ValidFieldCollector::Some(keys) => {
                    request.key_filters = keys.iter().map(|v| format!("{}{}", v.resource_id, v.field_id)).collect()
                }
                ValidFieldCollector::None => suggest_paragraphs = false,
            }
            filter.labels_expression.clear();
            filter.field_labels.clear();
        }
    }

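    // Only build the suggest closures for the features requested in the query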
    let paragraph_task = suggest_paragraphs.then_some(move || paragraph_searcher.suggest(&request));

    let relation_task = suggest_entities.then_some(move || relation_searcher.suggest(prefixes));

    let mut rparagraph = None;
    let mut rrelation = None;

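    // Run the pending suggest tasks concurrently in scoped threads, collecting
    // each result into its Option slot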
    std::thread::scope(|scope| {
        if let Some(task) = paragraph_task {
            let rparagraph = &mut rparagraph;
            scope.spawn(move || *rparagraph = Some(task()));
        }

        if let Some(task) = relation_task {
            let rrelation = &mut rrelation;
            scope.spawn(move || *rrelation = Some(task()));
        }
    });

    // Build suggest response from paragraph and relation results

    let mut response = SuggestResponse::default();

    if let Some(paragraph_response) = rparagraph {
        let paragraph_response = paragraph_response?;
        response.query = paragraph_response.query;
        response.total = paragraph_response.total;
        response.results = paragraph_response.results;
        response.ematches = paragraph_response.ematches;
    }

    if let Some(entities) = rrelation {
        response.entity_results = Some(RelationPrefixSearchResponse {
            nodes: entities,
        });
    }

    Ok(response)
}

/// Given a query, return a list of derived queries using word(s) from the end
/// of the original query.
///
/// The longer query, i.e., the one with more words, will come first. That's the
/// one with more probability to get a meaningful suggestion.
///
/// `max_group` defines the limit of words a query can have.
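///
/// For example, with `max_group = 3`, an illustrative query such as
/// `"show me the logs"` yields `["me the logs", "the logs", "logs"]`.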
fn split_suggest_query(query: &str, max_group: usize) -> Vec<String> {
    // Paying the price of allocating the vector to not have to
    // prepend to the partial strings.
    let relevant_words: Vec<_> = query.split(' ').rev().take(max_group).collect();
    let mut prefixes = vec![String::new(); max_group];
    for (index, word) in relevant_words.into_iter().rev().enumerate() {
        // The inner loop is upper-bounded by max_group
        for prefix in prefixes.iter_mut().take(index + 1) {
            if !prefix.is_empty() {
                prefix.push(' ');
            }
            prefix.push_str(word);
        }
    }
    prefixes
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_suggest_split() {
        let query = "what are the best use cases for Apache Cassandra".to_string();

        let expected = vec!["for Apache Cassandra", "Apache Cassandra", "Cassandra"];
        let got = split_suggest_query(&query, 3);
        assert_eq!(expected, got);

        let expected = vec!["Apache Cassandra", "Cassandra"];
        let got = split_suggest_query(&query, 2);
        assert_eq!(expected, got);
    }
}
