-
Notifications
You must be signed in to change notification settings - Fork 51
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
215 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
// Copyright (C) 2021 Bosutech XXI S.L. | ||
// | ||
// nucliadb is offered under the AGPL v3.0 and as commercial software. | ||
// For commercial licensing, contact us at info@nuclia.com. | ||
// | ||
// AGPL: | ||
// This program is free software: you can redistribute it and/or modify | ||
// it under the terms of the GNU Affero General Public License as | ||
// published by the Free Software Foundation, either version 3 of the | ||
// License, or (at your option) any later version. | ||
// | ||
// This program is distributed in the hope that it will be useful, | ||
// but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
// GNU Affero General Public License for more details. | ||
// | ||
// You should have received a copy of the GNU Affero General Public License | ||
// along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
// | ||
|
||
use std::{collections::HashSet, sync::Arc}; | ||
|
||
use nidx_paragraph::ParagraphSearcher; | ||
use nidx_protos::{RelationPrefixSearchResponse, SuggestFeatures, SuggestRequest, SuggestResponse}; | ||
use nidx_relation::RelationSearcher; | ||
use nidx_text::{ | ||
prefilter::{PreFilterRequest, ValidFieldCollector}, | ||
TextSearcher, | ||
}; | ||
use nidx_types::query_language::{BooleanExpression, BooleanOperation, Operator, QueryContext}; | ||
|
||
use crate::{ | ||
metadata::{Index, IndexKind}, | ||
NidxMetadata, | ||
}; | ||
|
||
use super::{index_cache::IndexCache, query_language}; | ||
|
||
/// Upper bound on how many trailing words of a suggest query are combined
/// into a single prefix. Multi-word prefixes help with compounds that carry
/// meaning together (like a name followed by a surname), at the risk of
/// dragging irrelevant words into the query.
const MAX_SUGGEST_COMPOUND_WORDS: usize = 3;
|
||
/// Suggest gives possible strings to autocomplete a partial query that's been | ||
/// written. To do so, it searches keyword and relation indexes to find good | ||
/// suggestions. | ||
/// | ||
/// TODO: review implementation. Timestamps are not used and we are probably | ||
/// filtering twice in the prefilter and paragraphs filter | ||
pub async fn suggest( | ||
meta: &NidxMetadata, | ||
index_cache: Arc<IndexCache>, | ||
request: SuggestRequest, | ||
) -> anyhow::Result<SuggestResponse> { | ||
let shard_id = uuid::Uuid::parse_str(&request.shard)?; | ||
|
||
// TODO: Avoid querying here, the information can be take from synced metadata | ||
let text_index = Index::find(&meta.pool, shard_id, IndexKind::Text, "text").await?; | ||
let text_searcher_arc = index_cache.get(&text_index.id).await?; | ||
|
||
let paragraph_index = Index::find(&meta.pool, shard_id, IndexKind::Paragraph, "paragraph").await?; | ||
let paragraph_searcher_arc = index_cache.get(¶graph_index.id).await?; | ||
|
||
let relation_index = Index::find(&meta.pool, shard_id, IndexKind::Relation, "relation").await?; | ||
let relation_searcher_arc = index_cache.get(&relation_index.id).await?; | ||
|
||
let suggest_results = tokio::task::spawn_blocking(move || { | ||
blocking_suggest( | ||
request, | ||
text_searcher_arc.as_ref().into(), | ||
paragraph_searcher_arc.as_ref().into(), | ||
relation_searcher_arc.as_ref().into(), | ||
) | ||
}) | ||
.await??; | ||
|
||
Ok(suggest_results) | ||
} | ||
|
||
fn blocking_suggest( | ||
mut request: SuggestRequest, | ||
text_searcher: &TextSearcher, | ||
paragraph_searcher: &ParagraphSearcher, | ||
relation_searcher: &RelationSearcher, | ||
) -> anyhow::Result<SuggestResponse> { | ||
let mut suggest_paragraphs = request.features.contains(&(SuggestFeatures::Paragraphs as i32)); | ||
let suggest_entities = request.features.contains(&(SuggestFeatures::Entities as i32)); | ||
|
||
let prefixes = split_suggest_query(&request.body, MAX_SUGGEST_COMPOUND_WORDS); | ||
|
||
if let Some(filter) = &mut request.filter { | ||
if !filter.field_labels.is_empty() && suggest_paragraphs { | ||
let labels_formula = if filter.labels_expression.is_empty() { | ||
// Backwards compatibility, take all labels to be AND'ed together | ||
let labels = std::mem::take(&mut filter.field_labels); | ||
let operands = labels.into_iter().map(BooleanExpression::Literal).collect(); | ||
let op = BooleanOperation { | ||
operator: Operator::And, | ||
operands, | ||
}; | ||
|
||
Some(BooleanExpression::Operation(op)) | ||
} else { | ||
// Parse the formula for labels, suggest only supports resource labels | ||
let context = QueryContext { | ||
field_labels: filter.field_labels.iter().cloned().collect(), | ||
paragraph_labels: HashSet::new(), | ||
}; | ||
let analysis = query_language::translate(Some(&filter.labels_expression), None, &context)?; | ||
analysis.labels_prefilter_query | ||
}; | ||
|
||
let prefilter = PreFilterRequest { | ||
timestamp_filters: vec![], | ||
security: None, | ||
labels_formula, | ||
keywords_formula: None, | ||
}; | ||
|
||
let prefiltered = text_searcher.prefilter(&prefilter)?; | ||
|
||
// Apply prefilter to paragraphs query and clear filters | ||
match prefiltered.valid_fields { | ||
ValidFieldCollector::All => {} | ||
ValidFieldCollector::Some(keys) => { | ||
request.key_filters = keys.iter().map(|v| format!("{}{}", v.resource_id, v.field_id)).collect() | ||
} | ||
ValidFieldCollector::None => suggest_paragraphs = false, | ||
} | ||
filter.labels_expression.clear(); | ||
filter.field_labels.clear(); | ||
} | ||
} | ||
|
||
let paragraph_task = suggest_paragraphs.then_some(move || paragraph_searcher.suggest(&request)); | ||
|
||
let relation_task = suggest_entities.then_some(move || relation_searcher.suggest(prefixes)); | ||
|
||
let mut rparagraph = None; | ||
let mut rrelation = None; | ||
|
||
std::thread::scope(|scope| { | ||
if let Some(task) = paragraph_task { | ||
let rparagraph = &mut rparagraph; | ||
scope.spawn(move || *rparagraph = Some(task())); | ||
} | ||
|
||
if let Some(task) = relation_task { | ||
let rrelation = &mut rrelation; | ||
scope.spawn(move || *rrelation = Some(task())); | ||
} | ||
}); | ||
|
||
// Build suggest response from paragraph and relation results | ||
|
||
let mut response = SuggestResponse::default(); | ||
|
||
if let Some(paragraph_response) = rparagraph { | ||
let paragraph_response = paragraph_response?; | ||
response.query = paragraph_response.query; | ||
response.total = paragraph_response.total; | ||
response.results = paragraph_response.results; | ||
response.ematches = paragraph_response.ematches; | ||
} | ||
|
||
if let Some(entities) = rrelation { | ||
response.entity_results = Some(RelationPrefixSearchResponse { | ||
nodes: entities, | ||
}); | ||
} | ||
|
||
Ok(response) | ||
} | ||
|
||
/// Given a query, return a list of derived queries using word(s) from the end
/// of the original query.
///
/// Words are separated by any run of whitespace. At most `max_group` trailing
/// words are considered, and one derived query is returned for every suffix of
/// those words. The result therefore has `min(word count, max_group)` entries
/// and never contains empty strings. An empty or blank query yields an empty
/// list.
///
/// The longer query, i.e., the one with more words, will come first. That's the
/// one with more probability to get a meaningful suggestion.
///
/// `max_group` defines the limit of words a query can have.
fn split_suggest_query(query: &str, max_group: usize) -> Vec<String> {
    // Walk the query backwards so only the last `max_group` words are ever
    // collected, no matter how long the query is.
    let mut tail: Vec<&str> = query.split_whitespace().rev().take(max_group).collect();
    tail.reverse();

    // One derived query per suffix of the trailing words, longest first.
    (0..tail.len()).map(|start| tail[start..].join(" ")).collect()
}
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// The trailing words of the query become suffix queries, longest first,
    /// capped at `max_group` words.
    #[test]
    fn test_suggest_split() {
        let query = String::from("what are the best use cases for Apache Cassandra");

        let cases = [
            (3, vec!["for Apache Cassandra", "Apache Cassandra", "Cassandra"]),
            (2, vec!["Apache Cassandra", "Cassandra"]),
        ];

        for (max_group, expected) in cases {
            let got = split_suggest_query(&query, max_group);
            assert_eq!(expected, got);
        }
    }
}