
Commit

Improve keyword filtering to use the same tokenizer as when indexing (#…
lferran authored Sep 12, 2024
1 parent 7d7a05a commit 0ac13b9
Showing 3 changed files with 20 additions and 32 deletions.
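
In a nutshell: tantivy analyzes TEXT fields at indexing time with its `default` analyzer, which splits on non-alphanumeric characters and lowercases, while the old query-side code split keyword filters on whitespace only, so a filter like `SKU-123:4` could never match an indexed document. A minimal sketch of the mismatch (illustration only, not code from this commit):

```rust
use tantivy::tokenizer::{TokenStream, TokenizerManager};

fn main() {
    // Query-time behavior before this commit: split on whitespace, lowercase.
    let before: Vec<String> = "SKU-123:4"
        .split_whitespace()
        .map(str::to_lowercase)
        .collect();
    assert_eq!(before, vec!["sku-123:4"]); // punctuation survives: one term

    // Indexing-time behavior (and query-time behavior after this commit):
    // tantivy's "default" analyzer, i.e. SimpleTokenizer + RemoveLongFilter(40)
    // + LowerCaser.
    let mut analyzer = TokenizerManager::default().get("default").unwrap();
    let mut stream = analyzer.token_stream("SKU-123:4");
    let mut after = Vec::new();
    while let Some(token) = stream.next() {
        after.push(token.text.clone());
    }
    assert_eq!(after, vec!["sku", "123", "4"]); // same terms as in the index
}
```

This is also why the alphanumeric-only validation removed below is no longer needed: punctuation in a keyword filter is simply tokenized away instead of being rejected.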
11 changes: 0 additions & 11 deletions nucliadb/src/nucliadb/search/search/query.py
@@ -150,8 +150,6 @@ def __init__(
         if len(self.label_filters) > 0:
             self.label_filters = translate_label_filters(self.label_filters)
             self.flat_label_filters = flatten_filter_literals(self.label_filters)
-        if len(self.keyword_filters) > 0:
-            validate_keyword_filters(self.keyword_filters)
         self.max_tokens = max_tokens
 
     @property
@@ -803,12 +801,3 @@ async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optio
         matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
 
     return matryoshka_dimension
-
-
-def validate_keyword_filters(keyword_filters: dict[str, Any]):
-    for literal in flatten_filter_literals(keyword_filters):
-        if not literal.replace(" ", "").isalnum():
-            raise InvalidQueryError(
-                "keyword_filters",
-                "Only alphanumeric strings with spaces are allowed in keyword filters",
-            )
29 changes: 11 additions & 18 deletions nucliadb/tests/nucliadb/integration/test_find.py
@@ -388,7 +388,7 @@ async def test_find_keyword_filters(
         f"/kb/{kbid}/resources",
         json={
             "title": "Friedrich Nietzsche. Beyond Good and Evil",
-            "summary": "The book is a treatise on the nature of morality and ethics. It was written by Friedrich Nietzsche.",
+            "summary": "[SKU-123:4] The book is a treatise on the nature of morality and ethics. It was written by Friedrich Nietzsche.",
             "icon": "text/plain",
         },
     )
@@ -399,7 +399,7 @@
         f"/kb/{kbid}/resources",
         json={
             "title": "Immanuel Kant. Critique of Pure Reason",
-            "summary": "The book is a treatise on metaphysics. It was written by Immanuel Kant.",
+            "summary": "[SKU-567:8] The book is a treatise on metaphysics. It was written by Immanuel Kant.",
             "icon": "text/plain",
         },
     )
@@ -440,6 +440,15 @@
             ],
             [nietzsche_rid, kant_rid],
         ),
+        # Searching with ids that contain punctuation characters should work
+        (
+            ["SKU-123:4"],
+            [nietzsche_rid],
+        ),
+        (
+            ["SKU-567:8"],
+            [kant_rid],
+        ),
         # Negative tests (no results expected)
         (["Focault"], []),  # Keyword not present
         (["Nietz"], []),  # Partial matches
@@ -462,19 +471,3 @@
         assert (
             rid in body["resources"]
         ), f"Keyword filters: {keyword_filters}, expected rids: {expected_rids}"
-
-
-async def test_find_keyword_filters_validation(nucliadb_reader: AsyncClient):
-    for invalid_character in ".,:)([]{}-_^%&$#":
-        resp = await nucliadb_reader.post(
-            "/kb/kbid/find",
-            json={
-                "query": "treatise",
-                "keyword_filters": [f"Foo{invalid_character}Bar"],
-            },
-        )
-        assert resp.status_code == 412
-        assert (
-            "Only alphanumeric strings with spaces are allowed in keyword filters"
-            in resp.json()["detail"]
-        )
12 changes: 9 additions & 3 deletions nucliadb_texts2/src/query_io.rs
@@ -21,6 +21,7 @@ use crate::schema::TextSchema;
 use nucliadb_core::query_language::{BooleanExpression, BooleanOperation, Operator};
 use tantivy::query::{AllQuery, BooleanQuery, Occur, PhraseQuery, Query, TermQuery};
 use tantivy::schema::{Facet, IndexRecordOption};
+use tantivy::tokenizer::TokenizerManager;
 use tantivy::Term;
 
 fn translate_label_to_facet_query(literal: &str, schema: &TextSchema) -> Box<dyn Query> {
@@ -30,9 +31,14 @@ fn translate_label_to_facet_query(literal: &str, schema: &TextSchema) -> Box<dyn
 }
 
 fn translate_keyword_to_text_query(literal: &str, schema: &TextSchema) -> Box<dyn Query> {
-    let terms: Vec<Term> =
-        literal.split_whitespace().map(|w| Term::from_field_text(schema.text, &w.to_lowercase())).collect();
-
+    // Tokenize the literal in the same way we tokenize the text field at indexing time
+    let tokenizer = TokenizerManager::default().get("default").unwrap();
+    let mut token_stream = tokenizer.token_stream(literal);
+    let mut terms = Vec::new();
+    while let Some(token) = token_stream.next() {
+        terms.push(Term::from_field_text(schema.text, &token.text));
+    }
+    // Create a query using the tokenized terms
     if terms.len() == 1 {
         Box::new(TermQuery::new(terms[0].clone(), IndexRecordOption::Basic))
     } else {
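
The hunk is truncated before the multi-term branch, but given that `PhraseQuery` is imported at the top of the file, a plausible self-contained reconstruction of the function reads as follows; the phrase-query else branch and the match-all fallback for an empty token list are assumptions, not the committed code:

```rust
use tantivy::query::{AllQuery, PhraseQuery, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::tokenizer::{TokenStream, TokenizerManager};
use tantivy::Term;

fn keyword_to_text_query(literal: &str, text_field: Field) -> Box<dyn Query> {
    // Run the same analyzer chain that tantivy applies to TEXT fields at
    // indexing time, so query terms line up with indexed terms.
    let mut analyzer = TokenizerManager::default().get("default").unwrap();
    let mut stream = analyzer.token_stream(literal);
    let mut terms = Vec::new();
    while let Some(token) = stream.next() {
        terms.push(Term::from_field_text(text_field, &token.text));
    }
    match terms.len() {
        // Nothing survived tokenization (e.g. only punctuation): a match-all
        // fallback is an assumption, not taken from this commit.
        0 => Box::new(AllQuery),
        // A single token: a plain term query suffices.
        1 => Box::new(TermQuery::new(terms[0].clone(), IndexRecordOption::Basic)),
        // Several tokens: match them as an ordered phrase; this requires the
        // field to be indexed with positions.
        _ => Box::new(PhraseQuery::new(terms)),
    }
}
```

Under this reading, the filter `SKU-123:4` becomes the ordered phrase `sku 123 4`, which matches the bracketed ids added to the test summaries above.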
