
Commit

Improve keyword filtering to use the same tokenizer as when indexing (#…
lferran authored Sep 12, 2024
1 parent 7d7a05a commit 0ac13b9
Showing 3 changed files with 20 additions and 32 deletions.
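
In a nutshell: tantivy analyzes TEXT fields at indexing time with its `default` analyzer, which splits on non-alphanumeric characters and lowercases, while the old query-side code split keyword filters on whitespace only, so a filter like `SKU-123:4` could never match an indexed document. A minimal sketch of the mismatch (illustration only, not code from this commit):

```rust
use tantivy::tokenizer::{TokenStream, TokenizerManager};

fn main() {
    // Query-time behavior before this commit: split on whitespace, lowercase.
    let before: Vec<String> = "SKU-123:4"
        .split_whitespace()
        .map(str::to_lowercase)
        .collect();
    assert_eq!(before, vec!["sku-123:4"]); // punctuation survives: one term

    // Indexing-time behavior (and query-time behavior after this commit):
    // tantivy's "default" analyzer, i.e. SimpleTokenizer + RemoveLongFilter(40)
    // + LowerCaser.
    let mut analyzer = TokenizerManager::default().get("default").unwrap();
    let mut stream = analyzer.token_stream("SKU-123:4");
    let mut after = Vec::new();
    while let Some(token) = stream.next() {
        after.push(token.text.clone());
    }
    assert_eq!(after, vec!["sku", "123", "4"]); // same terms as in the index
}
```

This is also why the alphanumeric-only validation removed below is no longer needed: punctuation in a keyword filter is simply tokenized away instead of being rejected.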
11 changes: 0 additions & 11 deletions nucliadb/src/nucliadb/search/search/query.py
@@ -150,8 +150,6 @@ def __init__(
         if len(self.label_filters) > 0:
             self.label_filters = translate_label_filters(self.label_filters)
             self.flat_label_filters = flatten_filter_literals(self.label_filters)
-        if len(self.keyword_filters) > 0:
-            validate_keyword_filters(self.keyword_filters)
         self.max_tokens = max_tokens
 
     @property
@@ -803,12 +801,3 @@ async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optio
         matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
 
     return matryoshka_dimension
-
-
-def validate_keyword_filters(keyword_filters: dict[str, Any]):
-    for literal in flatten_filter_literals(keyword_filters):
-        if not literal.replace(" ", "").isalnum():
-            raise InvalidQueryError(
-                "keyword_filters",
-                "Only alphanumeric strings with spaces are allowed in keyword filters",
-            )
29 changes: 11 additions & 18 deletions nucliadb/tests/nucliadb/integration/test_find.py
@@ -388,7 +388,7 @@ async def test_find_keyword_filters(
         f"/kb/{kbid}/resources",
         json={
             "title": "Friedrich Nietzsche. Beyond Good and Evil",
-            "summary": "The book is a treatise on the nature of morality and ethics. It was written by Friedrich Nietzsche.",
+            "summary": "[SKU-123:4] The book is a treatise on the nature of morality and ethics. It was written by Friedrich Nietzsche.",
             "icon": "text/plain",
         },
     )
@@ -399,7 +399,7 @@
         f"/kb/{kbid}/resources",
         json={
             "title": "Immanuel Kant. Critique of Pure Reason",
-            "summary": "The book is a treatise on metaphysics. It was written by Immanuel Kant.",
+            "summary": "[SKU-567:8] The book is a treatise on metaphysics. It was written by Immanuel Kant.",
             "icon": "text/plain",
         },
     )
@@ -440,6 +440,15 @@
             ],
             [nietzsche_rid, kant_rid],
         ),
+        # Searching with ids that contain punctuation characters should work
+        (
+            ["SKU-123:4"],
+            [nietzsche_rid],
+        ),
+        (
+            ["SKU-567:8"],
+            [kant_rid],
+        ),
         # Negative tests (no results expected)
         (["Focault"], []),  # Keyword not present
         (["Nietz"], []),  # Partial matches
@@ -462,19 +471,3 @@
         assert (
             rid in body["resources"]
         ), f"Keyword filters: {keyword_filters}, expected rids: {expected_rids}"
-
-
-async def test_find_keyword_filters_validation(nucliadb_reader: AsyncClient):
-    for invalid_character in ".,:)([]{}-_^%&$#":
-        resp = await nucliadb_reader.post(
-            "/kb/kbid/find",
-            json={
-                "query": "treatise",
-                "keyword_filters": [f"Foo{invalid_character}Bar"],
-            },
-        )
-        assert resp.status_code == 412
-        assert (
-            "Only alphanumeric strings with spaces are allowed in keyword filters"
-            in resp.json()["detail"]
-        )
12 changes: 9 additions & 3 deletions nucliadb_texts2/src/query_io.rs
@@ -21,6 +21,7 @@ use crate::schema::TextSchema;
 use nucliadb_core::query_language::{BooleanExpression, BooleanOperation, Operator};
 use tantivy::query::{AllQuery, BooleanQuery, Occur, PhraseQuery, Query, TermQuery};
 use tantivy::schema::{Facet, IndexRecordOption};
+use tantivy::tokenizer::TokenizerManager;
 use tantivy::Term;
 
 fn translate_label_to_facet_query(literal: &str, schema: &TextSchema) -> Box<dyn Query> {
@@ -30,9 +31,14 @@ fn translate_label_to_facet_query(literal: &str, schema: &TextSchema) -> Box<dyn
 }
 
 fn translate_keyword_to_text_query(literal: &str, schema: &TextSchema) -> Box<dyn Query> {
-    let terms: Vec<Term> =
-        literal.split_whitespace().map(|w| Term::from_field_text(schema.text, &w.to_lowercase())).collect();
-
+    // Tokenize the literal in the same way we tokenize the text field at indexing time
+    let tokenizer = TokenizerManager::default().get("default").unwrap();
+    let mut token_stream = tokenizer.token_stream(literal);
+    let mut terms = Vec::new();
+    while let Some(token) = token_stream.next() {
+        terms.push(Term::from_field_text(schema.text, &token.text));
+    }
+    // Create a query using the tokenized terms
     if terms.len() == 1 {
         Box::new(TermQuery::new(terms[0].clone(), IndexRecordOption::Basic))
     } else {
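
The hunk is truncated before the multi-term branch, but given that `PhraseQuery` is imported at the top of the file, a plausible self-contained reconstruction of the function reads as follows; the phrase-query else branch and the match-all fallback for an empty token list are assumptions, not the committed code:

```rust
use tantivy::query::{AllQuery, PhraseQuery, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::tokenizer::{TokenStream, TokenizerManager};
use tantivy::Term;

fn keyword_to_text_query(literal: &str, text_field: Field) -> Box<dyn Query> {
    // Run the same analyzer chain that tantivy applies to TEXT fields at
    // indexing time, so query terms line up with indexed terms.
    let mut analyzer = TokenizerManager::default().get("default").unwrap();
    let mut stream = analyzer.token_stream(literal);
    let mut terms = Vec::new();
    while let Some(token) = stream.next() {
        terms.push(Term::from_field_text(text_field, &token.text));
    }
    match terms.len() {
        // Nothing survived tokenization (e.g. only punctuation): a match-all
        // fallback is an assumption, not taken from this commit.
        0 => Box::new(AllQuery),
        // A single token: a plain term query suffices.
        1 => Box::new(TermQuery::new(terms[0].clone(), IndexRecordOption::Basic)),
        // Several tokens: match them as an ordered phrase; this requires the
        // field to be indexed with positions.
        _ => Box::new(PhraseQuery::new(terms)),
    }
}
```

Under this reading, the filter `SKU-123:4` becomes the ordered phrase `sku 123 4`, which matches the bracketed ids added to the test summaries above.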
