Skip to content

Commit

Permalink
Apply hidden_resources to suggest queries (#2487)
Browse files Browse the repository at this point in the history
  • Loading branch information
javitonino authored Sep 24, 2024
1 parent 135ae0c commit deb6751
Show file tree
Hide file tree
Showing 9 changed files with 155 additions and 17 deletions.
5 changes: 3 additions & 2 deletions nucliadb/src/nucliadb/ingest/orm/brain.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from nucliadb.common import ids
from nucliadb.ingest import logger
from nucliadb.ingest.orm.utils import compute_paragraph_key
from nucliadb_models.labels import BASE_LABELS, flatten_resource_labels
from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
from nucliadb_models.metadata import ResourceProcessingStatus
from nucliadb_protos import utils_pb2
from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
Expand Down Expand Up @@ -509,7 +509,8 @@ def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):

# hidden
if basic.hidden:
self.labels["q"].append("h")
_, p1, p2 = LABEL_HIDDEN.split("/")
self.labels[p1].append(p2)

self.compute_labels()

Expand Down
6 changes: 6 additions & 0 deletions nucliadb/src/nucliadb/search/api/v1/suggest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from nucliadb.search.search.exceptions import InvalidQueryError
from nucliadb.search.search.merge import merge_suggest_results
from nucliadb.search.search.query import suggest_query_to_pb
from nucliadb.search.search.utils import filter_hidden_resources
from nucliadb_models.common import FieldTypeName
from nucliadb_models.resource import NucliaDBRoles
from nucliadb_models.search import (
Expand Down Expand Up @@ -81,6 +82,7 @@ async def suggest_knowledgebox(
x_forwarded_for: str = Header(""),
debug: bool = fastapi_query(SearchParamDefaults.debug),
highlight: bool = fastapi_query(SearchParamDefaults.highlight),
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden, include_in_schema=False),
) -> Union[KnowledgeboxSuggestResults, HTTPClientError]:
try:
return await suggest(
Expand All @@ -102,6 +104,7 @@ async def suggest_knowledgebox(
x_forwarded_for,
debug,
highlight,
show_hidden,
)
except InvalidQueryError as exc:
return HTTPClientError(status_code=412, detail=str(exc))
Expand All @@ -126,8 +129,10 @@ async def suggest(
x_forwarded_for: str,
debug: bool,
highlight: bool,
show_hidden: bool,
) -> KnowledgeboxSuggestResults:
with cache.request_caches():
hidden = await filter_hidden_resources(kbid, show_hidden)
pb_query = suggest_query_to_pb(
features,
query,
Expand All @@ -138,6 +143,7 @@ async def suggest(
range_creation_end,
range_modification_start,
range_modification_end,
hidden,
)
results, incomplete_results, queried_nodes = await node_query(kbid, Method.SUGGEST, pb_query)

Expand Down
22 changes: 17 additions & 5 deletions nucliadb/src/nucliadb/search/search/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
)
from nucliadb.search.utilities import get_predict
from nucliadb_models.internal.predict import QueryInfo
from nucliadb_models.labels import translate_system_to_alias_label
from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
from nucliadb_models.metadata import ResourceProcessingStatus
from nucliadb_models.search import (
Filter,
Expand Down Expand Up @@ -128,9 +128,9 @@ def __init__(
self.hidden = hidden
if self.hidden is not None:
if self.hidden:
label_filters.append(Filter(all=["/q/h"])) # type: ignore
label_filters.append(Filter(all=[LABEL_HIDDEN])) # type: ignore
else:
label_filters.append(Filter(none=["/q/h"])) # type: ignore
label_filters.append(Filter(none=[LABEL_HIDDEN])) # type: ignore

self.label_filters: dict[str, Any] = convert_to_node_filters(label_filters)
self.flat_label_filters: list[str] = []
Expand Down Expand Up @@ -709,6 +709,7 @@ def suggest_query_to_pb(
range_creation_end: Optional[datetime] = None,
range_modification_start: Optional[datetime] = None,
range_modification_end: Optional[datetime] = None,
hidden: Optional[bool] = None,
) -> nodereader_pb2.SuggestRequest:
request = nodereader_pb2.SuggestRequest()

Expand All @@ -718,10 +719,21 @@ def suggest_query_to_pb(

if SuggestOptions.PARAGRAPH in features:
request.features.append(nodereader_pb2.SuggestFeatures.PARAGRAPHS)
filters = [translate_label(fltr) for fltr in filters]
request.filter.field_labels.extend(filters)
request.fields.extend(fields)

if hidden is not None:
if hidden:
filters.append(Filter(all=[LABEL_HIDDEN])) # type: ignore
else:
filters.append(Filter(none=[LABEL_HIDDEN])) # type: ignore

expression = convert_to_node_filters(filters)
if expression:
expression = translate_label_filters(expression)

request.filter.field_labels.extend(flatten_filter_literals(expression))
request.filter.labels_expression = json.dumps(expression)

if range_creation_start is not None:
request.timestamps.from_created.FromDatetime(range_creation_start)
if range_creation_end is not None:
Expand Down
2 changes: 1 addition & 1 deletion nucliadb/tests/nucliadb/integration/search/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ async def _test_filtering(nucliadb_reader: AsyncClient, kbid: str, filters):
]

# Check that only the expected paragraphs were returned
assert len(paragraphs) == len(expected_paragraphs)
assert len(paragraphs) == len(expected_paragraphs), f"{filters}\n{paragraphs}\n{expected_paragraphs}"
not_yet_found = expected_paragraphs.copy()
for par in paragraphs:
if par["text"] not in expected_paragraphs:
Expand Down
16 changes: 14 additions & 2 deletions nucliadb/tests/nucliadb/integration/search/test_hidden.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@

from nucliadb.common.context import ApplicationContext
from nucliadb_protos.writer_pb2_grpc import WriterStub
from tests.utils import broker_resource, inject_message
from tests.utils import broker_resource_with_title_paragraph, inject_message


async def create_resource(kbid, nucliadb_grpc):
message = broker_resource(kbid)
message = broker_resource_with_title_paragraph(kbid)
await inject_message(nucliadb_grpc, message)
return message.uuid

Expand All @@ -53,6 +53,10 @@ async def test_hidden_search(
assert resp.status_code == 200
assert resp.json()["resources"].keys() == {r1, r2}

resp = await nucliadb_reader.get(f"/kb/{knowledgebox}/suggest?query=title")
assert resp.status_code == 200
assert set([r["rid"] for r in resp.json()["paragraphs"]["results"]]) == {r1, r2}

# Hide r1
resp = await nucliadb_writer.patch(f"/kb/{knowledgebox}/resource/{r1}", json={"hidden": True})
assert resp.status_code == 200
Expand All @@ -63,11 +67,19 @@ async def test_hidden_search(
assert resp.status_code == 200
assert resp.json()["resources"].keys() == {r2}

resp = await nucliadb_reader.get(f"/kb/{knowledgebox}/suggest?query=title")
assert resp.status_code == 200
assert set([r["rid"] for r in resp.json()["paragraphs"]["results"]]) == {r2}

# Unless show_hidden is passed, then both resources are returned
resp = await nucliadb_reader.get(f"/kb/{knowledgebox}/search?show_hidden=true")
assert resp.status_code == 200
assert resp.json()["resources"].keys() == {r1, r2}

resp = await nucliadb_reader.get(f"/kb/{knowledgebox}/suggest?query=title&show_hidden=true")
assert resp.status_code == 200
assert set([r["rid"] for r in resp.json()["paragraphs"]["results"]]) == {r1, r2}


@pytest.fixture()
async def app_context(natsd, storage, nucliadb):
Expand Down
52 changes: 52 additions & 0 deletions nucliadb/tests/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,58 @@ def broker_resource(kbid: str, rid=None, slug=None, title=None, summary=None) ->
return bm


def broker_resource_with_title_paragraph(
kbid: str, rid=None, slug=None, title=None, summary=None
) -> BrokerMessage:
"""
Returns a broker resource with barebones metadata.
"""
rid = rid or str(uuid.uuid4()).replace("-", "")
slug = slug or f"{rid}slug1"
bm: BrokerMessage = BrokerMessage(
kbid=kbid,
uuid=rid,
slug=slug,
type=BrokerMessage.AUTOCOMMIT,
)
title = title or "Title Resource"
summary = summary or "Summary of document"
bm.basic.icon = "text/plain"
bm.basic.title = title
bm.basic.summary = summary
bm.basic.thumbnail = "doc"
bm.basic.metadata.useful = True
bm.basic.metadata.language = "es"
bm.basic.created.FromDatetime(datetime.now())
bm.basic.modified.FromDatetime(datetime.now())
bm.origin.source = rpb.Origin.Source.WEB

etw = rpb.ExtractedTextWrapper()
etw.body.text = title
etw.field.field = "title"
etw.field.field_type = rpb.FieldType.GENERIC
bm.extracted_text.append(etw)

fcm = rpb.FieldComputedMetadataWrapper()
fcm.field.field = "title"
fcm.field.field_type = rpb.FieldType.GENERIC
p1 = rpb.Paragraph(
start=0,
end=5,
)
fcm.metadata.metadata.paragraphs.append(p1)
bm.field_metadata.append(fcm)

etw = rpb.ExtractedTextWrapper()
etw.body.text = summary
etw.field.field = "summary"
etw.field.field_type = rpb.FieldType.GENERIC
bm.extracted_text.append(etw)

bm.source = BrokerMessage.MessageSource.WRITER
return bm


async def inject_message(writer: WriterStub, message: BrokerMessage):
resp = await writer.ProcessMessage([message]) # type: ignore
assert resp.status == OpStatusWriter.Status.OK
2 changes: 2 additions & 0 deletions nucliadb_models/src/nucliadb_models/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@

LABEL_QUERY_ALIASES_REVERSED = {v: k for k, v in LABEL_QUERY_ALIASES.items()}

LABEL_HIDDEN = "/q/h"


def translate_alias_to_system_label(label: str) -> str:
parts = label.split("/")
Expand Down
31 changes: 24 additions & 7 deletions nucliadb_node/src/shards/shard_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ use nucliadb_core::protos::{
ShardFile, ShardFileChunk, ShardFileList, StreamRequest, SuggestFeatures, SuggestRequest, SuggestResponse,
VectorSearchRequest, VectorSearchResponse,
};
use nucliadb_core::query_language;
use nucliadb_core::query_language::BooleanExpression;
use nucliadb_core::query_language::BooleanOperation;
use nucliadb_core::query_language::Operator;
Expand Down Expand Up @@ -372,29 +373,45 @@ impl ShardReader {
// Prefilter to apply field label filters
if let Some(filter) = &mut request.filter {
if !filter.field_labels.is_empty() && suggest_paragraphs {
let labels = std::mem::take(&mut filter.field_labels);
let operands = labels.into_iter().map(BooleanExpression::Literal).collect();
let op = BooleanOperation {
operator: Operator::And,
operands,
let labels_formula = if filter.labels_expression.is_empty() {
// Backwards compatibility, take all labels to be AND'ed together
let labels = std::mem::take(&mut filter.field_labels);
let operands = labels.into_iter().map(BooleanExpression::Literal).collect();
let op = BooleanOperation {
operator: Operator::And,
operands,
};

Some(BooleanExpression::Operation(op))
} else {
// Parse the formula for labels, suggest only supports resource labels
let context = query_language::QueryContext {
field_labels: filter.field_labels.iter().cloned().collect(),
paragraph_labels: HashSet::new(),
};
let analysis = query_language::translate(Some(&filter.labels_expression), None, &context)?;
analysis.labels_prefilter_query
};

let prefilter = PreFilterRequest {
timestamp_filters: vec![],
security: None,
labels_formula: Some(BooleanExpression::Operation(op)),
labels_formula,
keywords_formula: None,
};

let prefiltered = read_rw_lock(&self.text_reader).prefilter(&prefilter)?;

// Apply prefilter to paragraphs query
// Apply prefilter to paragraphs query and clear filters
match prefiltered.valid_fields {
query_planner::ValidFieldCollector::All => {}
query_planner::ValidFieldCollector::Some(keys) => {
request.key_filters = keys.iter().map(|v| format!("{}{}", v.resource_id, v.field_id)).collect()
}
query_planner::ValidFieldCollector::None => suggest_paragraphs = false,
}
filter.labels_expression.clear();
filter.field_labels.clear();
}
}

Expand Down
36 changes: 36 additions & 0 deletions nucliadb_node/tests/test_suggest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,42 @@ async fn test_suggest_paragraphs(
paragraph_labels: vec![],
..Default::default()
}),
..request.clone()
}))
.await
.unwrap()
.into_inner();

expect_paragraphs(&response, &[]);

// Inverted filter
let response = reader
.suggest(Request::new(SuggestRequest {
filter: Some(Filter {
field_labels: vec!["/s/p/de".to_string()],
labels_expression: r#"{"not": {"literal": "/s/p/de"}}"#.to_string(),
paragraph_labels: vec![],
..Default::default()
}),
..request.clone()
}))
.await
.unwrap()
.into_inner();

expect_paragraphs(
&response,
&[(&shard.resources["little prince"], "/a/title"), (&shard.resources["little prince"], "/a/summary")],
);

let response = reader
.suggest(Request::new(SuggestRequest {
filter: Some(Filter {
field_labels: vec!["/s/p/en".to_string()],
labels_expression: r#"{"not": {"literal": "/s/p/en"}}"#.to_string(),
paragraph_labels: vec![],
..Default::default()
}),
..request
}))
.await
Expand Down

0 comments on commit deb6751

Please sign in to comment.