From 9b5eb9c388a2d1141ad9977b6cbd824803cdaca5 Mon Sep 17 00:00:00 2001
From: Ferran Llamas
Date: Thu, 21 Dec 2023 12:59:27 +0100
Subject: [PATCH] Add sorted paragraphs in find results (#1687)

* done

* add test

* improve description
---
 nucliadb/nucliadb/search/search/find_merge.py |  4 +++
 .../nucliadb/tests/integration/test_find.py   | 29 +++++++++++++++++++
 nucliadb_models/nucliadb_models/search.py     |  5 ++++
 3 files changed, 38 insertions(+)

diff --git a/nucliadb/nucliadb/search/search/find_merge.py b/nucliadb/nucliadb/search/search/find_merge.py
index 30f5944888..17d18e70fc 100644
--- a/nucliadb/nucliadb/search/search/find_merge.py
+++ b/nucliadb/nucliadb/search/search/find_merge.py
@@ -139,6 +139,7 @@ def sorted_by_insertion(self) -> Iterator[Any]:
 @merge_observer.wrap({"type": "fetch_find_metadata"})
 async def fetch_find_metadata(
     find_resources: dict[str, FindResource],
+    best_matches: list[str],
     result_paragraphs: list[TempFindParagraph],
     kbid: str,
     show: list[ResourceProperties],
@@ -206,6 +207,7 @@ async def fetch_find_metadata(
         orderer.sorted_by_insertion()
     ):
         find_resources[rid].fields[field_id].paragraphs[paragraph_id].order = order
+        best_matches.append(paragraph_id)
 
     for resource in resources:
         operations.append(
@@ -408,10 +410,12 @@ async def find_merge_results(
         page_size=count,
         next_page=next_page,
         min_score=round(min_score, ndigits=3),
+        best_matches=[],
     )
 
     await fetch_find_metadata(
         api_results.resources,
+        api_results.best_matches,
         result_paragraphs,
         kbid,
         show,
diff --git a/nucliadb/nucliadb/tests/integration/test_find.py b/nucliadb/nucliadb/tests/integration/test_find.py
index cca8a9f32f..a9d35e009e 100644
--- a/nucliadb/nucliadb/tests/integration/test_find.py
+++ b/nucliadb/nucliadb/tests/integration/test_find.py
@@ -320,3 +320,32 @@ def check_fuzzy_paragraphs(find_response, *, fuzzy_result: bool, n_expected: int
         assert paragraph["fuzzy_result"] is fuzzy_result
         found += 1
     assert found == n_expected
+
+
+@pytest.mark.asyncio
+async def test_find_returns_best_matches(
+    nucliadb_reader: AsyncClient,
+    philosophy_books_kb,
+):
+    kbid = philosophy_books_kb
+
+    resp = await nucliadb_reader.post(
+        f"/kb/{kbid}/find",
+        json={
+            "query": "and",
+        },
+    )
+    assert resp.status_code == 200
+    body = resp.json()
+
+    best_matches = body["best_matches"]
+    paragraphs = []
+    for resource in body["resources"].values():
+        for field in resource["fields"].values():
+            for paragraph in field["paragraphs"].values():
+                paragraphs.append(paragraph)
+    assert len(paragraphs) == len(best_matches) > 2
+
+    # Check that best matches is sorted by the paragraph order
+    sorted_paragraphs = sorted(paragraphs, key=lambda p: p["order"])
+    assert [p["id"] for p in sorted_paragraphs] == best_matches
diff --git a/nucliadb_models/nucliadb_models/search.py b/nucliadb_models/nucliadb_models/search.py
index f25f25384c..916056c0ca 100644
--- a/nucliadb_models/nucliadb_models/search.py
+++ b/nucliadb_models/nucliadb_models/search.py
@@ -878,6 +878,11 @@ class KnowledgeboxFindResults(JsonBaseModel):
     shards: Optional[List[str]] = None
     autofilters: List[str] = ModelParamDefaults.applied_autofilters.to_pydantic_field()
     min_score: float = ModelParamDefaults.min_score.to_pydantic_field()
+    best_matches: List[str] = Field(
+        default=[],
+        title="Best matches",
+        description="List of ids of best matching paragraphs. The list is sorted by decreasing relevance (most relevant first).",  # noqa
+    )
 
 
 class FeedbackTasks(str, Enum):