From 9b5eb9c388a2d1141ad9977b6cbd824803cdaca5 Mon Sep 17 00:00:00 2001
From: Ferran Llamas
Date: Thu, 21 Dec 2023 12:59:27 +0100
Subject: [PATCH] Add sorted paragraphs in find results (#1687)

* done

* add test

* improve description
---
 nucliadb/nucliadb/search/search/find_merge.py |  4 +++
 .../nucliadb/tests/integration/test_find.py   | 29 +++++++++++++++++++
 nucliadb_models/nucliadb_models/search.py     |  5 ++++
 3 files changed, 38 insertions(+)

diff --git a/nucliadb/nucliadb/search/search/find_merge.py b/nucliadb/nucliadb/search/search/find_merge.py
index 30f5944888..17d18e70fc 100644
--- a/nucliadb/nucliadb/search/search/find_merge.py
+++ b/nucliadb/nucliadb/search/search/find_merge.py
@@ -139,6 +139,7 @@ def sorted_by_insertion(self) -> Iterator[Any]:
 @merge_observer.wrap({"type": "fetch_find_metadata"})
 async def fetch_find_metadata(
     find_resources: dict[str, FindResource],
+    best_matches: list[str],
     result_paragraphs: list[TempFindParagraph],
     kbid: str,
     show: list[ResourceProperties],
@@ -206,6 +207,7 @@ async def fetch_find_metadata(
         orderer.sorted_by_insertion()
     ):
         find_resources[rid].fields[field_id].paragraphs[paragraph_id].order = order
+        best_matches.append(paragraph_id)
 
     for resource in resources:
         operations.append(
@@ -408,10 +410,12 @@ async def find_merge_results(
         page_size=count,
         next_page=next_page,
         min_score=round(min_score, ndigits=3),
+        best_matches=[],
     )
 
     await fetch_find_metadata(
         api_results.resources,
+        api_results.best_matches,
         result_paragraphs,
         kbid,
         show,
diff --git a/nucliadb/nucliadb/tests/integration/test_find.py b/nucliadb/nucliadb/tests/integration/test_find.py
index cca8a9f32f..a9d35e009e 100644
--- a/nucliadb/nucliadb/tests/integration/test_find.py
+++ b/nucliadb/nucliadb/tests/integration/test_find.py
@@ -320,3 +320,32 @@ def check_fuzzy_paragraphs(find_response, *, fuzzy_result: bool, n_expected: int
         assert paragraph["fuzzy_result"] is fuzzy_result
         found += 1
     assert found == n_expected
+
+
+@pytest.mark.asyncio
+async def test_find_returns_best_matches(
+    nucliadb_reader: AsyncClient,
+    philosophy_books_kb,
+):
+    kbid = philosophy_books_kb
+
+    resp = await nucliadb_reader.post(
+        f"/kb/{kbid}/find",
+        json={
+            "query": "and",
+        },
+    )
+    assert resp.status_code == 200
+    body = resp.json()
+
+    best_matches = body["best_matches"]
+    paragraphs = []
+    for resource in body["resources"].values():
+        for field in resource["fields"].values():
+            for paragraph in field["paragraphs"].values():
+                paragraphs.append(paragraph)
+    assert len(paragraphs) == len(best_matches) > 2
+
+    # Check that best matches is sorted by the paragraph order
+    sorted_paragraphs = sorted(paragraphs, key=lambda p: p["order"])
+    assert [p["id"] for p in sorted_paragraphs] == best_matches
diff --git a/nucliadb_models/nucliadb_models/search.py b/nucliadb_models/nucliadb_models/search.py
index f25f25384c..916056c0ca 100644
--- a/nucliadb_models/nucliadb_models/search.py
+++ b/nucliadb_models/nucliadb_models/search.py
@@ -878,6 +878,11 @@ class KnowledgeboxFindResults(JsonBaseModel):
     shards: Optional[List[str]] = None
     autofilters: List[str] = ModelParamDefaults.applied_autofilters.to_pydantic_field()
     min_score: float = ModelParamDefaults.min_score.to_pydantic_field()
+    best_matches: List[str] = Field(
+        default=[],
+        title="Best matches",
+        description="List of ids of best matching paragraphs. The list is sorted by decreasing relevance (most relevant first).",  # noqa
+    )
 
 
 class FeedbackTasks(str, Enum):