Add sorted paragraphs in find results (#1687)

* done * add test * improve description
nuclia · Dec 21, 2023 · 9b5eb9c · 9b5eb9c · github-actions · Dec 21, 2023
1 parent 235344b
commit 9b5eb9c
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 0 deletions.
diff --git a/nucliadb/nucliadb/search/search/find_merge.py b/nucliadb/nucliadb/search/search/find_merge.py
@@ -139,6 +139,7 @@ def sorted_by_insertion(self) -> Iterator[Any]:
 @merge_observer.wrap({"type": "fetch_find_metadata"})
 async def fetch_find_metadata(
     find_resources: dict[str, FindResource],
+    best_matches: list[str],
     result_paragraphs: list[TempFindParagraph],
     kbid: str,
     show: list[ResourceProperties],
@@ -206,6 +207,7 @@ async def fetch_find_metadata(
         orderer.sorted_by_insertion()
     ):
         find_resources[rid].fields[field_id].paragraphs[paragraph_id].order = order
+        best_matches.append(paragraph_id)
 
     for resource in resources:
         operations.append(
@@ -408,10 +410,12 @@ async def find_merge_results(
             page_size=count,
             next_page=next_page,
             min_score=round(min_score, ndigits=3),
+            best_matches=[],
         )
 
         await fetch_find_metadata(
             api_results.resources,
+            api_results.best_matches,
             result_paragraphs,
             kbid,
             show,

diff --git a/nucliadb/nucliadb/tests/integration/test_find.py b/nucliadb/nucliadb/tests/integration/test_find.py
@@ -320,3 +320,32 @@ def check_fuzzy_paragraphs(find_response, *, fuzzy_result: bool, n_expected: int
                 assert paragraph["fuzzy_result"] is fuzzy_result
                 found += 1
     assert found == n_expected
+
+
+@pytest.mark.asyncio
+async def test_find_returns_best_matches(
+    nucliadb_reader: AsyncClient,
+    philosophy_books_kb,
+):
+    kbid = philosophy_books_kb
+
+    resp = await nucliadb_reader.post(
+        f"/kb/{kbid}/find",
+        json={
+            "query": "and",
+        },
+    )
+    assert resp.status_code == 200
+    body = resp.json()
+
+    best_matches = body["best_matches"]
+    paragraphs = []  # flattened from the nested resources -> fields -> paragraphs mapping
+    for resource in body["resources"].values():
+        for field in resource["fields"].values():
+            for paragraph in field["paragraphs"].values():
+                paragraphs.append(paragraph)
+    assert len(paragraphs) == len(best_matches) > 2  # same count, and more than 2 so the ordering check below is not trivial
+
+    # Check that best_matches lists the paragraph ids in ascending "order", i.e. the same ranking the paragraphs carry
+    sorted_paragraphs = sorted(paragraphs, key=lambda p: p["order"])
+    assert [p["id"] for p in sorted_paragraphs] == best_matches
diff --git a/nucliadb_models/nucliadb_models/search.py b/nucliadb_models/nucliadb_models/search.py
@@ -878,6 +878,11 @@ class KnowledgeboxFindResults(JsonBaseModel):
     shards: Optional[List[str]] = None
     autofilters: List[str] = ModelParamDefaults.applied_autofilters.to_pydantic_field()
     min_score: float = ModelParamDefaults.min_score.to_pydantic_field()
+    best_matches: List[str] = Field(
+        default=[],
+        title="Best matches",
+        description="List of ids of best matching paragraphs. The list is sorted by decreasing relevance (most relevant first).",  # noqa
+    )
 
 
 class FeedbackTasks(str, Enum):