Skip to content

Commit

Permalink
Add sorted paragraphs in find results (#1687)
Browse files Browse the repository at this point in the history
* done

* add test

* improve description
  • Loading branch information
lferran authored Dec 21, 2023
1 parent 235344b commit 9b5eb9c
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 0 deletions.
4 changes: 4 additions & 0 deletions nucliadb/nucliadb/search/search/find_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def sorted_by_insertion(self) -> Iterator[Any]:
@merge_observer.wrap({"type": "fetch_find_metadata"})
async def fetch_find_metadata(
find_resources: dict[str, FindResource],
best_matches: list[str],
result_paragraphs: list[TempFindParagraph],
kbid: str,
show: list[ResourceProperties],
Expand Down Expand Up @@ -206,6 +207,7 @@ async def fetch_find_metadata(
orderer.sorted_by_insertion()
):
find_resources[rid].fields[field_id].paragraphs[paragraph_id].order = order
best_matches.append(paragraph_id)

for resource in resources:
operations.append(
Expand Down Expand Up @@ -408,10 +410,12 @@ async def find_merge_results(
page_size=count,
next_page=next_page,
min_score=round(min_score, ndigits=3),
best_matches=[],
)

await fetch_find_metadata(
api_results.resources,
api_results.best_matches,
result_paragraphs,
kbid,
show,
Expand Down
29 changes: 29 additions & 0 deletions nucliadb/nucliadb/tests/integration/test_find.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,32 @@ def check_fuzzy_paragraphs(find_response, *, fuzzy_result: bool, n_expected: int
assert paragraph["fuzzy_result"] is fuzzy_result
found += 1
assert found == n_expected


@pytest.mark.asyncio
async def test_find_returns_best_matches(
    nucliadb_reader: AsyncClient,
    philosophy_books_kb,
):
    """Check that /find reports ``best_matches`` in paragraph ``order``.

    Posts the query "and" to the find endpoint of the knowledge box given by
    the ``philosophy_books_kb`` fixture, then verifies that:

    * the request succeeds with HTTP 200,
    * ``best_matches`` has one entry per paragraph present in the response,
      and more than two entries overall,
    * listing the paragraph ids by ascending ``order`` reproduces
      ``best_matches`` exactly.
    """
    response = await nucliadb_reader.post(
        f"/kb/{philosophy_books_kb}/find",
        json={
            "query": "and",
        },
    )
    assert response.status_code == 200
    payload = response.json()

    expected_ids = payload["best_matches"]

    # Flatten resources -> fields -> paragraphs into a single list.
    found_paragraphs = [
        paragraph
        for resource in payload["resources"].values()
        for field in resource["fields"].values()
        for paragraph in field["paragraphs"].values()
    ]
    assert len(found_paragraphs) == len(expected_ids) > 2

    # Check that best matches is sorted by the paragraph order
    found_paragraphs.sort(key=lambda p: p["order"])
    assert [p["id"] for p in found_paragraphs] == expected_ids
5 changes: 5 additions & 0 deletions nucliadb_models/nucliadb_models/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,11 @@ class KnowledgeboxFindResults(JsonBaseModel):
shards: Optional[List[str]] = None
autofilters: List[str] = ModelParamDefaults.applied_autofilters.to_pydantic_field()
min_score: float = ModelParamDefaults.min_score.to_pydantic_field()
best_matches: List[str] = Field(
default=[],
title="Best matches",
description="List of ids of best matching paragraphs. The list is sorted by decreasing relevance (most relevant first).", # noqa
)


class FeedbackTasks(str, Enum):
Expand Down

3 comments on commit 9b5eb9c

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmark

Benchmark suite Current: 9b5eb9c Previous: 5a633b0 Ratio
nucliadb/search/tests/unit/search/test_fetch.py::test_highligh_error 12816.628752282295 iter/sec (stddev: 2.0355081047455393e-7) 12745.686329086004 iter/sec (stddev: 1.7317806991721728e-7) 0.99

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmark

Benchmark suite Current: 9b5eb9c Previous: 5a633b0 Ratio
nucliadb/search/tests/unit/search/test_fetch.py::test_highligh_error 12850.270419330267 iter/sec (stddev: 0.000001789047790299129) 12745.686329086004 iter/sec (stddev: 1.7317806991721728e-7) 0.99

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmark

Benchmark suite Current: 9b5eb9c Previous: 5a633b0 Ratio
nucliadb/search/tests/unit/search/test_fetch.py::test_highligh_error 12958.104664779648 iter/sec (stddev: 5.611689711665966e-7) 12745.686329086004 iter/sec (stddev: 1.7317806991721728e-7) 0.98

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.