Return whether paragraph search results are fuzzy or not (#1673)
lferran authored Dec 18, 2023
1 parent 5102ed0 commit 55b59ae
Showing 5 changed files with 131 additions and 12 deletions.
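
In short, the commit derives a per-paragraph fuzziness flag from the index results (len(matches) > 0) and threads it through to the Paragraph and FindParagraph response models, defaulting to False; vector matches are always reported as non-fuzzy, since vector search has no notion of fuzziness. A minimal sketch of the idea with simplified stand-in types (the real code works on the protobuf results shown in the diff below; treating matches as holding only fuzzy-matched terms is an assumption on my part):

from dataclasses import dataclass, field
from typing import List


@dataclass
class IndexParagraph:
    """Stand-in for a paragraph hit as returned by the index."""

    id: str
    score: float
    matches: List[str] = field(default_factory=list)  # assumed: terms that matched only fuzzily


@dataclass
class ApiParagraph:
    """Stand-in for the API response model; the new flag defaults to False."""

    id: str
    score: float
    fuzzy_result: bool = False


def to_api_paragraph(hit: IndexParagraph) -> ApiParagraph:
    # Same heuristic as the diff: a hit is fuzzy iff the index reported any matches for it.
    return ApiParagraph(id=hit.id, score=hit.score, fuzzy_result=len(hit.matches) > 0)
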
20 changes: 8 additions & 12 deletions nucliadb/nucliadb/search/search/find_merge.py
@@ -64,9 +64,7 @@ async def set_text_value(
ematches: Optional[List[str]] = None,
extracted_text_cache: Optional[paragraphs.ExtractedTextCache] = None,
):
# TODO: Improve
await max_operations.acquire()
try:
async with max_operations:
assert result_paragraph.paragraph
assert result_paragraph.paragraph.position
result_paragraph.paragraph.text = await paragraphs.get_paragraph_text(
@@ -81,8 +79,6 @@ async def set_text_value(
matches=[], # TODO
extracted_text_cache=extracted_text_cache,
)
finally:
max_operations.release()


@merge_observer.wrap({"type": "set_resource_metadada_value"})
@@ -95,9 +91,7 @@ async def set_resource_metadata_value(
find_resources: Dict[str, FindResource],
max_operations: asyncio.Semaphore,
):
await max_operations.acquire()

try:
async with max_operations:
serialized_resource = await serialize(
kbid,
resource,
@@ -112,9 +106,6 @@ async def set_resource_metadata_value(
logger.warning(f"Resource {resource} not found in {kbid}")
find_resources.pop(resource, None)

finally:
max_operations.release()


class Orderer:
def __init__(self):
@@ -248,6 +239,7 @@ def merge_paragraphs_vectors(
# We assume that paragraphs_shards and vectors_shards are already ordered
for paragraphs_shard in paragraphs_shards:
for paragraph in paragraphs_shard:
fuzzy_result = len(paragraph.matches) > 0
merged_paragrahs.append(
TempFindParagraph(
paragraph_index=paragraph,
@@ -258,6 +250,7 @@ def merge_paragraphs_vectors(
split=paragraph.split,
end=paragraph.end,
id=paragraph.paragraph,
fuzzy_result=fuzzy_result,
)
)

@@ -322,8 +315,10 @@ def merge_paragraphs_vectors(
],
),
id=merged_paragraph.id,
# Vector searches don't have fuzziness
fuzzy_result=False,
)
if merged_paragraph.paragraph_index is not None:
elif merged_paragraph.paragraph_index is not None:
merged_paragraph.paragraph = FindParagraph(
score=merged_paragraph.paragraph_index.score.bm25,
score_type=SCORE_TYPE.BM25,
@@ -344,6 +339,7 @@ def merge_paragraphs_vectors(
],
),
id=merged_paragraph.id,
fuzzy_result=merged_paragraph.fuzzy_result,
)
return merged_paragrahs, next_page
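
Besides the fuzzy flag, this file also swaps the manual acquire()/try/finally release() pattern around the concurrency semaphore for the async context-manager form. The two are behaviorally equivalent; the context manager simply guarantees the release even when the body raises. A standalone illustration, not taken from the diff:

import asyncio


async def bounded_work(sem: asyncio.Semaphore) -> None:
    # Old form: the release has to be guaranteed by hand.
    await sem.acquire()
    try:
        await asyncio.sleep(0)  # some bounded work
    finally:
        sem.release()

    # New form: acquired on entry, released on exit, including when the body raises.
    async with sem:
        await asyncio.sleep(0)  # some bounded work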

2 changes: 2 additions & 0 deletions nucliadb/nucliadb/search/search/merge.py
@@ -401,6 +401,7 @@ async def merge_paragraph_results(
extracted_text_cache=etcache,
)
labels = await get_labels_paragraph(result, kbid)
fuzzy_result = len(result.matches) > 0
new_paragraph = Paragraph(
score=result.score.bm25,
rid=result.uuid,
@@ -414,6 +415,7 @@ async def merge_paragraph_results(
end=result.metadata.position.end,
page_number=result.metadata.position.page_number,
),
fuzzy_result=fuzzy_result,
)
if len(result.metadata.position.start_seconds) or len(
result.metadata.position.end_seconds
58 changes: 58 additions & 0 deletions nucliadb/nucliadb/tests/integration/search/test_search.py
@@ -1441,3 +1441,61 @@ async def test_facets_validation(
else:
assert resp.status_code == 422
assert error_message == resp.json()["detail"][0]["msg"]


@pytest.mark.asyncio
@pytest.mark.parametrize("knowledgebox", ("EXPERIMENTAL", "STABLE"), indirect=True)
async def test_search_marks_fuzzy_results(
nucliadb_reader: AsyncClient,
nucliadb_writer: AsyncClient,
knowledgebox,
):
resp = await nucliadb_writer.post(
f"/kb/{knowledgebox}/resources",
json={
"slug": "myresource",
"title": "My Title",
},
)
assert resp.status_code == 201

# Should get only one non-fuzzy result
resp = await nucliadb_reader.post(
f"/kb/{knowledgebox}/search",
json={
"query": "Title",
},
)
assert resp.status_code == 200
body = resp.json()
check_fuzzy_paragraphs(body, fuzzy_result=False, n_expected=1)

# Should get only one fuzzy result
resp = await nucliadb_reader.post(
f"/kb/{knowledgebox}/search",
json={
"query": "totle",
},
)
assert resp.status_code == 200
body = resp.json()
check_fuzzy_paragraphs(body, fuzzy_result=True, n_expected=1)

# Should not get any result if exact match term queried
resp = await nucliadb_reader.post(
f"/kb/{knowledgebox}/search",
json={
"query": '"totle"',
},
)
assert resp.status_code == 200
body = resp.json()
check_fuzzy_paragraphs(body, fuzzy_result=True, n_expected=0)


def check_fuzzy_paragraphs(search_response, *, fuzzy_result: bool, n_expected: int):
found = 0
for paragraph in search_response["paragraphs"]["results"]:
assert paragraph["fuzzy_result"] is fuzzy_result
found += 1
assert found == n_expected
60 changes: 60 additions & 0 deletions nucliadb/nucliadb/tests/integration/test_find.py
@@ -258,3 +258,63 @@ async def test_story_7286(
body = resp.json()
assert len(body["resources"]) == 0
assert caplog.record_tuples[0][2] == f"Resource {rid} not found in {knowledgebox}"


@pytest.mark.asyncio
@pytest.mark.parametrize("knowledgebox", ("EXPERIMENTAL", "STABLE"), indirect=True)
async def test_find_marks_fuzzy_results(
nucliadb_reader: AsyncClient,
nucliadb_writer: AsyncClient,
knowledgebox,
):
resp = await nucliadb_writer.post(
f"/kb/{knowledgebox}/resources",
json={
"slug": "myresource",
"title": "My Title",
},
)
assert resp.status_code == 201

# Should get only one non-fuzzy result
resp = await nucliadb_reader.post(
f"/kb/{knowledgebox}/find",
json={
"query": "Title",
},
)
assert resp.status_code == 200
body = resp.json()
check_fuzzy_paragraphs(body, fuzzy_result=False, n_expected=1)

# Should get only one fuzzy result
resp = await nucliadb_reader.post(
f"/kb/{knowledgebox}/find",
json={
"query": "totle",
},
)
assert resp.status_code == 200
body = resp.json()
check_fuzzy_paragraphs(body, fuzzy_result=True, n_expected=1)

# Should not get any result if exact match term queried
resp = await nucliadb_reader.post(
f"/kb/{knowledgebox}/find",
json={
"query": '"totle"',
},
)
assert resp.status_code == 200
body = resp.json()
check_fuzzy_paragraphs(body, fuzzy_result=True, n_expected=0)


def check_fuzzy_paragraphs(find_response, *, fuzzy_result: bool, n_expected: int):
found = 0
for resource in find_response["resources"].values():
for field in resource["fields"].values():
for paragraph in field["paragraphs"].values():
assert paragraph["fuzzy_result"] is fuzzy_result
found += 1
assert found == n_expected
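
The helper above also documents where the flag lives in a /find response: resources, then fields, then paragraphs. As a usage sketch, a client that wants exact hits only could filter on the flag like this (hypothetical helper, not part of the commit):

from typing import Any, Dict


def drop_fuzzy_paragraphs(find_response: Dict[str, Any]) -> int:
    """Remove fuzzy paragraphs from a /find response body in place; return how many remain."""
    kept = 0
    for resource in find_response["resources"].values():
        for field in resource["fields"].values():
            paragraphs = field["paragraphs"]
            for paragraph_id in list(paragraphs):
                if paragraphs[paragraph_id]["fuzzy_result"]:
                    del paragraphs[paragraph_id]
                else:
                    kept += 1
    return kept
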
3 changes: 3 additions & 0 deletions nucliadb_models/nucliadb_models/search.py
@@ -150,6 +150,7 @@ class Paragraph(BaseModel):
start_seconds: Optional[List[int]] = None
end_seconds: Optional[List[int]] = None
position: Optional[TextPosition] = None
fuzzy_result: bool = False


class Paragraphs(BaseModel):
@@ -825,6 +826,7 @@ class FindParagraph(BaseModel):
id: str
labels: Optional[List[str]] = []
position: Optional[TextPosition] = None
fuzzy_result: bool = False


@dataclass
@@ -839,6 +841,7 @@ class TempFindParagraph:
paragraph: Optional[FindParagraph] = None
vector_index: Optional[DocumentScored] = None
paragraph_index: Optional[PBParagraphResult] = None
fuzzy_result: bool = False


class FindField(BaseModel):
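
With the model changes above, each entry under body["paragraphs"]["results"] in a /search response carries the flag as well. A rough illustration with invented values; field names other than fuzzy_result are inferred from the surrounding diff and should be treated as approximate:

# Illustrative /search paragraph entry; values are made up and unrelated fields are omitted.
paragraph_result = {
    "score": 1.23,  # BM25 score, per merge.py above
    "rid": "some-resource-uuid",
    "position": {"start": 0, "end": 8, "page_number": 0},
    "fuzzy_result": False,  # True when the hit came from fuzzy term matching
}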

1 comment on commit 55b59ae

@github-actions
Contributor

Benchmark

Benchmark suite: nucliadb/search/tests/unit/search/test_fetch.py::test_highligh_error
Current (55b59ae): 12939.399709609745 iter/sec (stddev: 0.000002461444765023132)
Previous (5a633b0): 12745.686329086004 iter/sec (stddev: 1.7317806991721728e-7)
Ratio: 0.99

This comment was automatically generated by workflow using github-action-benchmark.
