Skip to content

Commit

Permalink
Fix regex highlight problem with unescaped text (#2220)
Browse files Browse the repository at this point in the history
* Fix regex highlight problem with unescaped text

* Simplify
  • Loading branch information
jotare authored Jun 5, 2024
1 parent 12ad092 commit d39e682
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 8 deletions.
13 changes: 5 additions & 8 deletions nucliadb/src/nucliadb/search/search/paragraphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,22 +216,19 @@ async def get_text_sentence(
return splitted_text


def get_regex(some_string: str) -> str:
    """Return a regex that matches ``some_string`` literally as a whole term.

    The term is lowercased, so the resulting pattern is meant to be run
    against lowercased text. Every regex metacharacter in the term is
    escaped: user-provided input such as ``"match?)"`` or ``".*unsafe.*"``
    can neither raise ``re.error`` nor behave as a wildcard.

    Args:
        some_string: The raw term to search for.

    Returns:
        A pattern string whose match span covers exactly the term.
    """
    escaped = re.escape(some_string.lower())
    # Zero-width lookarounds instead of ``\b``: for terms that start and end
    # with word characters they are equivalent to ``\b``, but they also let
    # terms delimited by punctuation (e.g. "(not") match next to whitespace,
    # where ``\b`` would require an adjacent word character.
    return r"(?<!\w)" + escaped + r"(?!\w)"


def highlight_paragraph(
text: str, words: Optional[list[str]] = None, ematches: Optional[list[str]] = None
) -> str:
REGEX_TEMPLATE = r"(^|\s)({text})(\s|$)"
text_lower = text.lower()

marks = [0] * (len(text_lower) + 1)
if ematches is not None:
for quote in ematches:
quote_regex = get_regex(quote.lower())
quote_regex = REGEX_TEMPLATE.format(text=re.escape(quote.lower()))
try:
for match in re.finditer(quote_regex, text_lower):
start, end = match.span()
start, end = match.span(2)
marks[start] = 1
marks[end] = 2
except re.error:
Expand All @@ -242,10 +239,10 @@ def highlight_paragraph(

words = words or []
for word in words:
word_regex = get_regex(word.lower())
word_regex = REGEX_TEMPLATE.format(text=re.escape(word.lower()))
try:
for match in re.finditer(word_regex, text_lower):
start, end = match.span()
start, end = match.span(2)
if marks[start] == 0 and marks[end] == 0:
marks[start] = 1
marks[end] = 2
Expand Down
22 changes: 22 additions & 0 deletions nucliadb/tests/search/unit/search/test_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,28 @@ def test_highligh_error(benchmark: BenchmarkFixture):
)


def test_highlight_handles_unescaped_sequences():
    """Highlight terms containing regex metacharacters are matched literally."""
    # Each case: (input text, terms to highlight, expected output).
    cases = [
        (
            "this is (not unsafe), just normal text",
            ["(not", ".*unsafe.*"],
            "this is <mark>(not</mark> unsafe), just normal text",
        ),
        (
            # An unbalanced parenthesis in the term must leave the text as is.
            "should this match? The answer is...",
            ["match?)"],
            "should this match? The answer is...",
        ),
        (
            "unsafe.* texts now can match safely",
            ["unsafe.*", "safe"],
            "<mark>unsafe.*</mark> texts now can match safely",
        ),
        (
            "l'estany l'il·luminat?",
            ["l'il·luminat?"],
            "l'estany <mark>l'il·luminat?</mark>",
        ),
        # Disabled case carried over from the original test (reason not
        # stated there):
        # (
        #     "(w.*o.*r.*d)",
        #     ["o.*"],
        #     "(w.*<mark>o.*</mark>r.*d)",
        # ),
    ]

    for text, terms, expected in cases:
        assert highlight(text, terms) == expected


def test_highlight():
res = highlight(
"Query whatever you want my to make it work my query with this",
Expand Down

0 comments on commit d39e682

Please sign in to comment.