Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Reconstitution] Improve reconstitution #1750

Merged
merged 9 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,10 @@ def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **
def synthesize(self, **kwargs) -> np.ndarray:
"""Synthesize the page from the predictions

Args:
----
**kwargs: keyword arguments passed to the `synthesize_page` method

Returns
-------
synthesized page
Expand Down Expand Up @@ -493,7 +497,7 @@ def synthesize(self, **kwargs) -> np.ndarray:

Args:
----
**kwargs: keyword arguments passed to the matplotlib.pyplot.show method
**kwargs: keyword arguments passed to the `synthesize_kie_page` method

Returns:
-------
Expand Down Expand Up @@ -603,11 +607,15 @@ def show(self, **kwargs) -> None:
def synthesize(self, **kwargs) -> List[np.ndarray]:
    """Synthesize all pages from their predictions

    Args:
    ----
        **kwargs: keyword arguments passed to the `Page.synthesize` method

    Returns
    -------
        list of synthesized pages
    """
    # Forward the caller's options to every page; without this the keyword
    # arguments accepted by this method would be silently ignored.
    return [page.synthesize(**kwargs) for page in self.pages]

def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
"""Export the document as XML (hOCR-format)
Expand Down
216 changes: 151 additions & 65 deletions doctr/utils/reconstitution.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
import logging
from typing import Any, Dict, Optional

import numpy as np
Expand All @@ -13,61 +14,163 @@
__all__ = ["synthesize_page", "synthesize_kie_page"]


# Module-level flag set by `_warn_rotation` after its first warning, so the
# rotation warning is logged only once instead of once per rendered entry
ROTATION_WARNING = False


def _warn_rotation(entry: Dict[str, Any]) -> None:  # pragma: no cover
    """Log a single warning the first time an entry with a 4-point geometry is seen.

    Args:
    ----
        entry: exported element whose "geometry" is inspected
    """
    global ROTATION_WARNING
    # The warning has already been issued: nothing to do (geometry is not even inspected)
    if ROTATION_WARNING:
        return
    # Only 4-point geometries (polygons) trigger the warning
    if len(entry["geometry"]) != 4:
        return
    logging.warning("Polygons with larger rotations will lead to inaccurate rendering")
    ROTATION_WARNING = True


def _synthesize(
    response: Image.Image,
    entry: Dict[str, Any],
    w: int,
    h: int,
    draw_proba: bool = False,
    font_family: Optional[str] = None,
    smoothing_factor: float = 0.75,
    min_font_size: int = 6,
    max_font_size: int = 50,
) -> Image.Image:
    """Draw the text of a single exported entry (word or line) onto the page image.

    Args:
    ----
        response: image to draw on (modified in place)
        entry: exported element holding a "geometry" (2 points or a polygon) and either a
            "value" or a list of "words"; geometry coordinates are scaled by `w` and `h`
        w: width of the page, used to scale x coordinates
        h: height of the page, used to scale y coordinates
        draw_proba: if True, outline the entry and write its confidence above it.
            Blue: p=1, red: p=0
        font_family: family of the font
        smoothing_factor: multiplicative factor applied to the font size at each shrinking step
        min_font_size: minimum font size
        max_font_size: maximum font size

    Returns:
    -------
        the image passed as `response`, with the entry drawn on it
    """
    geometry = entry["geometry"]
    if len(geometry) == 2:
        # Straight box given by two corners: expand it to its four corners
        (rel_xmin, rel_ymin), (rel_xmax, rel_ymax) = geometry
        polygon = [(rel_xmin, rel_ymin), (rel_xmax, rel_ymin), (rel_xmax, rel_ymax), (rel_xmin, rel_ymax)]
    else:
        polygon = geometry

    # Calculate the bounding box of the word
    x_coords, y_coords = zip(*polygon)
    xmin, ymin, xmax, ymax = (
        int(round(w * min(x_coords))),
        int(round(h * min(y_coords))),
        int(round(w * max(x_coords))),
        int(round(h * max(y_coords))),
    )
    word_width = xmax - xmin
    word_height = ymax - ymin

    # If lines are provided instead of words, concatenate the word entries
    if "words" in entry:
        word_text = " ".join(word["value"] for word in entry["words"])
    else:
        word_text = entry["value"]

    # Find the optimal font size: start from the box height and shrink until the text fits
    try:
        font_size = min(word_height, max_font_size)
        font = get_font(font_family, font_size)
        text_width, text_height = font.getbbox(word_text)[2:4]

        while (text_width > word_width or text_height > word_height) and font_size > min_font_size:
            # Shrink by at least one point per step so that a smoothing_factor >= 1
            # cannot keep the size constant and loop forever
            shrunk = min(int(font_size * smoothing_factor), font_size - 1)
            font_size = max(shrunk, min_font_size)
            font = get_font(font_family, font_size)
            text_width, text_height = font.getbbox(word_text)[2:4]
    except ValueError:
        # Sizing failed: fall back to the smallest allowed font
        font = get_font(font_family, min_font_size)

    # Draw the word text
    d = ImageDraw.Draw(response)
    try:
        try:
            d.text((xmin, ymin), word_text, font=font, fill=(0, 0, 0), anchor="lt")
        except UnicodeEncodeError:
            # When characters cannot be encoded, use their anyascii version
            d.text((xmin, ymin), anyascii(word_text), font=font, fill=(0, 0, 0), anchor="lt")
    # Catch generic exceptions to avoid crashing the whole rendering
    except Exception:  # pragma: no cover
        logging.warning(f"Could not render word: {word_text}")

    if not draw_proba:
        return response

    if "confidence" in entry:
        confidence = entry["confidence"]
    else:
        # Line entries carry no confidence of their own: average the one of their words
        words = entry["words"]
        if not words:
            # Nothing to average (would divide by zero): skip the confidence overlay
            return response
        confidence = sum(word["confidence"] for word in words) / len(words)

    p = int(255 * confidence)
    color = (255 - p, 0, p)  # Red to blue gradient based on probability
    d.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=2)

    prob_font = get_font(font_family, 20)
    prob_text = f"{confidence:.2f}"
    prob_text_width, prob_text_height = prob_font.getbbox(prob_text)[2:4]

    # Position the probability slightly above the bounding box, horizontally centered,
    # clamped so that it never leaves the top of the page
    prob_x_offset = (word_width - prob_text_width) // 2
    prob_y_offset = max(0, ymin - prob_text_height - 2)

    d.text((xmin + prob_x_offset, prob_y_offset), prob_text, font=prob_font, fill=color, anchor="lt")

    return response


def synthesize_page(
    page: Dict[str, Any],
    draw_proba: bool = False,
    font_family: Optional[str] = None,
    smoothing_factor: float = 0.95,
    min_font_size: int = 8,
    max_font_size: int = 50,
) -> np.ndarray:
    """Draw the content of the element page (OCR response) on a blank page.

    Args:
    ----
        page: exported Page object to represent
        draw_proba: if True, outline each drawn entry and write its confidence.
            Blue: p=1, red: p=0
        font_family: family of the font
        smoothing_factor: factor to smooth the font size
        min_font_size: minimum font size
        max_font_size: maximum font size

    Returns:
    -------
        the synthesized page
    """
    # Draw template
    h, w = page["dimensions"]
    response = Image.new("RGB", (w, h), color=(255, 255, 255))

    for block in page["blocks"]:
        lines = block["lines"]
        if lines:
            # The check only depends on the block, so run it once per block
            # rather than once per line
            _warn_rotation(block)  # pragma: no cover

        if len(lines) > 1:
            # If lines are provided use these to get better rendering results
            entries = lines
        else:
            # Otherwise, draw each word
            entries = [word for line in lines for word in line["words"]]

        for entry in entries:
            response = _synthesize(
                response=response,
                entry=entry,
                w=w,
                h=h,
                draw_proba=draw_proba,
                font_family=font_family,
                smoothing_factor=smoothing_factor,
                min_font_size=min_font_size,
                max_font_size=max_font_size,
            )

    return np.array(response, dtype=np.uint8)


def synthesize_kie_page(
Expand All @@ -81,46 +184,29 @@
----
page: exported Page object to represent
draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
font_size: size of the font, default font = 13
font_family: family of the font
smoothing_factor: factor to smooth the font size
min_font_size: minimum font size
max_font_size: maximum font size

Returns:
-------
the synthesized page
"""
# Draw template
h, w = page["dimensions"]
response = 255 * np.ones((h, w, 3), dtype=np.int32)
response = Image.new("RGB", (w, h), color=(255, 255, 255))

# Draw each word
for predictions in page["predictions"].values():
for prediction in predictions:
# Get aboslute word geometry
(xmin, ymin), (xmax, ymax) = prediction["geometry"]
xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
ymin, ymax = int(round(h * ymin)), int(round(h * ymax))

# White drawing context adapted to font size, 0.75 factor to convert pts --> pix
font = get_font(font_family, int(0.75 * (ymax - ymin)))
img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
d = ImageDraw.Draw(img)
# Draw in black the value of the word
try:
d.text((0, 0), prediction["value"], font=font, fill=(0, 0, 0))
except UnicodeEncodeError:
# When character cannot be encoded, use its anyascii version
d.text((0, 0), anyascii(prediction["value"]), font=font, fill=(0, 0, 0))

# Colorize if draw_proba
if draw_proba:
p = int(255 * prediction["confidence"])
mask = np.where(np.array(img) == 0, 1, 0)
proba: np.ndarray = np.array([255 - p, 0, p])
color = mask * proba[np.newaxis, np.newaxis, :]
white_mask = 255 * (1 - mask)
img = color + white_mask

# Write to response page
response[ymin:ymax, xmin:xmax, :] = np.array(img)

return response
_warn_rotation(prediction) # pragma: no cover
response = _synthesize(
response=response,
entry=prediction,
w=w,
h=h,
draw_proba=draw_proba,
font_family=font_family,
)
return np.array(response, dtype=np.uint8)
42 changes: 37 additions & 5 deletions tests/common/test_utils_reconstitution.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,44 @@
import numpy as np
from test_io_elements import _mock_pages
from test_io_elements import _mock_kie_pages, _mock_pages

from doctr.utils import reconstitution


def test_synthesize_page():
    """Check that `synthesize_page` returns an (H, W, 3) array for several page layouts."""
    page = _mock_pages()[0]

    def _check_render(exported, draw_proba):
        # Every rendering must be a numpy image matching the page dimensions
        rendering = reconstitution.synthesize_page(exported, draw_proba=draw_proba)
        assert isinstance(rendering, np.ndarray)
        assert rendering.shape == (*page.dimensions, 3)

    # Test without probability rendering
    _check_render(page.export(), False)

    # Test with probability rendering
    _check_render(page.export(), True)

    # Test with only one line
    single_line_page = page.export()
    first_line = single_line_page["blocks"][0]["lines"][0]
    single_line_page["blocks"][0]["lines"] = [first_line]
    _check_render(single_line_page, True)

    # Test with polygons
    polygon_page = page.export()
    polygon_page["blocks"][0]["lines"][0]["geometry"] = [(0, 0), (0, 1), (1, 1), (1, 0)]
    _check_render(polygon_page, True)


def test_synthesize_kie_page():
    """Check that `synthesize_kie_page` returns an (H, W, 3) array with and without probabilities."""
    kie_page = _mock_kie_pages()[0]

    # First without, then with probability rendering
    for with_proba in (False, True):
        rendering = reconstitution.synthesize_kie_page(kie_page.export(), draw_proba=with_proba)
        assert isinstance(rendering, np.ndarray)
        assert rendering.shape == (*kie_page.dimensions, 3)
Loading