diff --git a/doctr/io/elements.py b/doctr/io/elements.py
index b27ecb35eb..2b4d0b0581 100644
--- a/doctr/io/elements.py
+++ b/doctr/io/elements.py
@@ -310,6 +310,10 @@ def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **
     def synthesize(self, **kwargs) -> np.ndarray:
         """Synthesize the page from the predictions
 
+        Args:
+        ----
+            **kwargs: keyword arguments passed to the `synthesize_page` function
+
         Returns
         -------
             synthesized page
@@ -493,7 +497,7 @@ def synthesize(self, **kwargs) -> np.ndarray:
 
         Args:
         ----
-            **kwargs: keyword arguments passed to the matplotlib.pyplot.show method
+            **kwargs: keyword arguments passed to the `synthesize_kie_page` function
 
         Returns:
         -------
@@ -603,11 +607,15 @@ def show(self, **kwargs) -> None:
     def synthesize(self, **kwargs) -> List[np.ndarray]:
         """Synthesize all pages from their predictions
 
+        Args:
+        ----
+            **kwargs: keyword arguments passed to the `Page.synthesize` method
+
         Returns
         -------
             list of synthesized pages
         """
-        return [page.synthesize() for page in self.pages]
+        return [page.synthesize(**kwargs) for page in self.pages]
 
     def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
         """Export the document as XML (hOCR-format)
diff --git a/doctr/utils/reconstitution.py b/doctr/utils/reconstitution.py
index 82ae20cdd0..a229e9ddbc 100644
--- a/doctr/utils/reconstitution.py
+++ b/doctr/utils/reconstitution.py
@@ -2,6 +2,7 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+import logging
 from typing import Any, Dict, Optional
 
 import numpy as np
@@ -13,10 +14,120 @@
 __all__ = ["synthesize_page", "synthesize_kie_page"]
 
 
+# Global variable to avoid multiple warnings
+ROTATION_WARNING = False
+
+
+def _warn_rotation(entry: Dict[str, Any]) -> None:  # pragma: no cover
+    """Log a single warning the first time an entry with a 4-point (polygon) geometry is seen"""
+    global ROTATION_WARNING
+    if not ROTATION_WARNING and len(entry["geometry"]) == 4:
+        logging.warning("Polygons with larger rotations will lead to inaccurate rendering")
+        ROTATION_WARNING = True
+
+
+def _synthesize(
+    response: Image.Image,
+    entry: Dict[str, Any],
+    w: int,
+    h: int,
+    draw_proba: bool = False,
+    font_family: Optional[str] = None,
+    smoothing_factor: float = 0.75,
+    min_font_size: int = 6,
+    max_font_size: int = 50,
+) -> Image.Image:
+    """Draw a single entry (word, line or KIE prediction) on the page image
+
+    Args:
+    ----
+        response: page image to draw on (modified in place)
+        entry: exported element with a relative "geometry" (2 corners or a 4-point polygon)
+            and either a "value" or a list of "words"
+        w: width of the page in pixels
+        h: height of the page in pixels
+        draw_proba: if True, draw the bounding box and the confidence of the entry. Blue: p=1, red: p=0
+        font_family: family of the font
+        smoothing_factor: factor applied to the font size at each step until the text fits its box
+        min_font_size: font size below which the text is no longer shrunk
+        max_font_size: maximum font size
+
+    Returns:
+    -------
+        the page image with the entry drawn on it
+    """
+    # Calculate the bounding box of the entry: min / max handle 2-point boxes and polygons alike
+    x_coords, y_coords = zip(*entry["geometry"])
+    xmin, ymin, xmax, ymax = (
+        int(round(w * min(x_coords))),
+        int(round(h * min(y_coords))),
+        int(round(w * max(x_coords))),
+        int(round(h * max(y_coords))),
+    )
+    word_width = xmax - xmin
+    word_height = ymax - ymin
+
+    # If lines are provided instead of words, concatenate the word entries
+    if "words" in entry:
+        word_text = " ".join(word["value"] for word in entry["words"])
+    else:
+        word_text = entry["value"]
+    # Find the optimal font size
+    try:
+        font_size = min(word_height, max_font_size)
+        font = get_font(font_family, font_size)
+        text_width, text_height = font.getbbox(word_text)[2:4]
+
+        while (text_width > word_width or text_height > word_height) and font_size > min_font_size:
+            font_size = max(int(font_size * smoothing_factor), min_font_size)
+            font = get_font(font_family, font_size)
+            text_width, text_height = font.getbbox(word_text)[2:4]
+    except ValueError:
+        font = get_font(font_family, min_font_size)
+
+    # Draw the word text
+    d = ImageDraw.Draw(response)
+    try:
+        try:
+            d.text((xmin, ymin), word_text, font=font, fill=(0, 0, 0), anchor="lt")
+        except UnicodeEncodeError:
+            d.text((xmin, ymin), anyascii(word_text), font=font, fill=(0, 0, 0), anchor="lt")
+    # Catch generic exceptions to avoid crashing the whole rendering
+    except Exception:  # pragma: no cover
+        logging.warning("Could not render word: %s", word_text)
+
+    if draw_proba:
+        if "confidence" in entry:
+            confidence = entry["confidence"]
+        else:
+            # Lines have no confidence of their own: average the one of their words (0 when empty)
+            word_probs = [word["confidence"] for word in entry["words"]]
+            confidence = sum(word_probs) / len(word_probs) if word_probs else 0.0
+        p = int(255 * confidence)
+        color = (255 - p, 0, p)  # Red to blue gradient based on probability
+        d.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=2)
+
+        prob_font = get_font(font_family, 20)
+        prob_text = f"{confidence:.2f}"
+        prob_text_width, prob_text_height = prob_font.getbbox(prob_text)[2:4]
+
+        # Position the probability slightly above the bounding box
+        prob_x_offset = (word_width - prob_text_width) // 2
+        prob_y_offset = ymin - prob_text_height - 2
+        prob_y_offset = max(0, prob_y_offset)
+
+        d.text((xmin + prob_x_offset, prob_y_offset), prob_text, font=prob_font, fill=color, anchor="lt")
+
+    return response
+
+
 def synthesize_page(
     page: Dict[str, Any],
     draw_proba: bool = False,
     font_family: Optional[str] = None,
+    smoothing_factor: float = 0.95,
+    min_font_size: int = 8,
+    max_font_size: int = 50,
 ) -> np.ndarray:
-    """Draw a the content of the element page (OCR response) on a blank page.
+    """Draw the content of the element page (OCR response) on a blank page.
 
@@ -24,8 +135,10 @@ def synthesize_page(
     ----
         page: exported Page object to represent
-        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
-        font_size: size of the font, default font = 13
+        draw_proba: if True, draw the bounding box and the confidence of each entry. Blue: p=1, red: p=0
         font_family: family of the font
+        smoothing_factor: factor applied to the font size at each step until the text fits its box
+        min_font_size: font size below which the text is no longer shrunk
+        max_font_size: maximum font size
 
     Returns:
     -------
@@ -33,41 +146,27 @@ def synthesize_page(
     """
     # Draw template
     h, w = page["dimensions"]
-    response = 255 * np.ones((h, w, 3), dtype=np.int32)
+    response = Image.new("RGB", (w, h), color=(255, 255, 255))
 
-    # Draw each word
     for block in page["blocks"]:
-        for line in block["lines"]:
-            for word in line["words"]:
-                # Get absolute word geometry
-                (xmin, ymin), (xmax, ymax) = word["geometry"]
-                xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
-                ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
-
-                # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
-                font = get_font(font_family, int(0.75 * (ymax - ymin)))
-                img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
-                d = ImageDraw.Draw(img)
-                # Draw in black the value of the word
-                try:
-                    d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
-                except UnicodeEncodeError:
-                    # When character cannot be encoded, use its anyascii version
-                    d.text((0, 0), anyascii(word["value"]), font=font, fill=(0, 0, 0))
-
-                # Colorize if draw_proba
-                if draw_proba:
-                    p = int(255 * word["confidence"])
-                    mask = np.where(np.array(img) == 0, 1, 0)
-                    proba: np.ndarray = np.array([255 - p, 0, p])
-                    color = mask * proba[np.newaxis, np.newaxis, :]
-                    white_mask = 255 * (1 - mask)
-                    img = color + white_mask
-
-                # Write to response page
-                response[ymin:ymax, xmin:xmax, :] = np.array(img)
-
-    return response
+        lines = block["lines"]
+        # If several lines are provided use these to get better rendering results, otherwise draw each word
+        entries = lines if len(lines) > 1 else [word for line in lines for word in line["words"]]
+        for entry in entries:
+            _warn_rotation(entry)  # pragma: no cover
+            response = _synthesize(
+                response=response,
+                entry=entry,
+                w=w,
+                h=h,
+                draw_proba=draw_proba,
+                font_family=font_family,
+                smoothing_factor=smoothing_factor,
+                min_font_size=min_font_size,
+                max_font_size=max_font_size,
+            )
+
+    return np.array(response, dtype=np.uint8)
 
 
 def synthesize_kie_page(
@@ -81,8 +180,7 @@ def synthesize_kie_page(
     ----
         page: exported Page object to represent
-        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
-        font_size: size of the font, default font = 13
+        draw_proba: if True, draw the bounding box and the confidence of each prediction. Blue: p=1, red: p=0
         font_family: family of the font
 
     Returns:
     -------
@@ -90,37 +188,20 @@ def synthesize_kie_page(
     """
     # Draw template
     h, w = page["dimensions"]
-    response = 255 * np.ones((h, w, 3), dtype=np.int32)
+    response = Image.new("RGB", (w, h), color=(255, 255, 255))
 
     # Draw each word
     for predictions in page["predictions"].values():
         for prediction in predictions:
-            # Get aboslute word geometry
-            (xmin, ymin), (xmax, ymax) = prediction["geometry"]
-            xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
-            ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
-
-            # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
-            font = get_font(font_family, int(0.75 * (ymax - ymin)))
-            img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
-            d = ImageDraw.Draw(img)
-            # Draw in black the value of the word
-            try:
-                d.text((0, 0), prediction["value"], font=font, fill=(0, 0, 0))
-            except UnicodeEncodeError:
-                # When character cannot be encoded, use its anyascii version
-                d.text((0, 0), anyascii(prediction["value"]), font=font, fill=(0, 0, 0))
-
-            # Colorize if draw_proba
-            if draw_proba:
-                p = int(255 * prediction["confidence"])
-                mask = np.where(np.array(img) == 0, 1, 0)
-                proba: np.ndarray = np.array([255 - p, 0, p])
-                color = mask * proba[np.newaxis, np.newaxis, :]
-                white_mask = 255 * (1 - mask)
-                img = color + white_mask
-
-            # Write to response page
-            response[ymin:ymax, xmin:xmax, :] = np.array(img)
-
-    return response
+            _warn_rotation(prediction)  # pragma: no cover
+            # No font sizing option is forwarded here: `_synthesize` falls back to its own defaults
+            response = _synthesize(
+                response=response,
+                entry=prediction,
+                w=w,
+                h=h,
+                draw_proba=draw_proba,
+                font_family=font_family,
+            )
+
+    return np.array(response, dtype=np.uint8)
diff --git a/tests/common/test_utils_reconstitution.py b/tests/common/test_utils_reconstitution.py
index 3b70e67070..be98db89b2 100644
--- a/tests/common/test_utils_reconstitution.py
+++ b/tests/common/test_utils_reconstitution.py
@@ -1,12 +1,44 @@
 import numpy as np
-from test_io_elements import _mock_pages
+from test_io_elements import _mock_kie_pages, _mock_pages
 
 from doctr.utils import reconstitution
 
 
 def test_synthesize_page():
     pages = _mock_pages()
-    reconstitution.synthesize_page(pages[0].export(), draw_proba=False)
-    render = reconstitution.synthesize_page(pages[0].export(), draw_proba=True)
-    assert isinstance(render, np.ndarray)
-    assert render.shape == (*pages[0].dimensions, 3)
+    # Test without probability rendering
+    render_no_proba = reconstitution.synthesize_page(pages[0].export(), draw_proba=False)
+    assert isinstance(render_no_proba, np.ndarray)
+    assert render_no_proba.shape == (*pages[0].dimensions, 3)
+
+    # Test with probability rendering
+    render_with_proba = reconstitution.synthesize_page(pages[0].export(), draw_proba=True)
+    assert isinstance(render_with_proba, np.ndarray)
+    assert render_with_proba.shape == (*pages[0].dimensions, 3)
+
+    # Test with only one line
+    pages_one_line = pages[0].export()
+    pages_one_line["blocks"][0]["lines"] = [pages_one_line["blocks"][0]["lines"][0]]
+    render_one_line = reconstitution.synthesize_page(pages_one_line, draw_proba=True)
+    assert isinstance(render_one_line, np.ndarray)
+    assert render_one_line.shape == (*pages[0].dimensions, 3)
+
+    # Test with polygons
+    pages_poly = pages[0].export()
+    pages_poly["blocks"][0]["lines"][0]["geometry"] = [(0, 0), (0, 1), (1, 1), (1, 0)]
+    render_poly = reconstitution.synthesize_page(pages_poly, draw_proba=True)
+    assert isinstance(render_poly, np.ndarray)
+    assert render_poly.shape == (*pages[0].dimensions, 3)
+
+
+def test_synthesize_kie_page():
+    pages = _mock_kie_pages()
+    # Test without probability rendering
+    render_no_proba = reconstitution.synthesize_kie_page(pages[0].export(), draw_proba=False)
+    assert isinstance(render_no_proba, np.ndarray)
+    assert render_no_proba.shape == (*pages[0].dimensions, 3)
+
+    # Test with probability rendering
+    render_with_proba = reconstitution.synthesize_kie_page(pages[0].export(), draw_proba=True)
+    assert isinstance(render_with_proba, np.ndarray)
+    assert render_with_proba.shape == (*pages[0].dimensions, 3)