Filter after merge and skip text that's already in the target language
thatDudo committed Nov 19, 2023
1 parent 5cec578 commit 45001cc
Showing 7 changed files with 49 additions and 39 deletions.
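At a high level, this commit moves region filtering from the OCR step into `_run_textline_merge` (so filtering happens after textlines are merged into regions) and skips regions whose detected language already matches the target language, unless the new `--no-text-lang-skip` flag is passed. A minimal, self-contained sketch of the idea; the function name, defaults, and plain-string "regions" are illustrative, the real code operates on `TextBlock` regions (see manga_translator.py and textblock.py below):

```python
import langcodes
import py3langid as langid

def filter_merged_texts(texts, target_lang, no_text_lang_skip=False, min_text_length=0):
    """Simplified stand-in for the new post-merge filtering in _run_textline_merge."""
    kept = []
    for text in texts:
        text = text.strip()
        if not text or len(text) < min_text_length:
            continue  # nothing worth translating
        source_lang = langid.classify(text)[0]  # e.g. 'ja', 'en'
        # tag_distance == 0 means the source and target tags denote the same language
        if not no_text_lang_skip and langcodes.tag_distance(source_lang, target_lang) == 0:
            print(f'Filtered out (already {target_lang}): {text}')
            continue
        kept.append(text)
    return kept

# The English line should be skipped, the Japanese line kept:
print(filter_merged_texts(['こんにちは世界', 'Hello world'], target_lang='en'))
```

Passing `no_text_lang_skip=True` (the CLI's `--no-text-lang-skip`) keeps such text in the pipeline.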
README.md: 7 changes (4 additions & 3 deletions)
@@ -398,7 +398,7 @@ THA: Thai
--detector {default,ctd,craft,none} Text detector used for creating a text mask from an
image, DO NOT use craft for manga, it's not designed
for it
--ocr {48px,32px,48px_ctc} Optical character recognition (OCR) model to use
--ocr {32px,48px,48px_ctc} Optical character recognition (OCR) model to use
--inpainter {default,lama_large,lama_mpe,sd,none,original}
Inpainting model to use
--upscaler {waifu2x,esrgan,4xultrasharp} Upscaler to use. --upscale-ratio has to be set for it
@@ -431,10 +431,11 @@ THA: Thai
--box-threshold BOX_THRESHOLD Threshold for bbox generation
--text-threshold TEXT_THRESHOLD Threshold for text detection
--min-text-length MIN_TEXT_LENGTH Minimum text length of a text region
--no-text-lang-skip Don't skip text that is seemingly already in the target
language.
--inpainting-size INPAINTING_SIZE Size of image used for inpainting (too large will
result in OOM)
--inpainting-precision INPAINTING_PRECISION Inpainting precision for lama,
use bf16 while you can.
--inpainting-precision {fp32,fp16,bf16} Inpainting precision for lama, use bf16 while you can.
--colorization-size COLORIZATION_SIZE Size of image used for colorization. Set to -1 to use
full image size
--denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
README_CN.md: 7 changes (4 additions & 3 deletions)
@@ -130,7 +130,7 @@ THA: Thai
--detector {default,ctd,craft,none} Text detector used for creating a text mask from an
image, DO NOT use craft for manga, it's not designed
for it
--ocr {48px,32px,48px_ctc} Optical character recognition (OCR) model to use
--ocr {32px,48px,48px_ctc} Optical character recognition (OCR) model to use
--inpainter {default,lama_large,lama_mpe,sd,none,original}
Inpainting model to use
--upscaler {waifu2x,esrgan,4xultrasharp} Upscaler to use. --upscale-ratio has to be set for it
@@ -163,10 +163,11 @@ THA: Thai
--box-threshold BOX_THRESHOLD Threshold for bbox generation
--text-threshold TEXT_THRESHOLD Threshold for text detection
--min-text-length MIN_TEXT_LENGTH Minimum text length of a text region
--no-text-lang-skip Don't skip text that is seemingly already in the target
language.
--inpainting-size INPAINTING_SIZE Size of image used for inpainting (too large will
result in OOM)
--inpainting-precision INPAINTING_PRECISION Inpainting precision for lama,
use bf16 while you can.
--inpainting-precision {fp32,fp16,bf16} Inpainting precision for lama, use bf16 while you can.
--colorization-size COLORIZATION_SIZE Size of image used for colorization. Set to -1 to use
full image size
--denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
manga_translator/args.py: 1 change (1 addition & 0 deletions)
@@ -125,6 +125,7 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
parser.add_argument('--box-threshold', default=0.7, type=float, help='Threshold for bbox generation')
parser.add_argument('--text-threshold', default=0.5, type=float, help='Threshold for text detection')
parser.add_argument('--min-text-length', default=0, type=int, help='Minimum text length of a text region')
parser.add_argument('--no-text-lang-skip', action='store_true', help='Don\'t skip text that is seemingly already in the target language.')
parser.add_argument('--inpainting-size', default=2048, type=int, help='Size of image used for inpainting (too large will result in OOM)')
parser.add_argument('--inpainting-precision', default='fp32', type=str, help='Inpainting precision for lama, use bf16 while you can.', choices=['fp32', 'fp16', 'bf16'])
parser.add_argument('--colorization-size', default=576, type=int, help='Size of image used for colorization. Set to -1 to use full image size')
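Since the new option is a `store_true` flag, skipping already-target-language text is the default and the flag turns it off. A small self-contained parse check mirroring the two definitions above (the sample argv values are hypothetical):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--no-text-lang-skip', action='store_true',
                    help="Don't skip text that is seemingly already in the target language.")
parser.add_argument('--inpainting-precision', default='fp32', type=str,
                    choices=['fp32', 'fp16', 'bf16'])

args = parser.parse_args([])                                # defaults
print(args.no_text_lang_skip, args.inpainting_precision)    # False fp32
args = parser.parse_args(['--no-text-lang-skip', '--inpainting-precision', 'bf16'])
print(args.no_text_lang_skip, args.inpainting_precision)    # True bf16
```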
manga_translator/manga_translator.py: 48 changes (25 additions & 23 deletions)
@@ -5,7 +5,7 @@
import cv2
from aiohttp.web_middlewares import middleware
from omegaconf import OmegaConf
import py3langid as langid
import langcodes
import requests
import os
import re
@@ -469,15 +469,9 @@ async def _run_detection(self, ctx: Context):
async def _run_ocr(self, ctx: Context):
textlines = await dispatch_ocr(ctx.ocr, ctx.img_rgb, ctx.textlines, ctx, self.device, self.verbose)

# Filter out regions by original text
new_textlines = []
for textline in textlines:
text = textline.text
if (ctx.filter_text and re.search(ctx.filter_text, text)) \
or not is_valuable_text(text):
if text.strip():
logger.info(f'Filtered out: {text}')
else:
if textline.text.strip():
if ctx.font_color_fg:
textline.fg_r, textline.fg_g, textline.fg_b = ctx.font_color_fg
if ctx.font_color_bg:
@@ -488,23 +482,31 @@ async def _run_ocr(self, ctx: Context):
async def _run_textline_merge(self, ctx: Context):
text_regions = await dispatch_textline_merge(ctx.textlines, ctx.img_rgb.shape[1], ctx.img_rgb.shape[0],
verbose=self.verbose)
text_regions = [region for region in text_regions if len(''.join(region.text)) >= ctx.min_text_length]

new_text_regions = []
for region in text_regions:
if ctx.font_color_fg or ctx.font_color_bg:
if ctx.font_color_bg:
region.adjust_bg_color = False
if len(region.text) >= ctx.min_text_length \
and not is_valuable_text(region.text) \
or (not ctx.no_text_lang_skip and langcodes.tag_distance(region.source_lang, ctx.target_lang) == 0):
if region.text.strip():
logger.info(f'Filtered out: {region.text}')
else:
if ctx.font_color_fg or ctx.font_color_bg:
if ctx.font_color_bg:
region.adjust_bg_color = False
new_text_regions.append(region)
text_regions = new_text_regions

# Sort ctd (comic text detector) regions left to right. Otherwise right to left.
# Sorting will improve text translation quality.
text_regions = sort_regions(text_regions, right_to_left=True if ctx.detector != 'ctd' else False)
return text_regions

async def _run_text_translation(self, ctx: Context):
translated_sentences = await dispatch_translation(ctx.translator,
[region.get_text() for region in ctx.text_regions],
ctx.use_mtpe,
ctx, 'cpu' if self._cuda_limited_memory else self.device)
translated_sentences = \
await dispatch_translation(ctx.translator,
[region.text for region in ctx.text_regions],
ctx.use_mtpe,
ctx, 'cpu' if self._cuda_limited_memory else self.device)

for region, translation in zip(ctx.text_regions, translated_sentences):
if ctx.uppercase:
@@ -521,8 +523,8 @@ async def _run_text_translation(self, ctx: Context):
for region in ctx.text_regions:
# TODO: Maybe print reasons for filtering
if not ctx.translator == 'none' and (region.translation.isnumeric() \
or ctx.filter_text and re.search(ctx.filter_text, region.translation)
or not ctx.translator == 'original' and region.get_text().lower().strip() == region.translation.lower().strip()):
or ctx.filter_text and re.search(ctx.filter_text, region.translation)
or not ctx.translator == 'original' and region.text.lower().strip() == region.translation.lower().strip()):
if region.translation.strip():
logger.info(f'Filtered out: {region.translation}')
else:
@@ -618,7 +620,7 @@ def identify_colors(fg_rgb: List[int]):

s += f'\n-- {i + 1} --\n'
s += f'color: #{color_id}: {color_name} (fg, bg: {rgb2hex(*fore)} {rgb2hex(*back)})\n'
s += f'text: {region.get_text()}\n'
s += f'text: {region.text}\n'
s += f'trans: {region.translation}\n'
for line in region.lines:
s += f'coords: {list(line.ravel())}\n'
@@ -743,7 +745,7 @@ async def _run_text_translation(self, ctx: Context):
requests.post(f'http://{self.host}:{self.port}/request-manual-internal', json={
'task_id': self._task_id,
'nonce': self.nonce,
'texts': [r.get_text() for r in text_regions],
'texts': [r.text for r in text_regions],
'translations': [r.translation for r in text_regions],
}, timeout=20)

@@ -1225,7 +1227,7 @@ def format_translate(self, ctx: Context, return_image: bool):
trans = {key: value[i] for key, value in ctx['translations'].items()}
else:
trans = {}
trans["originalText"] = text_regions[i].get_text()
trans["originalText"] = text_regions[i].text
if inpaint is not None:
overlay = inpaint[minY:maxY, minX:maxX]

@@ -1248,7 +1250,7 @@ def format_translate(self, ctx: Context, return_image: bool):
'fg': color1.tolist(),
'bg': color2.tolist()
},
'language': langid.classify(text_regions[i].get_text())[0],
'language': text_regions[i].source_lang,
'background': background
})
if return_image and ctx.img_colorized is not None:
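Across this file the direct py3langid import gives way to langcodes: language identification now happens lazily on `TextBlock.source_lang` (see textblock.py below), and `_run_textline_merge` compares it against the target with `langcodes.tag_distance`. A quick look at tag_distance semantics, assuming the langcodes package is installed; the exact non-zero scores depend on the library's CLDR-derived data:

```python
import langcodes

# 0 means the two tags denote the same language, which is what the skip check relies on.
print(langcodes.tag_distance('en', 'en'))        # 0 -> region is already in the target language
print(langcodes.tag_distance('en-GB', 'en-US'))  # small but non-zero (regional variants)
print(langcodes.tag_distance('ja', 'en'))        # large: unrelated languages
```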
manga_translator/rendering/__init__.py: 2 changes (1 addition & 1 deletion)
@@ -41,7 +41,7 @@ def resize_regions_to_font_size(img: np.ndarray, text_regions: List[TextBlock],

dst_points_list = []
for region in text_regions:
char_count_orig = len(region.get_text())
char_count_orig = len(region.text)
char_count_trans = len(region.translation.strip())
if char_count_trans > char_count_orig:
# More characters were added, have to reduce fontsize to fit allotted area
manga_translator/utils/generic.py: 2 changes (1 addition & 1 deletion)
@@ -118,7 +118,7 @@ def is_punctuation(ch):

def is_valuable_char(ch):
# return re.search(r'[^\d\W]', ch)
return not is_punctuation(ch) and not is_control(ch) and not is_whitespace(ch) and not ch.isnumeric()
return not is_punctuation(ch) and not is_control(ch) and not is_whitespace(ch) and not ch.isdigit()

def is_valuable_text(text):
for ch in text:
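The `isnumeric()` to `isdigit()` switch in `is_valuable_char` narrows what counts as "just a number": `str.isdigit()` still matches ASCII and fullwidth digits, but unlike `str.isnumeric()` it rejects vulgar fractions and CJK numerals, so characters such as 一 are now treated as valuable text instead of being filtered out. A plain-Python illustration, independent of the project code:

```python
# Python's containment hierarchy: isdecimal ⊆ isdigit ⊆ isnumeric.
for ch in ['7', '２', '½', '一']:   # ASCII digit, fullwidth digit, fraction, kanji "one"
    print(repr(ch), ch.isdigit(), ch.isnumeric())
# '7'  True  True
# '２' True  True
# '½'  False True   <- counted as a number only by isnumeric()
# '一' False True   <- kanji numerals now pass is_valuable_char()
```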
manga_translator/utils/textblock.py: 21 changes (13 additions & 8 deletions)
@@ -5,6 +5,7 @@
from functools import cached_property
import copy
import re
import py3langid as langid

from .generic import color_difference, is_right_to_left_char, is_valuable_char
# from ..detection.ctd_utils.utils.imgproc_utils import union_area, xywh2xyxypoly
@@ -41,7 +42,7 @@ class TextBlock(object):
Object that stores a block of text made up of textlines.
"""
def __init__(self, lines: List,
text: List[str] = None,
texts: List[str] = None,
language: str = 'unknown',
font_size: float = -1,
angle: int = 0,
@@ -60,6 +61,7 @@ def __init__(self, lines: List,
_bounding_rect: List = None,
default_stroke_width = 0.2,
font_weight = 50,
source_lang: str = "",
target_lang: str = "",
opacity: float = 1.,
shadow_radius: float = 0.,
@@ -75,7 +77,8 @@ def __init__(self, lines: List,
self.angle = angle
self._direction = direction

self.text = text if text is not None else []
self.texts = texts if texts is not None else []
self.text = ' '.join(texts)
self.prob = prob

self.translation = translation
@@ -92,6 +95,7 @@ def __init__(self, lines: List,
self.line_spacing = line_spacing
self.letter_spacing = letter_spacing
self._alignment = alignment
self._source_lang = source_lang
self.target_lang = target_lang

self._bounding_rect = _bounding_rect
@@ -235,10 +239,11 @@ def get_transformed_region(self, img: np.ndarray, line_idx: int, textheight: int
region = cv2.resize(region, (maxwidth, h))
return region

def get_text(self):
if isinstance(self.text, str):
return self.text
return ' '.join(self.text).strip()
@property
def source_lang(self):
if not self._source_lang:
self._source_lang = langid.classify(self.text)[0]
return self._source_lang

def get_translation_for_rendering(self):
text = self.translation
@@ -275,7 +280,7 @@ def is_bulleted_list(self):
A determining factor of whether we should be sticking to the strict per textline
text distribution when rendering.
"""
if len(self.text) <= 1:
if len(self.texts) <= 1:
return False

bullet_regexes = [
@@ -284,7 +289,7 @@ def is_bulleted_list(self):
r'[QA]:', # Q: ... A: ...
]
bullet_type_idx = -1
for line_text in self.text:
for line_text in self.texts:
for i, breg in enumerate(bullet_regexes):
if re.search(r'(?:[\n]|^)((?:' + breg + r')[\s]*)', line_text):
if bullet_type_idx >= 0 and bullet_type_idx != i:
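TextBlock now keeps the per-line strings in `texts`, a joined `text` string (replacing `get_text()`), and a lazily detected, cached `source_lang` backed by py3langid. A rough standalone sketch of that pattern; the toy class below is illustrative, not the project's TextBlock:

```python
import py3langid as langid

class Block:
    """Toy version of the texts / text / source_lang bookkeeping added to TextBlock."""
    def __init__(self, texts=None, source_lang=''):
        self.texts = texts if texts is not None else []
        self.text = ' '.join(self.texts)   # joined once; callers read .text instead of get_text()
        self._source_lang = source_lang

    @property
    def source_lang(self):
        # Classify on first access and cache, so langid runs at most once per block.
        if not self._source_lang:
            self._source_lang = langid.classify(self.text)[0]
        return self._source_lang

block = Block(['こんにちは', '世界'])
print(block.text)         # 'こんにちは 世界'
print(block.source_lang)  # likely 'ja' (py3langid's guess)
```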
