From 011a1b973ba487d2caf28da982ae6eff2507aa1c Mon Sep 17 00:00:00 2001 From: myhloli Date: Thu, 17 Oct 2024 14:42:09 +0800 Subject: [PATCH] refactor(ocr):Increase the dilation factor in OCR to address the issue of word concatenation. - Remove unused functions such as split_long_words, ocr_mk_mm_markdown_with_para, etc. - Simplify ocr_mk_markdown_with_para_core_v2 by removing unnecessary language detection and word splitting logic- Remove wordninja dependency from requirements - Update ocr_model_init to include additional parameters for OCR model configuration --- magic_pdf/dict2md/ocr_mkcontent.py | 196 +---------------------------- magic_pdf/model/pdf_extract_kit.py | 6 +- requirements-docker.txt | 1 - requirements.txt | 1 - 4 files changed, 5 insertions(+), 199 deletions(-) diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index f438f6e6..9d57b312 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -1,6 +1,5 @@ import re -import wordninja from loguru import logger from magic_pdf.libs.commons import join_path @@ -25,37 +24,6 @@ def __is_hyphen_at_line_end(line): return bool(re.search(r'[A-Za-z]+-\s*$', line)) -def split_long_words(text): - segments = text.split(' ') - for i in range(len(segments)): - words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE) - for j in range(len(words)): - if len(words[j]) > 10: - words[j] = ' '.join(wordninja.split(words[j])) - segments[i] = ''.join(words) - return ' '.join(segments) - - -def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path): - markdown = [] - for page_info in pdf_info_list: - paras_of_layout = page_info.get('para_blocks') - page_markdown = ocr_mk_markdown_with_para_core_v2( - paras_of_layout, 'mm', img_buket_path) - markdown.extend(page_markdown) - return '\n\n'.join(markdown) - - -def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list): - markdown = [] - for page_info in pdf_info_dict: - paras_of_layout = page_info.get('para_blocks') - page_markdown = ocr_mk_markdown_with_para_core_v2( - paras_of_layout, 'nlp') - markdown.extend(page_markdown) - return '\n\n'.join(markdown) - - def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path): markdown_with_para_and_pagination = [] @@ -76,45 +44,6 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, return markdown_with_para_and_pagination -def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''): - page_markdown = [] - for paras in paras_of_layout: - for para in paras: - para_text = '' - for line in para: - for span in line['spans']: - span_type = span.get('type') - content = '' - language = '' - if span_type == ContentType.Text: - content = span['content'] - language = detect_lang(content) - if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本 - content = ocr_escape_special_markdown_char( - split_long_words(content)) - else: - content = ocr_escape_special_markdown_char(content) - elif span_type == ContentType.InlineEquation: - content = f"${span['content']}$" - elif span_type == ContentType.InterlineEquation: - content = f"\n$$\n{span['content']}\n$$\n" - elif span_type in [ContentType.Image, ContentType.Table]: - if mode == 'mm': - content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n" - elif mode == 'nlp': - pass - if content != '': - if language == 'en': # 英文语境下 content间需要空格分隔 - para_text += content + ' ' - else: # 中文语境下,content间不需要空格分隔 - para_text += content - if para_text.strip() == '': - continue - else: - page_markdown.append(para_text.strip() + ' ') - return page_markdown - - def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path='', @@ -207,21 +136,11 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None): if line_text != '': line_lang = detect_lang(line_text) for span in line['spans']: + span_type = span['type'] content = '' if span_type == ContentType.Text: - content = span['content'] - # language = detect_lang(content) - language = detect_language(content) - # 判断是否小语种 - if lang is not None and lang != 'en': - content = ocr_escape_special_markdown_char(content) - else: # 非小语种逻辑 - if language == 'en' and parse_type == 'ocr': # 只对英文长词进行分词处理,中文分词会丢失文本 - content = ocr_escape_special_markdown_char( - split_long_words(content)) - else: - content = ocr_escape_special_markdown_char(content) + content = ocr_escape_special_markdown_char(span['content']) elif span_type == ContentType.InlineEquation: content = f" ${span['content']}$ " elif span_type == ContentType.InterlineEquation: @@ -242,41 +161,6 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None): return para_text -def para_to_standard_format(para, img_buket_path): - para_content = {} - if len(para) == 1: - para_content = line_to_standard_format(para[0], img_buket_path) - elif len(para) > 1: - para_text = '' - inline_equation_num = 0 - for line in para: - for span in line['spans']: - language = '' - span_type = span.get('type') - content = '' - if span_type == ContentType.Text: - content = span['content'] - language = detect_lang(content) - if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 - content = ocr_escape_special_markdown_char( - split_long_words(content)) - else: - content = ocr_escape_special_markdown_char(content) - elif span_type == ContentType.InlineEquation: - content = f"${span['content']}$" - inline_equation_num += 1 - if language == 'en': # 英文语境下 content间需要空格分隔 - para_text += content + ' ' - else: # 中文语境下,content间不需要空格分隔 - para_text += content - para_content = { - 'type': 'text', - 'text': para_text, - 'inline_equation_num': inline_equation_num, - } - return para_content - - def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None): para_type = para_block['type'] para_content = {} @@ -330,82 +214,6 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type= return para_content -def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str): - content_list = [] - for page_info in pdf_info_dict: - paras_of_layout = page_info.get('para_blocks') - if not paras_of_layout: - continue - for para_block in paras_of_layout: - para_content = para_to_standard_format_v2(para_block, - img_buket_path) - content_list.append(para_content) - return content_list - - -def line_to_standard_format(line, img_buket_path): - line_text = '' - inline_equation_num = 0 - for span in line['spans']: - if not span.get('content'): - if not span.get('image_path'): - continue - else: - if span['type'] == ContentType.Image: - content = { - 'type': 'image', - 'img_path': join_path(img_buket_path, - span['image_path']), - } - return content - elif span['type'] == ContentType.Table: - content = { - 'type': 'table', - 'img_path': join_path(img_buket_path, - span['image_path']), - } - return content - else: - if span['type'] == ContentType.InterlineEquation: - interline_equation = span['content'] - content = { - 'type': 'equation', - 'latex': f'$$\n{interline_equation}\n$$' - } - return content - elif span['type'] == ContentType.InlineEquation: - inline_equation = span['content'] - line_text += f'${inline_equation}$' - inline_equation_num += 1 - elif span['type'] == ContentType.Text: - text_content = ocr_escape_special_markdown_char( - span['content']) # 转义特殊符号 - line_text += text_content - content = { - 'type': 'text', - 'text': line_text, - 'inline_equation_num': inline_equation_num, - } - return content - - -def ocr_mk_mm_standard_format(pdf_info_dict: list): - """content_list type string - image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string - latex文本字段。 text string 纯文本格式的文本数据。 md string - markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg.""" - content_list = [] - for page_info in pdf_info_dict: - blocks = page_info.get('preproc_blocks') - if not blocks: - continue - for block in blocks: - for line in block['lines']: - content = line_to_standard_format(line) - content_list.append(content) - return content_list - - def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, diff --git a/magic_pdf/model/pdf_extract_kit.py b/magic_pdf/model/pdf_extract_kit.py index bca9b987..1e391104 100644 --- a/magic_pdf/model/pdf_extract_kit.py +++ b/magic_pdf/model/pdf_extract_kit.py @@ -77,11 +77,11 @@ def layout_model_init(weight, config_file, device): return model -def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None): +def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=2.4): if lang is not None: - model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang) + model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio) else: - model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh) + model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio) return model diff --git a/requirements-docker.txt b/requirements-docker.txt index 74b1a70b..0804ec9f 100644 --- a/requirements-docker.txt +++ b/requirements-docker.txt @@ -5,7 +5,6 @@ PyMuPDF>=1.24.9 loguru>=0.6.0 numpy>=1.21.6,<2.0.0 fast-langdetect==0.2.0 -wordninja>=2.0.0 scikit-learn>=1.0.2 pdfminer.six==20231228 unimernet==0.2.1 diff --git a/requirements.txt b/requirements.txt index d0bd653e..eced1426 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,6 @@ pdfminer.six==20231228 pydantic>=2.7.2,<2.8.0 PyMuPDF>=1.24.9 scikit-learn>=1.0.2 -wordninja>=2.0.0 torch>=2.2.2,<=2.3.1 transformers # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.