From 377b09cf8cbc3bb8006c94d99c169459d56d9f70 Mon Sep 17 00:00:00 2001 From: myhloli Date: Mon, 28 Oct 2024 18:11:35 +0800 Subject: [PATCH] refactor(table): disable StructEqTable support and add TableMaster support - Remove import and usage of StructTableModel - Add support for TableMaster model - Update table model initialization logic to support TableMaster - Log error and exit if StructEqTable is selected, as it's under upgrade - Update README files to reflect changes in table parsing capabilities --- README.md | 2 +- README_zh-CN.md | 2 +- magic_pdf/model/pdf_extract_kit.py | 11 ++++++++--- .../pek_sub_modules/structeqtable/StructTableModel.py | 9 ++++++++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index cea8f060..400841b0 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts. - Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios. - Refactored the list and table of contents recognition functions, significantly improving the accuracy of list blocks and table of contents blocks, as well as the parsing of corresponding text paragraphs. - - Refactored the matching logic for figures, tables, and descriptive text, greatly enhancing the accuracy of matching captions and footnotes to figures and tables, and reducing the loss rate of descriptive text to zero. + - Refactored the matching logic for figures, tables, and descriptive text, greatly enhancing the accuracy of matching captions and footnotes to figures and tables, and reducing the loss rate of descriptive text to near zero. 
- Added multi-language support for OCR, supporting detection and recognition of 84 languages.For the list of supported languages, see [OCR Language Support List](https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations). - Added memory recycling logic and other memory optimization measures, significantly reducing memory usage. The memory requirement for enabling all acceleration features except table acceleration (layout/formula/OCR) has been reduced from 16GB to 8GB, and the memory requirement for enabling all acceleration features has been reduced from 24GB to 10GB. - Optimized configuration file feature switches, adding an independent formula detection switch to significantly improve speed and parsing results when formula detection is not needed. diff --git a/README_zh-CN.md b/README_zh-CN.md index bf6e802b..89eb2127 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -46,7 +46,7 @@ - 重构排序模块代码,使用 [layoutreader](https://github.com/ppaanngggg/layoutreader) 进行阅读顺序排序,确保在各种排版下都能实现极高准确率 - 重构段落拼接模块,在跨栏、跨页、跨图、跨表情况下均能实现良好的段落拼接效果 - 重构列表和目录识别功能,极大提升列表块和目录块识别的准确率及对应文本段落的解析效果 - - 重构图、表与描述性文本的匹配逻辑,大幅提升 caption 和 footnote 与图表的匹配准确率,并将描述性文本的丢失率降至零 + - 重构图、表与描述性文本的匹配逻辑,大幅提升 caption 和 footnote 与图表的匹配准确率,并将描述性文本的丢失率降至接近0 - 增加 OCR 的多语言支持,支持 84 种语言的检测与识别,语言支持列表详见 [OCR 语言支持列表](https://paddlepaddle.github.io/PaddleOCR/latest/ppocr/blog/multi_languages.html#5) - 增加显存回收逻辑及其他显存优化措施,大幅降低显存使用需求。开启除表格加速外的全部加速功能(layout/公式/OCR)的显存需求从16GB降至8GB,开启全部加速功能的显存需求从24GB降至10GB - 优化配置文件的功能开关,增加独立的公式检测开关,无需公式检测时可大幅提升速度和解析效果 diff --git a/magic_pdf/model/pdf_extract_kit.py b/magic_pdf/model/pdf_extract_kit.py index f1478b10..ec67e3ed 100644 --- a/magic_pdf/model/pdf_extract_kit.py +++ b/magic_pdf/model/pdf_extract_kit.py @@ -38,19 +38,24 @@ from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace from 
magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR -from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel +# from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel from magic_pdf.model.ppTableModel import ppTableModel def table_model_init(table_model_type, model_path, max_time, _device_='cpu'): if table_model_type == MODEL_NAME.STRUCT_EQTABLE: - table_model = StructTableModel(model_path, max_time=max_time, device=_device_) - else: + # table_model = StructTableModel(model_path, max_time=max_time, device=_device_) + logger.error("StructEqTable is under upgrade, the current version does not support it.") + exit(1) + elif table_model_type == MODEL_NAME.TABLE_MASTER: config = { "model_dir": model_path, "device": _device_ } table_model = ppTableModel(config) + else: + logger.error("table model type not allow") + exit(1) return table_model diff --git a/magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py b/magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py index 2d1ce584..0d8a5971 100644 --- a/magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +++ b/magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py @@ -1,5 +1,12 @@ -from struct_eqtable.model import StructTable +from loguru import logger + +try: + from struct_eqtable.model import StructTable +except ImportError: + logger.error("StructEqTable is under upgrade, the current version does not support it.") from pypandoc import convert_text + + class StructTableModel: def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'): # init