From 149132d60861f95ac2334564de63c98de20687e9 Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 1 Nov 2024 15:21:16 +0800 Subject: [PATCH] feat(pdf_parse): improve span filtering and add new block types - Refactor remove_outside_spans function to filter spans more accurately - Add image_footnote, index, and list block types to output file documentation - Update draw_span_bbox to use preproc_blocks instead of para_blocks - Bump version to 0.9.0 --- docs/output_file_en_us.md | 3 ++ docs/output_file_zh_cn.md | 13 +++--- magic_pdf/libs/draw_bbox.py | 3 +- magic_pdf/libs/version.py | 2 +- magic_pdf/pdf_parse_union_core_v2.py | 64 +++++++++++++++------------- 5 files changed, 49 insertions(+), 36 deletions(-) diff --git a/docs/output_file_en_us.md b/docs/output_file_en_us.md index 1fe9a127..995fdb2a 100644 --- a/docs/output_file_en_us.md +++ b/docs/output_file_en_us.md @@ -175,11 +175,14 @@ Detailed explanation of second-level block types | :----------------- | :--------------------- | | image_body | Main body of the image | | image_caption | Image description text | +| image_footnote | Image footnote | | table_body | Main body of the table | | table_caption | Table description text | | table_footnote | Table footnote | | text | Text block | | title | Title block | +| index | Index block | +| list | List block | | interline_equation | Block formula |
diff --git a/docs/output_file_zh_cn.md b/docs/output_file_zh_cn.md index dd921da9..899b539f 100644 --- a/docs/output_file_zh_cn.md +++ b/docs/output_file_zh_cn.md @@ -174,12 +174,15 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、 | :----------------- | :------------- | | image_body | 图像的本体 | | image_caption | 图像的描述文本 | -| table_body | 表格本体 | +| image_footnote | 图像的脚注 | +| table_body | 表格本体 | | table_caption | 表格的描述文本 | -| table_footnote | 表格的脚注 | -| text | 文本块 | -| title | 标题块 | -| interline_equation | 行间公式块 | +| table_footnote | 表格的脚注 | +| text | 文本块 | +| title | 标题块 | +| index | 目录块 | +| list | 列表块 | +| interline_equation | 行间公式块 |
diff --git a/magic_pdf/libs/draw_bbox.py b/magic_pdf/libs/draw_bbox.py index 9703e131..e0c769f3 100644 --- a/magic_pdf/libs/draw_bbox.py +++ b/magic_pdf/libs/draw_bbox.py @@ -249,7 +249,8 @@ def get_span_info(span): page_dropped_list.append(span['bbox']) dropped_list.append(page_dropped_list) # 构造其余useful_list - for block in page['para_blocks']: + # for block in page['para_blocks']: # span直接用分段合并前的结果就可以 + for block in page['preproc_blocks']: if block['type'] in [ BlockType.Text, BlockType.Title, diff --git a/magic_pdf/libs/version.py b/magic_pdf/libs/version.py index 777f190d..3e2f46a3 100644 --- a/magic_pdf/libs/version.py +++ b/magic_pdf/libs/version.py @@ -1 +1 @@ -__version__ = "0.8.0" +__version__ = "0.9.0" diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index f7fa272c..4a3dc55b 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -382,39 +382,44 @@ def revert_group_blocks(blocks): return new_blocks -def remove_outside_spans(spans, all_bboxes): - image_bboxes = [] - table_bboxes = [] - other_block_bboxes = [] - for block in all_bboxes: - block_type = block[7] - block_bbox = block[0:4] - - if block_type == BlockType.ImageBody: - image_bboxes.append(block_bbox) - elif block_type == BlockType.TableBody: - table_bboxes.append(block_bbox) - else: - other_block_bboxes.append(block_bbox) +def remove_outside_spans(spans, all_bboxes, all_discarded_blocks): + def get_block_bboxes(blocks, block_type_list): + return [block[0:4] for block in blocks if block[7] in block_type_list] + + image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody]) + table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody]) + other_block_type = [] + for block_type in BlockType.__dict__.values(): + if not isinstance(block_type, str): + continue + if block_type not in [BlockType.ImageBody, BlockType.TableBody]: + other_block_type.append(block_type) + other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type) + discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.Discarded]) new_spans = [] for span in spans: - if span['type'] == ContentType.Image: - for block_bbox in image_bboxes: - if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5: - new_spans.append(span) - break - elif span['type'] == ContentType.Table: - for block_bbox in table_bboxes: - if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5: - new_spans.append(span) - break + span_bbox = span['bbox'] + span_type = span['type'] + + if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in + discarded_block_bboxes): + new_spans.append(span) + continue + + if span_type == ContentType.Image: + if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in + image_bboxes): + new_spans.append(span) + elif span_type == ContentType.Table: + if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in + table_bboxes): + new_spans.append(span) else: - for block_bbox in other_block_bboxes: - if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5: - new_spans.append(span) - break + if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in + other_block_bboxes): + new_spans.append(span) return new_spans @@ -488,7 +493,8 @@ def parse_page_core( raise Exception('parse_mode must be txt or ocr') """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span""" - spans = remove_outside_spans(spans, all_bboxes) + """顺便删除大水印并保留abandon的span""" + spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks) """删除重叠spans中置信度较低的那些""" spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)