feat(pdf_parse): improve span filtering and add new block types

- Refactor remove_outside_spans function to filter spans more accurately
- Add image_footnote, index, and list block types to output file documentation
- Update draw_span_bbox to use preproc_blocks instead of para_blocks
- Bump version to 0.9.0
opendatalab · Nov 1, 2024 · 149132d · 149132d
1 parent ad0d06b
commit 149132d
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 36 deletions.
diff --git a/docs/output_file_en_us.md b/docs/output_file_en_us.md
@@ -175,11 +175,14 @@ Detailed explanation of second-level block types
 | :----------------- | :--------------------- |
 | image_body         | Main body of the image |
 | image_caption      | Image description text |
+| image_footnote     | Image footnote         |
 | table_body         | Main body of the table |
 | table_caption      | Table description text |
 | table_footnote     | Table footnote         |
 | text               | Text block             |
 | title              | Title block            |
+| index              | Index block            |
+| list               | List block             |
 | interline_equation | Block formula          |
 
 <br>

diff --git a/docs/output_file_zh_cn.md b/docs/output_file_zh_cn.md
@@ -174,12 +174,15 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、
 | :----------------- | :------------- |
 | image_body         | 图像的本体     |
 | image_caption      | 图像的描述文本 |
-| table_body         | 表格本体       |
+| image_footnote     | 图像的脚注   |
+| table_body         | 表格本体    |
 | table_caption      | 表格的描述文本 |
-| table_footnote     | 表格的脚注     |
-| text               | 文本块         |
-| title              | 标题块         |
-| interline_equation | 行间公式块     |
+| table_footnote     | 表格的脚注   |
+| text               | 文本块     |
+| title              | 标题块     |
+| index              | 目录块     |
+| list               | 列表块     |
+| interline_equation | 行间公式块   |
 
 <br>
 

diff --git a/magic_pdf/libs/draw_bbox.py b/magic_pdf/libs/draw_bbox.py
@@ -249,7 +249,8 @@ def get_span_info(span):
                         page_dropped_list.append(span['bbox'])
         dropped_list.append(page_dropped_list)
         # 构造其余useful_list
-        for block in page['para_blocks']:
+        # for block in page['para_blocks']:  # span直接用分段合并前的结果就可以
+        for block in page['preproc_blocks']:
             if block['type'] in [
                 BlockType.Text,
                 BlockType.Title,

diff --git a/magic_pdf/libs/version.py b/magic_pdf/libs/version.py
@@ -1 +1 @@
-__version__ = "0.8.0"
+__version__ = "0.9.0"
diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py
@@ -382,39 +382,44 @@ def revert_group_blocks(blocks):
     return new_blocks
 
 
-def remove_outside_spans(spans, all_bboxes):
-    image_bboxes = []
-    table_bboxes = []
-    other_block_bboxes = []
-    for block in all_bboxes:
-        block_type = block[7]
-        block_bbox = block[0:4]
-
-        if block_type == BlockType.ImageBody:
-            image_bboxes.append(block_bbox)
-        elif block_type == BlockType.TableBody:
-            table_bboxes.append(block_bbox)
-        else:
-            other_block_bboxes.append(block_bbox)
+def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
+    def get_block_bboxes(blocks, block_type_list):
+        return [block[0:4] for block in blocks if block[7] in block_type_list]
+
+    image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody])
+    table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody])
+    other_block_type = []
+    for block_type in BlockType.__dict__.values():
+        if not isinstance(block_type, str):
+            continue
+        if block_type not in [BlockType.ImageBody, BlockType.TableBody]:
+            other_block_type.append(block_type)
+    other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
+    discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.Discarded])
 
     new_spans = []
 
     for span in spans:
-        if span['type'] == ContentType.Image:
-            for block_bbox in image_bboxes:
-                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
-                    new_spans.append(span)
-                    break
-        elif span['type'] == ContentType.Table:
-            for block_bbox in table_bboxes:
-                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
-                    new_spans.append(span)
-                    break
+        span_bbox = span['bbox']
+        span_type = span['type']
+
+        if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
+               discarded_block_bboxes):
+            new_spans.append(span)
+            continue
+
+        if span_type == ContentType.Image:
+            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
+                   image_bboxes):
+                new_spans.append(span)
+        elif span_type == ContentType.Table:
+            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
+                   table_bboxes):
+                new_spans.append(span)
         else:
-            for block_bbox in other_block_bboxes:
-                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
-                    new_spans.append(span)
-                    break
+            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
+                   other_block_bboxes):
+                new_spans.append(span)
 
     return new_spans
 
@@ -488,7 +493,8 @@ def parse_page_core(
         raise Exception('parse_mode must be txt or ocr')
 
     """在删除重复span之前，应该通过image_body和table_body的block过滤一下image和table的span"""
-    spans = remove_outside_spans(spans, all_bboxes)
+    """顺便删除大水印并保留abandon的span"""
+    spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
 
     """删除重叠spans中置信度较低的那些"""
     spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)