Skip to content

Commit

Permalink
feat(pdf_parse): improve span filtering and add new block types
Browse files Browse the repository at this point in the history
- Refactor remove_outside_spans function to filter spans more accurately
- Add image_footnote, index, and list block types to output file documentation
- Update draw_span_bbox to use preproc_blocks instead of para_blocks
- Bump version to 0.9.0
  • Loading branch information
myhloli committed Nov 1, 2024
1 parent ad0d06b commit 149132d
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 36 deletions.
3 changes: 3 additions & 0 deletions docs/output_file_en_us.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,14 @@ Detailed explanation of second-level block types
| :----------------- | :--------------------- |
| image_body | Main body of the image |
| image_caption | Image description text |
| image_footnote | Image footnote |
| table_body | Main body of the table |
| table_caption | Table description text |
| table_footnote | Table footnote |
| text | Text block |
| title | Title block |
| index | Index (table of contents) block |
| list | List block |
| interline_equation | Block formula |

<br>
Expand Down
13 changes: 8 additions & 5 deletions docs/output_file_zh_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,12 +174,15 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、
| :----------------- | :------------- |
| image_body | 图像的本体 |
| image_caption | 图像的描述文本 |
| table_body | 表格本体 |
| image_footnote | 图像的脚注 |
| table_body | 表格本体 |
| table_caption | 表格的描述文本 |
| table_footnote | 表格的脚注 |
| text | 文本块 |
| title | 标题块 |
| interline_equation | 行间公式块 |
| table_footnote | 表格的脚注 |
| text | 文本块 |
| title | 标题块 |
| index | 目录块 |
| list | 列表块 |
| interline_equation | 行间公式块 |

<br>

Expand Down
3 changes: 2 additions & 1 deletion magic_pdf/libs/draw_bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,8 @@ def get_span_info(span):
page_dropped_list.append(span['bbox'])
dropped_list.append(page_dropped_list)
# 构造其余useful_list
for block in page['para_blocks']:
# for block in page['para_blocks']: # span直接用分段合并前的结果就可以
for block in page['preproc_blocks']:
if block['type'] in [
BlockType.Text,
BlockType.Title,
Expand Down
2 changes: 1 addition & 1 deletion magic_pdf/libs/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# Package version string. Two assignments are shown because this is a diff
# view: "0.8.0" is the previous value and "0.9.0" is the new one. If both
# lines are executed, the later assignment ("0.9.0") is the one in effect.
__version__ = "0.8.0"
__version__ = "0.9.0"
64 changes: 35 additions & 29 deletions magic_pdf/pdf_parse_union_core_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,39 +382,44 @@ def revert_group_blocks(blocks):
return new_blocks


def remove_outside_spans(spans, all_bboxes, all_discarded_blocks=None):
    """Filter out spans that do not sit on a layout block of a matching kind.

    A span is kept when the value returned by
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)``
    exceeds a threshold for at least one candidate block:

    * any span, against discarded blocks, with threshold 0.4;
    * an image span, against image-body blocks, with threshold 0.5;
    * a table span, against table-body blocks, with threshold 0.5;
    * every other span, against all remaining block types, with
      threshold 0.5.

    Args:
        spans: Iterable of span dicts; each is read through its ``'bbox'``
            and ``'type'`` keys.
        all_bboxes: Block records indexed so that ``block[0:4]`` is the
            bounding box and ``block[7]`` is the block type.
        all_discarded_blocks: Records with the same indexing as
            ``all_bboxes``; only those typed ``BlockType.Discarded`` are
            used. ``None`` (the default) means there are no discarded
            blocks, which keeps two-argument callers working.

    Returns:
        A new list holding the retained spans in their original order.
    """

    def get_block_bboxes(blocks, block_types):
        # Pull the bbox of every block whose type is one of block_types.
        return [block[0:4] for block in blocks if block[7] in block_types]

    def overlaps_any(span_bbox, block_bboxes, threshold):
        # True as soon as one block overlaps the span beyond the threshold.
        return any(
            calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > threshold
            for block_bbox in block_bboxes
        )

    body_types = (BlockType.ImageBody, BlockType.TableBody)

    # Every string constant declared on BlockType except the two body types.
    # Dunder entries (e.g. ``__module__``) are strings too, but they are not
    # block types, so they are skipped.
    other_block_types = [
        value
        for name, value in vars(BlockType).items()
        if not name.startswith('__')
        and isinstance(value, str)
        and value not in body_types
    ]

    image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody])
    table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody])
    other_block_bboxes = get_block_bboxes(all_bboxes, other_block_types)
    discarded_block_bboxes = get_block_bboxes(
        all_discarded_blocks or [], [BlockType.Discarded]
    )

    new_spans = []

    for span in spans:
        span_bbox = span['bbox']
        span_type = span['type']

        # Spans lying on a discarded block are deliberately retained,
        # whatever their type, and skip the type-specific checks below.
        if overlaps_any(span_bbox, discarded_block_bboxes, 0.4):
            new_spans.append(span)
            continue

        # Choose the candidate blocks that match the span's content type.
        if span_type == ContentType.Image:
            candidate_bboxes = image_bboxes
        elif span_type == ContentType.Table:
            candidate_bboxes = table_bboxes
        else:
            candidate_bboxes = other_block_bboxes

        if overlaps_any(span_bbox, candidate_bboxes, 0.5):
            new_spans.append(span)

    return new_spans

Expand Down Expand Up @@ -488,7 +493,8 @@ def parse_page_core(
raise Exception('parse_mode must be txt or ocr')

"""在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
spans = remove_outside_spans(spans, all_bboxes)
"""顺便删除大水印并保留abandon的span"""
spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)

"""删除重叠spans中置信度较低的那些"""
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
Expand Down

0 comments on commit 149132d

Please sign in to comment.