Skip to content

Commit

Permalink
Merge pull request #765 from myhloli/add-list-group
Browse files: browse the repository at this point in the history
refactor(para): improve paragraph splitting algorithm
  • Loading branch information
myhloli authored Oct 21, 2024
2 parents fe21eeb + 8cc76c4 commit e4904cd
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 34 deletions.
49 changes: 23 additions & 26 deletions magic_pdf/dict2md/ocr_mkcontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
paras_of_layout, 'mm', img_buket_path)
markdown_with_para_and_pagination.append({
'page_no':
page_no,
page_no,
'md_content':
'\n\n'.join(page_markdown)
'\n\n'.join(page_markdown)
})
page_no += 1
return markdown_with_para_and_pagination
Expand All @@ -47,19 +47,17 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
mode,
img_buket_path='',
parse_type="auto",
lang=None
):
page_markdown = []
for para_block in paras_of_layout:
para_text = ''
para_type = para_block['type']
if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
para_text = merge_para_with_text(para_block, parse_type=parse_type, lang=lang)
para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Title:
para_text = f'# {merge_para_with_text(para_block, parse_type=parse_type, lang=lang)}'
para_text = f'# {merge_para_with_text(para_block)}'
elif para_type == BlockType.InterlineEquation:
para_text = merge_para_with_text(para_block, parse_type=parse_type, lang=lang)
para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Image:
if mode == 'nlp':
continue
Expand All @@ -72,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang)
para_text += merge_para_with_text(block)
for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageFootnote:
para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang)
para_text += merge_para_with_text(block)
elif para_type == BlockType.Table:
if mode == 'nlp':
continue
elif mode == 'mm':
for block in para_block['blocks']: # 1st.拼table_caption
if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang)
para_text += merge_para_with_text(block)
for block in para_block['blocks']: # 2nd.拼table_body
if block['type'] == BlockType.TableBody:
for line in block['lines']:
Expand All @@ -97,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block, parse_type=parse_type, lang=lang)
para_text += merge_para_with_text(block)

if para_text.strip() == '':
continue
Expand All @@ -120,7 +118,7 @@ def detect_language(text):
return 'empty'


def merge_para_with_text(para_block, parse_type="auto", lang=None):
def merge_para_with_text(para_block):
para_text = ''
for i, line in enumerate(para_block['lines']):

Expand Down Expand Up @@ -161,24 +159,24 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
return para_text


def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None):
def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
para_type = para_block['type']
para_content = {}
if para_type == BlockType.Text:
para_content = {
'type': 'text',
'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
'text': merge_para_with_text(para_block),
}
elif para_type == BlockType.Title:
para_content = {
'type': 'text',
'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
'text': merge_para_with_text(para_block),
'text_level': 1,
}
elif para_type == BlockType.InterlineEquation:
para_content = {
'type': 'equation',
'text': merge_para_with_text(para_block, parse_type=parse_type, lang=lang),
'text': merge_para_with_text(para_block),
'text_format': 'latex',
}
elif para_type == BlockType.Image:
Expand All @@ -189,9 +187,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
img_buket_path,
block['lines'][0]['spans'][0]['image_path'])
if block['type'] == BlockType.ImageCaption:
para_content['img_caption'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
para_content['img_caption'] = merge_para_with_text(block)
if block['type'] == BlockType.ImageFootnote:
para_content['img_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
para_content['img_footnote'] = merge_para_with_text(block)
elif para_type == BlockType.Table:
para_content = {'type': 'table'}
for block in para_block['blocks']:
Expand All @@ -202,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
if block['type'] == BlockType.TableCaption:
para_content['table_caption'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
para_content['table_caption'] = merge_para_with_text(block)
if block['type'] == BlockType.TableFootnote:
para_content['table_footnote'] = merge_para_with_text(block, parse_type=parse_type, lang=lang)
para_content['table_footnote'] = merge_para_with_text(block)

para_content['page_idx'] = page_idx

Expand All @@ -218,8 +216,7 @@ def union_make(pdf_info_dict: list,
make_mode: str,
drop_mode: str,
img_buket_path: str = '',
parse_type: str = "auto",
lang=None):
):
output_content = []
for page_info in pdf_info_dict:
drop_reason_flag = False
Expand All @@ -246,20 +243,20 @@ def union_make(pdf_info_dict: list,
continue
if make_mode == MakeMode.MM_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path, parse_type=parse_type, lang=lang)
paras_of_layout, 'mm', img_buket_path)
output_content.extend(page_markdown)
elif make_mode == MakeMode.NLP_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'nlp', parse_type=parse_type, lang=lang)
paras_of_layout, 'nlp')
output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout:
if drop_reason_flag:
para_content = para_to_standard_format_v2(
para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang, drop_reason=drop_reason)
para_block, img_buket_path, page_idx)
else:
para_content = para_to_standard_format_v2(
para_block, img_buket_path, page_idx, parse_type=parse_type, lang=lang)
para_block, img_buket_path, page_idx)
output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content)
Expand Down
17 changes: 15 additions & 2 deletions magic_pdf/para/para_split_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __is_list_or_index_block(block):
# index block 是一种特殊的list block
# 一个block如果是index block 应该同时满足以下特征
# 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字
if len(block['lines']) >= 3:
if len(block['lines']) >= 2:
first_line = block['lines'][0]
line_height = first_line['bbox'][3] - first_line['bbox'][1]
block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
Expand Down Expand Up @@ -227,6 +227,15 @@ def __merge_2_list_blocks(block1, block2):
return block1, block2


def __is_list_group(text_blocks_group):
    """Report whether every block in the group is short enough to be a list item.

    A group is treated as a list group when none of its blocks has more
    than three lines. A second rule -- that the blocks' left edges lie
    close to one another -- is deliberately not applied yet, to keep the
    logic simple.

    Args:
        text_blocks_group: iterable of blocks, each indexable by 'lines'.

    Returns:
        True when every block has at most three lines (this is also the
        result for an empty group), otherwise False.
    """
    return all(len(item['lines']) <= 3 for item in text_blocks_group)


def __para_merge_page(blocks):
page_text_blocks_groups = __process_blocks(blocks)
for text_blocks_group in page_text_blocks_groups:
Expand All @@ -239,6 +248,10 @@ def __para_merge_page(blocks):
# logger.info(f"{block['type']}:{block}")

if len(text_blocks_group) > 1:

# 在合并前判断这个group 是否是一个 list group
is_list_group = __is_list_group(text_blocks_group)

# 倒序遍历
for i in range(len(text_blocks_group) - 1, -1, -1):
current_block = text_blocks_group[i]
Expand All @@ -247,7 +260,7 @@ def __para_merge_page(blocks):
if i - 1 >= 0:
prev_block = text_blocks_group[i - 1]

if current_block['type'] == 'text' and prev_block['type'] == 'text':
if current_block['type'] == 'text' and prev_block['type'] == 'text' and not is_list_group:
__merge_2_text_blocks(current_block, prev_block)
elif (
(current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List) or
Expand Down
8 changes: 2 additions & 6 deletions magic_pdf/pipe/AbsPipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,7 @@ def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=D
"""
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"]
parse_type = pdf_mid_data["_parse_type"]
lang = pdf_mid_data.get("_lang", None)
content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path, parse_type, lang)
content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
return content_list

@staticmethod
Expand All @@ -107,9 +105,7 @@ def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=Dro
"""
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"]
parse_type = pdf_mid_data["_parse_type"]
lang = pdf_mid_data.get("_lang", None)
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path, parse_type, lang)
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
return md_content


0 comments on commit e4904cd

Please sign in to comment.