From 81b9fd7bdb480ccffe33b62f54ffe14730c63cb3 Mon Sep 17 00:00:00 2001 From: myhloli Date: Tue, 15 Oct 2024 22:32:43 +0800 Subject: [PATCH] refactor(para_split_v3): refine list block detection in paragraph splitting - Update list block detection logic to require at least 2 numeric start lines - Ensure the number of numeric start lines matches the number of end lines - Remove detection of non-border starting lines for simplicity --- magic_pdf/para/para_split_v3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magic_pdf/para/para_split_v3.py b/magic_pdf/para/para_split_v3.py index 7a26b2d8..058b0343 100644 --- a/magic_pdf/para/para_split_v3.py +++ b/magic_pdf/para/para_split_v3.py @@ -166,7 +166,7 @@ def __is_list_or_index_block(block): line[ListLineTag.IS_LIST_END_LINE] = True line_start_flag = True # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致 - elif num_start_count == flag_end_count: # 简单一点先不考虑左侧不贴边的情况 + elif num_start_count >= 2 and num_start_count == flag_end_count: # 简单一点先不考虑左侧不贴边的情况 for i, line in enumerate(block['lines']): if lines_text_list[i][0].isdigit(): line[ListLineTag.IS_LIST_START_LINE] = True