From df14c61f6fd95328bcd75a470205d0459941e73a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?=
Date: Wed, 19 Jun 2024 16:03:21 +0800
Subject: [PATCH] update: Enhance the capability to detect garbled document issues

---
 magic_pdf/filter/pdf_classify_by_type.py | 10 +++-
 magic_pdf/filter/pdf_meta_scan.py        | 48 +++++++++++-----
 magic_pdf/libs/pdf_check.py              | 59 +++++++++++++++++++
 magic_pdf/pipe/AbsPipe.py                |  1 +
 magic_pdf/user_api.py                    | 73 ++++++++++++------------
 requirements.txt                         |  3 +-
 6 files changed, 140 insertions(+), 54 deletions(-)
 create mode 100644 magic_pdf/libs/pdf_check.py

diff --git a/magic_pdf/filter/pdf_classify_by_type.py b/magic_pdf/filter/pdf_classify_by_type.py
index d6f3eee5..319fb3fb 100644
--- a/magic_pdf/filter/pdf_classify_by_type.py
+++ b/magic_pdf/filter/pdf_classify_by_type.py
@@ -305,7 +305,7 @@ def is_narrow_strip(img):
 
 
 def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
-             text_layout_list: list):
+             text_layout_list: list, invalid_chars: bool):
     """
     Image and page dimensions here are in pts
     :param total_page:
@@ -322,7 +322,8 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         'by_avg_words': classify_by_avg_words(text_len_list),
         'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
         'by_text_layout': classify_by_text_layout(text_layout_list),
-        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list)
+        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
+        'by_invalid_chars': invalid_chars,
     }
 
     if all(results.values()):
@@ -331,7 +332,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         return False, results
     else:
         logger.warning(
-            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
+            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
+            f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
+            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
+            f" by_invalid_chars: {results['by_invalid_chars']}",
             file=sys.stderr)  # this case helps quickly spot unusual pdfs so the classification algorithm can be tuned for them
         return False, results
diff --git a/magic_pdf/filter/pdf_meta_scan.py b/magic_pdf/filter/pdf_meta_scan.py
index 423a4071..89d44878 100644
--- a/magic_pdf/filter/pdf_meta_scan.py
+++ b/magic_pdf/filter/pdf_meta_scan.py
@@ -12,12 +12,13 @@
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.language import detect_lang
+from magic_pdf.libs.pdf_check import detect_invalid_chars
 
 scan_max_page = 50
 junk_limit_min = 10
 
 
-def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
+def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
     max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
                                result]
     page_area = int(page_width_pts) * int(page_height_pts)
@@ -25,14 +26,15 @@ def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_p
     max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
     return max_image_area_per_page
 
+
 def process_image(page, junk_img_bojids=[]):
-    page_result = []# stores the 4-tuples of the images on this page
+    page_result = []  # stores the 4-tuples of the images on this page
     items = page.get_images()
     dedup = set()
     for img in items:
         # what is returned here is the size at which the image is actually displayed on the page; it is an array, and the first part of each element is
-        img_bojid = img[0]# globally unique within the pdf file; an image that appears repeatedly is probably junk, e.g. a watermark or header/footer
-        if img_bojid in junk_img_bojids:# skip junk images
+        img_bojid = img[0]  # globally unique within the pdf file; an image that appears repeatedly is probably junk, e.g. a watermark or header/footer
+        if img_bojid in junk_img_bojids:  # skip junk images
             continue
         recs = page.get_image_rects(img, transform=True)
         if recs:
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
             dedup.add((x0, y0, x1, y1, img_bojid))
             page_result.append([x0, y0, x1, y1, img_bojid])
     return page_result
+
+
 def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
     """
     Return the 4-tuples of the images on each page; a page may contain multiple images.
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
     img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
 
     # find the img_bojids that occur on more than half of len(doc) pages
-    junk_limit = max(len(doc)*0.5, junk_limit_min)# exempt documents with very few pages
+    junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exempt documents with very few pages
 
     junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
 
@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
             result.append(page_result)
     for item in result:
         if not any(item):  # if any page has no images, this is a text-based pdf; check whether it is a special text-based one
-            if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:# a special text-based pdf: clear the junk list and break
+            if max(imgs_len_list) == min(imgs_len_list) and max(
+                    imgs_len_list) >= junk_limit_min:  # a special text-based pdf: clear the junk list and break
                 junk_img_bojids = []
-            else:# an ordinary text-based pdf that contains junk images: keep the junk list
+            else:  # an ordinary text-based pdf that contains junk images: keep the junk list
                 pass
             break_loop = True
             break
@@ -94,16 +99,16 @@
 
         # check whether the first 80% of the elements are all equal
         if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
-        # # if the first 10 pages all contain images, decide from the per-page image counts whether the junk list should be cleared
-        # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
+            # # if the first 10 pages all contain images, decide from the per-page image counts whether the junk list should be cleared
+            # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
             # the first 10 pages all have images with identical counts; check the image-to-page area ratio to decide whether to clear the junk list
             max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
 
             if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # not all of the first 10 pages are large images, so this is probably a text-based pdf; clear the junk image list
                 junk_img_bojids = []
-            else:# the first 10 pages all have many images of equal count and 80% are large, so this is a scanned pdf (type 1); keep the junk list
+            else:  # the first 10 pages all have many images of equal count and 80% are large, so this is a scanned pdf (type 1); keep the junk list
                 pass
-        else:# per-page image counts differ, so clear the junk list and scan all images in the first 50 pages
+        else:  # per-page image counts differ, so clear the junk list and scan all images in the first 50 pages
             junk_img_bojids = []
 
     # now actually collect the image info for the first 50 pages
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
 
     median_width = page_width_list[len(page_width_list) // 2]
     median_height = page_height_list[len(page_height_list) // 2]
-
     return median_width, median_height
 
 
@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
 
     return text_len_lst
 
+
 def get_pdf_text_layout_per_page(doc: fitz.Document):
     """
     For each page of the PDF document, determine whether its text layout is horizontal, vertical, or unknown.
many"): self.message = message super().__init__(self.message) + + def get_svgs_per_page(doc: fitz.Document): svgs_len_list = [] for page_id, page in enumerate(doc): @@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document): # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}") return svgs_len_list + def get_imgs_per_page(doc: fitz.Document): imgs_len_list = [] for page_id, page in enumerate(doc): @@ -287,6 +298,13 @@ def get_language(doc: fitz.Document): return language +def check_invalid_chars(pdf_bytes): + """ + 乱码检测 + """ + return detect_invalid_chars(pdf_bytes) + + def pdf_meta_scan(pdf_bytes: bytes): """ :param s3_pdf_path: @@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes): # logger.info(f"text_layout_per_page: {text_layout_per_page}") text_language = get_language(doc) # logger.info(f"text_language: {text_language}") - + invalid_chars = check_invalid_chars(pdf_bytes) + # logger.info(f"invalid_chars: {invalid_chars}") # 最后输出一条json res = { @@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes): # "svgs_per_page": svgs_per_page, "imgs_per_page": imgs_per_page, # 增加每页img数量list "junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list + "invalid_chars": invalid_chars, "metadata": doc.metadata } # logger.info(json.dumps(res, ensure_ascii=False)) @@ -365,4 +385,4 @@ def main(s3_pdf_path: str, s3_profile: str): # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","") # doc = fitz.open("pdf", file_content) # text_layout_lst = get_pdf_text_layout_per_page(doc) - # print(text_layout_lst) \ No newline at end of file + # print(text_layout_lst) diff --git a/magic_pdf/libs/pdf_check.py b/magic_pdf/libs/pdf_check.py new file mode 100644 index 00000000..0db71a34 --- /dev/null +++ b/magic_pdf/libs/pdf_check.py @@ -0,0 +1,59 @@ +from io import BytesIO +import re +import fitz +import numpy as np +from loguru import logger +from pdfminer.high_level import extract_text + + +def calculate_sample_count(total_page: int, sample_ratio=0.1): + """ + 根据总页数和采样率计算采样页面的数量。 + """ + select_page_cnt = int(total_page * sample_ratio) + if select_page_cnt < 5: + select_page_cnt = min(10, total_page) + elif select_page_cnt > 10: + select_page_cnt = 10 + return select_page_cnt + + +def extract_pages(src_pdf_bytes: bytes): + pdf_docs = fitz.open("pdf", src_pdf_bytes) + total_page = len(pdf_docs) + if total_page == 0: + # 如果PDF没有页面,直接返回空文档 + logger.warning("PDF is empty, return empty document") + return fitz.Document() + select_page_cnt = calculate_sample_count(total_page) + + page_num = np.random.choice(total_page, select_page_cnt, replace=False) + sample_docs = fitz.Document() + try: + for index in page_num: + sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index)) + except Exception as e: + logger.exception(e) + return sample_docs + + +def detect_invalid_chars(src_pdf_bytes: bytes) -> bool: + """" + 检测PDF中是否包含非法字符 + """ + '''需要使用''' + sample_docs = extract_pages(src_pdf_bytes) + sample_pdf_bytes = sample_docs.tobytes() + sample_pdf_file_like_object = BytesIO(sample_pdf_bytes) + text = extract_text(sample_pdf_file_like_object) + # logger.info(text) + '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)''' + cid_pattern = re.compile(r'\(cid:\d+\)') + matches = cid_pattern.findall(text) + cid_count = len(matches) + text_len = len(text) + logger.info(f"cid_count: {cid_count}, text_len: {text_len}") + if cid_count > 10: + return False # 乱码文档 + else: + return True # 正常文档 diff --git a/magic_pdf/pipe/AbsPipe.py b/magic_pdf/pipe/AbsPipe.py index 4329b5cf..5f584f58 100644 --- 
a/magic_pdf/pipe/AbsPipe.py +++ b/magic_pdf/pipe/AbsPipe.py @@ -83,6 +83,7 @@ def classify(pdf_bytes: bytes) -> str: pdf_meta["text_len_per_page"], pdf_meta["imgs_per_page"], pdf_meta["text_layout_per_page"], + pdf_meta["invalid_chars"], ) if is_text_pdf: return AbsPipe.PIP_TXT diff --git a/magic_pdf/user_api.py b/magic_pdf/user_api.py index b988daaa..7f51a5ee 100644 --- a/magic_pdf/user_api.py +++ b/magic_pdf/user_api.py @@ -86,45 +86,46 @@ def parse_pdf(method): return None pdf_info_dict = parse_pdf(parse_pdf_by_txt) - text_all = "" - for page_dict in pdf_info_dict['pdf_info']: - for para_block in page_dict['para_blocks']: - if para_block['type'] in ['title', 'text']: - for line in para_block['lines']: - for span in line['spans']: - text_all += span['content'] - - def calculate_not_common_character_rate(text): - garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]') - # 计算乱码字符的数量 - garbage_count = len(garbage_regex.findall(text)) - total = len(text) - if total == 0: - return 0 # 避免除以零的错误 - return garbage_count / total - - def calculate_not_printable_rate(text): - printable_text = "" - for c in text: - if c.isprintable(): - printable_text += c - printable_total = len(printable_text) - total = len(text) - if total == 0: - return 0 # 避免除以零的错误 - return (total - printable_total) / total - - not_common_character_rate = calculate_not_common_character_rate(text_all) - not_printable_rate = calculate_not_printable_rate(text_all) - pdf_info_dict["_not_common_character_rate"] = not_common_character_rate - pdf_info_dict["_not_printable_rate"] = not_printable_rate - logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}") + # text_all = "" + # for page_dict in pdf_info_dict['pdf_info']: + # for para_block in page_dict['para_blocks']: + # if para_block['type'] in ['title', 'text']: + # for line in para_block['lines']: + # for span in line['spans']: + # text_all += span['content'] + + # def calculate_not_common_character_rate(text): + # garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]') + # # 计算乱码字符的数量 + # garbage_count = len(garbage_regex.findall(text)) + # total = len(text) + # if total == 0: + # return 0 # 避免除以零的错误 + # return garbage_count / total + # + # def calculate_not_printable_rate(text): + # printable_text = "" + # for c in text: + # if c.isprintable(): + # printable_text += c + # printable_total = len(printable_text) + # total = len(text) + # if total == 0: + # return 0 # 避免除以零的错误 + # return (total - printable_total) / total + # + # not_common_character_rate = calculate_not_common_character_rate(text_all) + # not_printable_rate = calculate_not_printable_rate(text_all) + # pdf_info_dict["_not_common_character_rate"] = not_common_character_rate + # pdf_info_dict["_not_printable_rate"] = not_printable_rate + # logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}") + '''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理''' # not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好 if (pdf_info_dict is None - or pdf_info_dict.get("_need_drop", False) - or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02 + or pdf_info_dict.get("_need_drop", False) + # or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02 ): - logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr") + logger.warning(f"parse_pdf_by_txt 
+        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
             pdf_models = doc_analyze(pdf_bytes, ocr=True)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
diff --git a/requirements.txt b/requirements.txt
index f194ed10..d05b93af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,5 @@ wordninja>=2.0.0
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1
-paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
\ No newline at end of file
+paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
+pdfminer.six>=20231228
\ No newline at end of file
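
Usage note (not part of the patch): a minimal sketch of how the new garbled-text check can be exercised, assuming the module paths introduced above; "sample.pdf" is a placeholder input file.

    from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
    from magic_pdf.libs.pdf_check import detect_invalid_chars

    with open("sample.pdf", "rb") as f:  # placeholder path
        pdf_bytes = f.read()

    # True means the sampled pages extract cleanly with pdfminer;
    # False means many (cid:xxx) markers were found, i.e. the text layer is garbled.
    print(detect_invalid_chars(pdf_bytes))

    # The same flag is stored in the meta-scan result as "invalid_chars" and is
    # passed through AbsPipe.classify() into the text/OCR classification above.
    meta = pdf_meta_scan(pdf_bytes)
    print(meta["invalid_chars"])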