From 5a11865e523ffb303ab9d404335f929974d144c2 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Wed, 4 Sep 2024 16:05:40 -0400 Subject: [PATCH] Updates for v0.0.14 Various improvements: * "words" output now includes text from inside tables * coherence between vertically neighbored text fragment blocks increased * new parameter for activating a progress bar. --- pymupdf4llm/pymupdf4llm/__init__.py | 2 +- .../pymupdf4llm/helpers/get_text_lines.py | 2 + .../pymupdf4llm/helpers/multi_column.py | 8 +-- pymupdf4llm/pymupdf4llm/helpers/progress.py | 48 +++++++++++++----- .../pymupdf4llm/helpers/pymupdf_rag.py | 49 ++++++++++++++++--- pymupdf4llm/setup.py | 2 +- 6 files changed, 87 insertions(+), 24 deletions(-) diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index d5d34856..75f6156c 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,6 +1,6 @@ from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown -__version__ = "0.0.13" +__version__ = "0.0.14" version = __version__ version_tuple = tuple(map(int, version.split("."))) diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index 6aef431f..fb134b15 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -98,6 +98,8 @@ def sanitize_spans(line): spans = [] # all spans in TextPage here for bno, b in enumerate(blocks): # the numbered blocks for lno, line in enumerate(b["lines"]): # the numbered lines + if abs(1-line["dir"][0]) > 1e-3: # only accept horizontal text + continue for sno, s in enumerate(line["spans"]): # the numered spans sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect mpoint = (sbbox.tl + sbbox.br) / 2 # middle point diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py index b535271d..8580b892 100644 --- 
a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py +++ b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py @@ -152,9 +152,11 @@ def can_extend(temp, bb, bboxlist, vert_bboxes): def join_rects_phase1(bboxes): """Postprocess identified text blocks, phase 1. - Joins any rectangles that "touch" each other. This means that - their intersection is valid (but may be empty). + Joins any rectangles that "touch" each other. + This means that their intersection is valid (but may be empty). + To prefer vertical joins, we will ignore small horizontal gaps. """ + delta=(0,-3,0,3) # allow this gap above and below prects = bboxes[:] new_rects = [] while prects: @@ -163,7 +165,7 @@ def join_rects_phase1(bboxes): while repeat: repeat = False for i in range(len(prects) - 1, 0, -1): - if (prect0 & prects[i]).is_valid: + if ((prect0+delta) & (prects[i]+delta)).is_valid: prect0 |= prects[i] del prects[i] repeat = True diff --git a/pymupdf4llm/pymupdf4llm/helpers/progress.py b/pymupdf4llm/pymupdf4llm/helpers/progress.py index 33c3ff35..db671de6 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/progress.py +++ b/pymupdf4llm/pymupdf4llm/helpers/progress.py @@ -1,3 +1,17 @@ +""" +This script defines a text-based progress bar to allow watching the advancement +of Markdown conversion of document pages. + +Dependencies +------------- +None + +Copyright and License +---------------------- +Copyright 2024 Artifex Software, Inc. 
+License GNU Affero GPL 3.0 +""" + import sys from typing import List, Any @@ -15,11 +29,9 @@ def __init__(self, items: List[Any], progress_width: int = 40): self._increment = self._progress_width / self._len if self._len else 1 # Init progress bar - sys.stdout.write("[%s] (0/%d)" % - (" " * self._progress_width, self._len)) + sys.stdout.write("[%s] (0/%d)" % (" " * self._progress_width, self._len)) sys.stdout.flush() - sys.stdout.write( - "\b" * (self._progress_width + len(str(self._len)) + 6)) + sys.stdout.write("\b" * (self._progress_width + len(str(self._len)) + 6)) def __iter__(self): return self @@ -45,17 +57,29 @@ def __next__(self): # Update the numerical progress padded_index = str(self._current_index).rjust(self._len_digits) progress_info = f" ({padded_index}/{self._len})" - sys.stdout.write( - "\b" * (self._progress_width + len(progress_info) + 1)) + sys.stdout.write("\b" * (self._progress_width + len(progress_info) + 1)) sys.stdout.write("[") - sys.stdout.write("=" * int(self._current_index * - self._progress_width / self._len)) - sys.stdout.write(" " * (self._progress_width - - int(self._current_index * self._progress_width / self._len))) + sys.stdout.write( + "=" * int(self._current_index * self._progress_width / self._len) + ) + sys.stdout.write( + " " + * ( + self._progress_width + - int(self._current_index * self._progress_width / self._len) + ) + ) sys.stdout.write("]" + progress_info) sys.stdout.flush() - sys.stdout.write("\b" * (self._progress_width - int(self._current_index * self._progress_width / self._len) - + len(progress_info) + 1)) + sys.stdout.write( + "\b" + * ( + self._progress_width + - int(self._current_index * self._progress_width / self._len) + + len(progress_info) + + 1 + ) + ) return result diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index d3ca3433..03e12fc6 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ 
b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -15,10 +15,10 @@ Text will be sorted in Western reading order. Any table will be included in the text in markdwn format as well. - + Dependencies ------------- -PyMuPDF v1.24.2 or later +PyMuPDF v1.24.3 or later Copyright and License ---------------------- @@ -247,6 +247,8 @@ def to_markdown( page_height: (float) assumption if page layout is variable. table_strategy: choose table detection strategy graphics_limit: (int) ignore page with too many vector graphics. + ignore_code: (bool) suppress extra formatting for mono-space fonts + extract_words: (bool) include "words"-like output in page chunks show_progress: (bool) print progress as each page is processed. """ @@ -403,6 +405,13 @@ def write_text( key=lambda j: (j[1].y1, j[1].x0), ): out_string += "\n" + tabs[i].to_markdown(clean=False) + "\n" + if EXTRACT_WORDS: # determine raw line rects within this table + line_rects.extend( + [ + pymupdf.Rect(rl[0]) + for rl in get_raw_lines(textpage, clip=tab_rects[i]) + ] + ) del tab_rects[i] # ------------------------------------------------------------ @@ -548,7 +557,7 @@ def intersects_rects(rect, rect_list): return i return 0 - def output_tables(tabs, text_rect, tab_rects): + def output_tables(tabs, text_rect, tab_rects, line_rects, textpage): """Output tables above a text rectangle.""" this_md = "" # markdown string for table content if text_rect is not None: # select tables above the text block @@ -557,6 +566,13 @@ def output_tables(tabs, text_rect, tab_rects): key=lambda j: (j[1].y1, j[1].x0), ): this_md += tabs[i].to_markdown(clean=False) + if EXTRACT_WORDS: # determine raw line rects within this table + line_rects.extend( + [ + pymupdf.Rect(rl[0]) + for rl in get_raw_lines(textpage, clip=tab_rects[i]) + ] + ) del tab_rects[i] # do not touch this table twice else: # output all remaining table @@ -565,6 +581,13 @@ def output_tables(tabs, text_rect, tab_rects): key=lambda j: (j[1].y1, j[1].x0), ): this_md += 
tabs[i].to_markdown(clean=False) + if EXTRACT_WORDS: # determine raw line rects within this table + line_rects.extend( + [ + pymupdf.Rect(rl[0]) + for rl in get_raw_lines(textpage, clip=tab_rects[i]) + ] + ) del tab_rects[i] # do not touch this table twice return this_md @@ -748,7 +771,7 @@ def get_page_output(doc, pno, margins, textflags): """ for text_rect in text_rects: # output tables above this block of text - md_string += output_tables(tabs, text_rect, tab_rects) + md_string += output_tables(tabs, text_rect, tab_rects, line_rects, textpage) md_string += output_images( page, textpage, text_rect, vg_clusters, line_rects ) @@ -768,13 +791,15 @@ def get_page_output(doc, pno, margins, textflags): md_string = md_string.replace(" ,", ",").replace("-\n", "") # write any remaining tables and images - md_string += output_tables(tabs, None, tab_rects) + md_string += output_tables(tabs, None, tab_rects, line_rects, textpage) md_string += output_images(page, textpage, None, vg_clusters, line_rects) md_string += "\n-----\n\n" while md_string.startswith("\n"): md_string = md_string[1:] md_string = md_string.replace(chr(0), chr(0xFFFD)) + if EXTRACT_WORDS is True: + # output words in sequence compliant with Markdown text rawwords = textpage.extractWORDS() words = [] for lrect in line_rects: @@ -782,10 +807,20 @@ def get_page_output(doc, pno, margins, textflags): for w in rawwords: wrect = pymupdf.Rect(w[:4]) if wrect in lrect: - wrect.y0 = lrect.y0 - wrect.y1 = lrect.y1 + wrect.y0 = lrect.y0 # set upper coord to line + wrect.y1 = lrect.y1 # set lower coord to line lwords.append(list(wrect) + list(w[4:])) + # append sorted words of this line words.extend(sorted(lwords, key=lambda w: w[0])) + + # remove word duplicates without spoiling the sequence + # duplicates may occur for multiple reasons + nwords = [] # words w/o duplicates + for w in words: + if w not in nwords: + nwords.append(w) + words = nwords + else: words = [] return md_string, images, tables, graphics, words diff 
--git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 9575dd1f..6b4fe33a 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="pymupdf4llm", - version="0.0.13", + version="0.0.14", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG",