Updates for v0.0.14

Various improvements:
* "words" output now includes text from inside tables
* increased coherence between vertically neighboring text fragment blocks
* new parameter for activating a progress bar
pymupdf · Sep 4, 2024 · 5a11865 · 5a11865
1 parent f626f2a
commit 5a11865
Show file tree

Hide file tree

Showing 6 changed files with 87 additions and 24 deletions.
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
 from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
 
-__version__ = "0.0.13"
+__version__ = "0.0.14"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 

diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -98,6 +98,8 @@ def sanitize_spans(line):
     spans = []  # all spans in TextPage here
     for bno, b in enumerate(blocks):  # the numbered blocks
         for lno, line in enumerate(b["lines"]):  # the numbered lines
+            if abs(1-line["dir"][0]) > 1e-3:  # only accept horizontal text
+                continue
             for sno, s in enumerate(line["spans"]):  # the numbered spans
                 sbbox = pymupdf.Rect(s["bbox"])  # span bbox as a Rect
                 mpoint = (sbbox.tl + sbbox.br) / 2  # middle point

diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
@@ -152,9 +152,11 @@ def can_extend(temp, bb, bboxlist, vert_bboxes):
     def join_rects_phase1(bboxes):
         """Postprocess identified text blocks, phase 1.
 
-        Joins any rectangles that "touch" each other. This means that
-        their intersection is valid (but may be empty).
+        Joins any rectangles that "touch" each other.
+        This means that their intersection is valid (but may be empty).
+        To prefer vertical joins, we will ignore small gaps above and below.
         """
+        delta=(0,-3,0,3)  # allow this gap above and below
         prects = bboxes[:]
         new_rects = []
         while prects:
@@ -163,7 +165,7 @@ def join_rects_phase1(bboxes):
             while repeat:
                 repeat = False
                 for i in range(len(prects) - 1, 0, -1):
-                    if (prect0 & prects[i]).is_valid:
+                    if ((prect0+delta) & (prects[i]+delta)).is_valid:
                         prect0 |= prects[i]
                         del prects[i]
                         repeat = True

diff --git a/pymupdf4llm/pymupdf4llm/helpers/progress.py b/pymupdf4llm/pymupdf4llm/helpers/progress.py
@@ -1,3 +1,17 @@
+"""
+This script defines a text-based progress bar to allow watching the advancement
+of Markdown conversion of document pages.
+ 
+Dependencies
+-------------
+None
+
+Copyright and License
+----------------------
+Copyright 2024 Artifex Software, Inc.
+License GNU Affero GPL 3.0
+"""
+
 import sys
 from typing import List, Any
 
@@ -15,11 +29,9 @@ def __init__(self, items: List[Any], progress_width: int = 40):
         self._increment = self._progress_width / self._len if self._len else 1
 
         # Init progress bar
-        sys.stdout.write("[%s] (0/%d)" %
-                         (" " * self._progress_width, self._len))
+        sys.stdout.write("[%s] (0/%d)" % (" " * self._progress_width, self._len))
         sys.stdout.flush()
-        sys.stdout.write(
-            "\b" * (self._progress_width + len(str(self._len)) + 6))
+        sys.stdout.write("\b" * (self._progress_width + len(str(self._len)) + 6))
 
     def __iter__(self):
         return self
@@ -45,17 +57,29 @@ def __next__(self):
         # Update the numerical progress
         padded_index = str(self._current_index).rjust(self._len_digits)
         progress_info = f" ({padded_index}/{self._len})"
-        sys.stdout.write(
-            "\b" * (self._progress_width + len(progress_info) + 1))
+        sys.stdout.write("\b" * (self._progress_width + len(progress_info) + 1))
         sys.stdout.write("[")
-        sys.stdout.write("=" * int(self._current_index *
-                         self._progress_width / self._len))
-        sys.stdout.write(" " * (self._progress_width -
-                         int(self._current_index * self._progress_width / self._len)))
+        sys.stdout.write(
+            "=" * int(self._current_index * self._progress_width / self._len)
+        )
+        sys.stdout.write(
+            " "
+            * (
+                self._progress_width
+                - int(self._current_index * self._progress_width / self._len)
+            )
+        )
         sys.stdout.write("]" + progress_info)
         sys.stdout.flush()
-        sys.stdout.write("\b" * (self._progress_width - int(self._current_index * self._progress_width / self._len)
-                         + len(progress_info) + 1))
+        sys.stdout.write(
+            "\b"
+            * (
+                self._progress_width
+                - int(self._current_index * self._progress_width / self._len)
+                + len(progress_info)
+                + 1
+            )
+        )
 
         return result
 

diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -15,10 +15,10 @@
 
 Text will be sorted in Western reading order. Any table will be included in
 the text in markdown format as well.
-
+ 
 Dependencies
 -------------
-PyMuPDF v1.24.2 or later
+PyMuPDF v1.24.3 or later
 
 Copyright and License
 ----------------------
@@ -247,6 +247,8 @@ def to_markdown(
         page_height: (float) assumption if page layout is variable.
         table_strategy: choose table detection strategy
         graphics_limit: (int) ignore page with too many vector graphics.
+        ignore_code: (bool) suppress extra formatting for mono-space fonts
+        extract_words: (bool) include "words"-like output in page chunks
         show_progress: (bool) print progress as each page is processed.
 
     """
@@ -403,6 +405,13 @@ def write_text(
                 key=lambda j: (j[1].y1, j[1].x0),
             ):
                 out_string += "\n" + tabs[i].to_markdown(clean=False) + "\n"
+                if EXTRACT_WORDS:  # determine raw line rects within this table
+                    line_rects.extend(
+                        [
+                            pymupdf.Rect(rl[0])
+                            for rl in get_raw_lines(textpage, clip=tab_rects[i])
+                        ]
+                    )
                 del tab_rects[i]
 
             # ------------------------------------------------------------
@@ -548,7 +557,7 @@ def intersects_rects(rect, rect_list):
                 return i
         return 0
 
-    def output_tables(tabs, text_rect, tab_rects):
+    def output_tables(tabs, text_rect, tab_rects, line_rects, textpage):
         """Output tables above a text rectangle."""
         this_md = ""  # markdown string for table content
         if text_rect is not None:  # select tables above the text block
@@ -557,6 +566,13 @@ def output_tables(tabs, text_rect, tab_rects):
                 key=lambda j: (j[1].y1, j[1].x0),
             ):
                 this_md += tabs[i].to_markdown(clean=False)
+                if EXTRACT_WORDS:  # determine raw line rects within this table
+                    line_rects.extend(
+                        [
+                            pymupdf.Rect(rl[0])
+                            for rl in get_raw_lines(textpage, clip=tab_rects[i])
+                        ]
+                    )
                 del tab_rects[i]  # do not touch this table twice
 
         else:  # output all remaining tables
@@ -565,6 +581,13 @@ def output_tables(tabs, text_rect, tab_rects):
                 key=lambda j: (j[1].y1, j[1].x0),
             ):
                 this_md += tabs[i].to_markdown(clean=False)
+                if EXTRACT_WORDS:  # determine raw line rects within this table
+                    line_rects.extend(
+                        [
+                            pymupdf.Rect(rl[0])
+                            for rl in get_raw_lines(textpage, clip=tab_rects[i])
+                        ]
+                    )
                 del tab_rects[i]  # do not touch this table twice
         return this_md
 
@@ -748,7 +771,7 @@ def get_page_output(doc, pno, margins, textflags):
         """
         for text_rect in text_rects:
             # output tables above this block of text
-            md_string += output_tables(tabs, text_rect, tab_rects)
+            md_string += output_tables(tabs, text_rect, tab_rects, line_rects, textpage)
             md_string += output_images(
                 page, textpage, text_rect, vg_clusters, line_rects
             )
@@ -768,24 +791,36 @@ def get_page_output(doc, pno, margins, textflags):
 
         md_string = md_string.replace(" ,", ",").replace("-\n", "")
         # write any remaining tables and images
-        md_string += output_tables(tabs, None, tab_rects)
+        md_string += output_tables(tabs, None, tab_rects, line_rects, textpage)
         md_string += output_images(page, textpage, None, vg_clusters, line_rects)
         md_string += "\n-----\n\n"
         while md_string.startswith("\n"):
             md_string = md_string[1:]
         md_string = md_string.replace(chr(0), chr(0xFFFD))
+
         if EXTRACT_WORDS is True:
+            # output words in sequence compliant with Markdown text
             rawwords = textpage.extractWORDS()
             words = []
             for lrect in line_rects:
                 lwords = []
                 for w in rawwords:
                     wrect = pymupdf.Rect(w[:4])
                     if wrect in lrect:
-                        wrect.y0 = lrect.y0
-                        wrect.y1 = lrect.y1
+                        wrect.y0 = lrect.y0  # set upper coord to line
+                        wrect.y1 = lrect.y1  # set lower coord to line
                         lwords.append(list(wrect) + list(w[4:]))
+                # append sorted words of this line
                 words.extend(sorted(lwords, key=lambda w: w[0]))
+
+            # remove word duplicates without spoiling the sequence
+            # duplicates may occur for multiple reasons
+            nwords = []  # words w/o duplicates
+            for w in words:
+                if w not in nwords:
+                    nwords.append(w)
+            words = nwords
+
         else:
             words = []
         return md_string, images, tables, graphics, words

diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
@@ -17,7 +17,7 @@
 
 setuptools.setup(
     name="pymupdf4llm",
-    version="0.0.13",
+    version="0.0.14",
     author="Artifex",
     author_email="[email protected]",
     description="PyMuPDF Utilities for LLM/RAG",