Skip to content

Commit

Permalink
Updates for v0.0.14
Browse files Browse the repository at this point in the history
Various improvements:
* "words" output now includes text from inside tables
* improved coherence between vertically adjacent text fragment blocks
* new parameter for activating a progress bar.
  • Loading branch information
JorjMcKie committed Sep 4, 2024
1 parent f626f2a commit 5a11865
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 24 deletions.
2 changes: 1 addition & 1 deletion pymupdf4llm/pymupdf4llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown

__version__ = "0.0.13"
__version__ = "0.0.14"
version = __version__
version_tuple = tuple(map(int, version.split(".")))

Expand Down
2 changes: 2 additions & 0 deletions pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ def sanitize_spans(line):
spans = [] # all spans in TextPage here
for bno, b in enumerate(blocks): # the numbered blocks
for lno, line in enumerate(b["lines"]): # the numbered lines
if abs(1-line["dir"][0]) > 1e-3: # only accept horizontal text
continue
for sno, s in enumerate(line["spans"]): # the numbered spans
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
mpoint = (sbbox.tl + sbbox.br) / 2 # middle point
Expand Down
8 changes: 5 additions & 3 deletions pymupdf4llm/pymupdf4llm/helpers/multi_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,11 @@ def can_extend(temp, bb, bboxlist, vert_bboxes):
def join_rects_phase1(bboxes):
"""Postprocess identified text blocks, phase 1.
Joins any rectangles that "touch" each other. This means that
their intersection is valid (but may be empty).
Joins any rectangles that "touch" each other.
This means that their intersection is valid (but may be empty).
To prefer vertical joins, we will ignore small gaps between vertically adjacent rectangles.
"""
delta=(0,-3,0,3) # allow this gap above and below
prects = bboxes[:]
new_rects = []
while prects:
Expand All @@ -163,7 +165,7 @@ def join_rects_phase1(bboxes):
while repeat:
repeat = False
for i in range(len(prects) - 1, 0, -1):
if (prect0 & prects[i]).is_valid:
if ((prect0+delta) & (prects[i]+delta)).is_valid:
prect0 |= prects[i]
del prects[i]
repeat = True
Expand Down
48 changes: 36 additions & 12 deletions pymupdf4llm/pymupdf4llm/helpers/progress.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
"""
This script defines a text-based progress bar to allow watching the advancement
of Markdown conversion of document pages.
Dependencies
-------------
None
Copyright and License
----------------------
Copyright 2024 Artifex Software, Inc.
License GNU Affero GPL 3.0
"""

import sys
from typing import List, Any

Expand All @@ -15,11 +29,9 @@ def __init__(self, items: List[Any], progress_width: int = 40):
self._increment = self._progress_width / self._len if self._len else 1

# Init progress bar
sys.stdout.write("[%s] (0/%d)" %
(" " * self._progress_width, self._len))
sys.stdout.write("[%s] (0/%d)" % (" " * self._progress_width, self._len))
sys.stdout.flush()
sys.stdout.write(
"\b" * (self._progress_width + len(str(self._len)) + 6))
sys.stdout.write("\b" * (self._progress_width + len(str(self._len)) + 6))

def __iter__(self):
return self
Expand All @@ -45,17 +57,29 @@ def __next__(self):
# Update the numerical progress
padded_index = str(self._current_index).rjust(self._len_digits)
progress_info = f" ({padded_index}/{self._len})"
sys.stdout.write(
"\b" * (self._progress_width + len(progress_info) + 1))
sys.stdout.write("\b" * (self._progress_width + len(progress_info) + 1))
sys.stdout.write("[")
sys.stdout.write("=" * int(self._current_index *
self._progress_width / self._len))
sys.stdout.write(" " * (self._progress_width -
int(self._current_index * self._progress_width / self._len)))
sys.stdout.write(
"=" * int(self._current_index * self._progress_width / self._len)
)
sys.stdout.write(
" "
* (
self._progress_width
- int(self._current_index * self._progress_width / self._len)
)
)
sys.stdout.write("]" + progress_info)
sys.stdout.flush()
sys.stdout.write("\b" * (self._progress_width - int(self._current_index * self._progress_width / self._len)
+ len(progress_info) + 1))
sys.stdout.write(
"\b"
* (
self._progress_width
- int(self._current_index * self._progress_width / self._len)
+ len(progress_info)
+ 1
)
)

return result

Expand Down
49 changes: 42 additions & 7 deletions pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
Text will be sorted in Western reading order. Any table will be included in
the text in markdown format as well.
Dependencies
-------------
PyMuPDF v1.24.2 or later
PyMuPDF v1.24.3 or later
Copyright and License
----------------------
Expand Down Expand Up @@ -247,6 +247,8 @@ def to_markdown(
page_height: (float) assumption if page layout is variable.
table_strategy: choose table detection strategy
graphics_limit: (int) ignore page with too many vector graphics.
ignore_code: (bool) suppress extra formatting for mono-space fonts
extract_words: (bool) include "words"-like output in page chunks
show_progress: (bool) print progress as each page is processed.
"""
Expand Down Expand Up @@ -403,6 +405,13 @@ def write_text(
key=lambda j: (j[1].y1, j[1].x0),
):
out_string += "\n" + tabs[i].to_markdown(clean=False) + "\n"
if EXTRACT_WORDS: # determine raw line rects within this table
line_rects.extend(
[
pymupdf.Rect(rl[0])
for rl in get_raw_lines(textpage, clip=tab_rects[i])
]
)
del tab_rects[i]

# ------------------------------------------------------------
Expand Down Expand Up @@ -548,7 +557,7 @@ def intersects_rects(rect, rect_list):
return i
return 0

def output_tables(tabs, text_rect, tab_rects):
def output_tables(tabs, text_rect, tab_rects, line_rects, textpage):
"""Output tables above a text rectangle."""
this_md = "" # markdown string for table content
if text_rect is not None: # select tables above the text block
Expand All @@ -557,6 +566,13 @@ def output_tables(tabs, text_rect, tab_rects):
key=lambda j: (j[1].y1, j[1].x0),
):
this_md += tabs[i].to_markdown(clean=False)
if EXTRACT_WORDS: # determine raw line rects within this table
line_rects.extend(
[
pymupdf.Rect(rl[0])
for rl in get_raw_lines(textpage, clip=tab_rects[i])
]
)
del tab_rects[i] # do not touch this table twice

else: # output all remaining tables
Expand All @@ -565,6 +581,13 @@ def output_tables(tabs, text_rect, tab_rects):
key=lambda j: (j[1].y1, j[1].x0),
):
this_md += tabs[i].to_markdown(clean=False)
if EXTRACT_WORDS: # determine raw line rects within this table
line_rects.extend(
[
pymupdf.Rect(rl[0])
for rl in get_raw_lines(textpage, clip=tab_rects[i])
]
)
del tab_rects[i] # do not touch this table twice
return this_md

Expand Down Expand Up @@ -748,7 +771,7 @@ def get_page_output(doc, pno, margins, textflags):
"""
for text_rect in text_rects:
# output tables above this block of text
md_string += output_tables(tabs, text_rect, tab_rects)
md_string += output_tables(tabs, text_rect, tab_rects, line_rects, textpage)
md_string += output_images(
page, textpage, text_rect, vg_clusters, line_rects
)
Expand All @@ -768,24 +791,36 @@ def get_page_output(doc, pno, margins, textflags):

md_string = md_string.replace(" ,", ",").replace("-\n", "")
# write any remaining tables and images
md_string += output_tables(tabs, None, tab_rects)
md_string += output_tables(tabs, None, tab_rects, line_rects, textpage)
md_string += output_images(page, textpage, None, vg_clusters, line_rects)
md_string += "\n-----\n\n"
while md_string.startswith("\n"):
md_string = md_string[1:]
md_string = md_string.replace(chr(0), chr(0xFFFD))

if EXTRACT_WORDS is True:
# output words in sequence compliant with Markdown text
rawwords = textpage.extractWORDS()
words = []
for lrect in line_rects:
lwords = []
for w in rawwords:
wrect = pymupdf.Rect(w[:4])
if wrect in lrect:
wrect.y0 = lrect.y0
wrect.y1 = lrect.y1
wrect.y0 = lrect.y0 # set upper coord to line
wrect.y1 = lrect.y1 # set lower coord to line
lwords.append(list(wrect) + list(w[4:]))
# append sorted words of this line
words.extend(sorted(lwords, key=lambda w: w[0]))

# remove word duplicates without spoiling the sequence
# duplicates may occur for multiple reasons
nwords = [] # words w/o duplicates
for w in words:
if w not in nwords:
nwords.append(w)
words = nwords

else:
words = []
return md_string, images, tables, graphics, words
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

setuptools.setup(
name="pymupdf4llm",
version="0.0.13",
version="0.0.14",
author="Artifex",
author_email="[email protected]",
description="PyMuPDF Utilities for LLM/RAG",
Expand Down

0 comments on commit 5a11865

Please sign in to comment.