diff --git a/docs/src/changes.rst b/docs/src/changes.rst new file mode 100644 index 00000000..17091a46 --- /dev/null +++ b/docs/src/changes.rst @@ -0,0 +1,103 @@ +.. include:: header.rst + + +Change Log +=========================================================================== + +Changes in version 0.0.11 +-------------------------- + +Fixes: +~~~~~~~ + +* `90 `_ "'Quad' object has no attribute 'tl'" +* `88 `_ "Bug in is_significant function" + + +Improvements: +~~~~~~~~~~~~~~ +* Extended the list of known bullet point characters. + + +Changes in version 0.0.10 +-------------------------- + +Fixes: +~~~~~~~ + +* `73 `_ "bug in to_markdown internal function" +* `74 `_ "minimum area for images & vector graphics" +* `75 `_ "Poor Markdown Generation for Particular PDF" +* `76 `_ "suggestion on useful api parameters" + + +Improvements: +~~~~~~~~~~~~~~ +* Improved recognition of "insignificant" vector graphics. Graphics like text highlights or borders will be ignored. +* The format of saved images can now be controlled via the new parameter `image_format`. +* Images can be stored in a specific folder via the new parameter `image_path`. +* Images are **not stored if contained** in another image on the same page. +* Images are **not stored if too small:** if width or height is less than 5% of the corresponding page dimension. +* All text is always written. If `write_images=True`, text on images / graphics can be suppressed by setting `force_text=False`. + + +Changes in version 0.0.9 +-------------------------- + +Fixes: +~~~~~~~ + +* `71 `_ "Unexpected results in pymupdf4llm but pymupdf works" +* `68 `_ "Issue with text extraction near footer of page" + + +Improvements: +~~~~~~~~~~~~~~ +* Improved identification of scattered text span particles. This should address most issues with out-of-sequence situations. +* We now correctly process rotated pages (see issue #68). 
+ + +Changes in version 0.0.8 +-------------------------- + +Fixes: +~~~~~~~ + +* `65 `_ Fix typo in `pymupdf_rag.py`. + + +Changes in version 0.0.7 +-------------------------- + +Fixes: +~~~~~~~ + +* `54 `_ "Mistakes in orchestrating sentences". Additional fix: text extraction no longer uses the TEXT_DEHYPHENATE flag bit. + +Improvements: +~~~~~~~~~~~~~~~~ + +* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant: we now detect when "strokes" only exist in the neighborhood of the border of the graphic's bounding box. This is quite often the case for code snippets. + + +Changes in version 0.0.6 +-------------------------- + +Fixes: +~~~~~~~ + +* `55 `_ "Bug in helpers/multi_column.py - IndexError: list index out of range" +* `54 `_ "Mistakes in orchestrating sentences" +* `52 `_ "Chunking of text files" +* Partial fix for `41 `_ / `40 `_. Improved page column detection, but still no silver bullet for overly complex page layouts. + +Improvements: +~~~~~~~~~~~~~~~~ + +* New parameter `dpi` to specify the resolution of images. +* New parameters `page_width` / `page_height` for easily processing reflowable documents (Text, Office, e-books). +* New parameter `graphics_limit` to avoid spending runtime on value-less content. +* New parameter `table_strategy` to directly control the table detection strategy. + +.. 
include:: footer.rst + diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index 6b972113..333f77ca 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -69,7 +69,9 @@ def sanitize_spans(line): Returns: A list of sorted, and potentially cleaned-up spans """ - line.sort(key=lambda s: s["bbox"].x0) # sort left to right + # sort ascending horizontally + line.sort(key=lambda s: s["bbox"].x0) + # join spans, delete duplicates for i in range(len(line) - 1, 0, -1): # iterate back to front s0 = line[i - 1] s1 = line[i] @@ -78,13 +80,17 @@ def sanitize_spans(line): delta = s1["size"] * 0.1 if s0["bbox"].x1 + delta < s1["bbox"].x0: continue # all good: no joining neded + + # We need to join bbox and text of two consecutive spans + # On occasion, spans may also be duplicated. + if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]: + s0["text"] += s1["text"] s0["bbox"] |= s1["bbox"] # join boundary boxes - s0["text"] += s1["text"] # join the text del line[i] # delete the joined-in span line[i - 1] = s0 # update the span return line - if clip is None: # use TextPage if not provided + if clip is None: # use TextPage rect if not provided clip = textpage.rect # extract text blocks - if bbox is not empty blocks = [ @@ -126,10 +132,7 @@ def sanitize_spans(line): sbbox = s["bbox"] # this bbox sbbox0 = line[-1]["bbox"] # previous bbox # if any of top or bottom coordinates are close enough, join... 
- if ( - abs(sbbox.y1 - sbbox0.y1) <= y_delta - or abs(sbbox.y0 - sbbox0.y0) <= y_delta - ): + if abs(sbbox.y1 - sbbox0.y1) <= y_delta or abs(sbbox.y0 - sbbox0.y0) <= y_delta: line.append(s) # append to this line lrect |= sbbox # extend line rectangle continue @@ -150,9 +153,7 @@ def sanitize_spans(line): return nlines -def get_text_lines( - page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False -): +def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False): """Extract text by line keeping natural reading sequence. Notes: diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index d60d0f7b..05413d86 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -40,15 +40,15 @@ if fitz.pymupdf_version_tuple < (1, 24, 2): raise NotImplementedError("PyMuPDF version 1.24.2 or later is needed.") -bullet = ( +bullet = [ "- ", "* ", chr(0xF0A7), chr(0xF0B7), chr(0xB7), chr(8226), - chr(9679), -) +] + list(map(chr, range(9642, 9680))) + GRAPHICS_TEXT = "\n![](%s)\n" @@ -193,7 +193,7 @@ def is_significant(box, paths): for itm in p["items"]: if itm[0] in ("l", "c"): # line or curve points.extend(itm[1:]) # append all the points - elif itm[0] == "q": # quad + elif itm[0] == "qu": # quad q = itm[1] # follow corners anti-clockwise points.extend([q.ul, q.ll, q.lr, q.ur, q.ul])