Fix #2553

Previously, plain text output converted characters via fz_runetochar, whereas "words", "blocks", "dict" and "rawdict" handled character conversion differently, using Python raw unicode escape decoding. Yet another, slightly different behavior was used in page.get_textbox() - which is plain text extraction from within a rectangle, independent of using a clip. This fix ensures that every form of plain text extraction (including textbox) delivers the same output. This is checked by comparing the sets of characters produced in each of the cases.
pymupdf · Oct 24, 2023 · ebc0361 · ebc0361
1 parent 6df96dc
commit ebc0361
Show file tree

Hide file tree

Showing 6 changed files with 105 additions and 81 deletions.
diff --git a/fitz/fitz.i b/fitz/fitz.i
@@ -11827,10 +11827,10 @@ struct TextPage {
                         fz_print_stext_page_as_xhtml(gctx, out, this_tpage, 0);
                         break;
                     default:
-                        JM_print_stext_page_as_text(gctx, out, this_tpage);
+                        JM_print_stext_page_as_text(gctx, res, this_tpage);
                         break;
                 }
-                text = JM_UnicodeFromBuffer(gctx, res);
+                text = JM_EscapeStrFromBuffer(gctx, res);
 
             }
             fz_always(gctx) {
@@ -11845,28 +11845,20 @@ struct TextPage {
 
 
         //----------------------------------------------------------------
-        // method extractRect()
+        // method extractTextbox()
         //----------------------------------------------------------------
+        FITZEXCEPTION(extractTextbox, !result)
         PyObject *extractTextbox(PyObject *rect)
         {
             fz_stext_page *this_tpage = (fz_stext_page *) $self;
             fz_rect area = JM_rect_from_py(rect);
             PyObject *rc = NULL;
-            char *found = NULL;
             fz_try(gctx) {
-                char *found = JM_copy_rectangle(gctx, this_tpage, area);
-                if (found) {
-                    rc = JM_UnicodeFromStr(found);
-                    JM_Free(found);
-                } else {
-                    rc = EMPTY_STRING;
-                }
+                rc = JM_copy_rectangle(gctx, this_tpage, area);
             }
             fz_catch(gctx) {
-                if (found) JM_Free(found);
-                return EMPTY_STRING;
+                return NULL;
             }
-
             return rc;
         }
 

diff --git a/fitz/helper-stext.i b/fitz/helper-stext.i
@@ -460,7 +460,7 @@ no_more_matches:;
 // character (which else leads to 2 new-lines).
 //-----------------------------------------------------------------------------
 void
-JM_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
+JM_print_stext_page_as_text(fz_context *ctx, fz_buffer *buff, fz_stext_page *page)
 {
     fz_stext_block *block;
     fz_stext_line *line;
@@ -480,14 +480,11 @@ JM_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page
                     if (fz_is_infinite_rect(rect) ||
                         JM_rects_overlap(rect, chbbox)) {
                         last_char = ch->c;
-                        n = fz_runetochar(utf, ch->c);
-                        for (i = 0; i < n; i++) {
-                            fz_write_byte(ctx, out, utf[i]);
-                        }
+                        JM_append_rune(ctx, buff, ch->c);
                     }
                 }
                 if (last_char != 10 && last_char > 0) {
-                    fz_write_string(ctx, out, "\n");
+                    fz_append_string(ctx, buff, "\n");
                 }
             }
         }
@@ -794,18 +791,17 @@ void JM_make_textpage_dict(fz_context *ctx, fz_stext_page *tp, PyObject *page_di
 
 
 //---------------------------------------------------------------------
-char *
+PyObject *
 JM_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area)
 {
 	fz_stext_block *block;
 	fz_stext_line *line;
 	fz_stext_char *ch;
 	fz_buffer *buffer;
-	unsigned char *s;
 	int need_new_line = 0;
-
-	buffer = fz_new_buffer(ctx, 1024);
+	PyObject *rc = NULL;
 	fz_try(ctx) {
+        buffer = fz_new_buffer(ctx, 1024);
 		for (block = page->first_block; block; block = block->next) {
 			if (block->type != FZ_STEXT_BLOCK_TEXT)
 				continue;
@@ -819,24 +815,27 @@ JM_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area)
 							fz_append_string(ctx, buffer, "\n");
 							need_new_line = 0;
 						}
-						fz_append_rune(ctx, buffer, ch->c < 32 ? FZ_REPLACEMENT_CHARACTER : ch->c);
+						JM_append_rune(ctx, buffer, ch->c);
 					}
 				}
 				if (line_had_text)
 					need_new_line = 1;
 			}
 		}
 		fz_terminate_buffer(ctx, buffer);
+        rc = JM_EscapeStrFromBuffer(ctx, buffer);
+        if (!rc) {
+            rc = EMPTY_STRING;
+            PyErr_Clear();
+        }
 	}
+    fz_always(ctx) {
+        fz_drop_buffer(ctx, buffer);
+        }
 	fz_catch(ctx) {
-		fz_drop_buffer(ctx, buffer);
 		fz_rethrow(ctx);
 	}
-
-
-	fz_buffer_extract(ctx, buffer, &s); /* take over the data */
-	fz_drop_buffer(ctx, buffer);
-	return (char*)s;
+	return rc;
 }
 //---------------------------------------------------------------------
 

diff --git a/src/__init__.py b/src/__init__.py
@@ -11903,8 +11903,8 @@ def _extractText(self, format_):
         elif format_ == 4:
             mupdf.fz_print_stext_page_as_xhtml(out, this_tpage, 0)
         else:
-            JM_print_stext_page_as_text(out, this_tpage)
-        text = JM_UnicodeFromBuffer(res)
+            JM_print_stext_page_as_text(res, this_tpage)
+        text = JM_EscapeStrFromBuffer(res)
         return text
 
     def _getNewBlockList(self, page_dict, raw):
@@ -12096,10 +12096,7 @@ def extractTextbox(self, rect):
         assert isinstance(this_tpage, mupdf.FzStextPage)
         area = JM_rect_from_py(rect)
         found = JM_copy_rectangle(this_tpage, area);
-        if (found):
-            rc = JM_UnicodeFromStr(found)
-        else:
-            rc = ''
+        rc = PyUnicode_DecodeRawUnicodeEscape(found)
         return rc
 
     def extractWORDS(self, delimiters=None):
@@ -13929,12 +13926,24 @@ def JM_annot_set_border( border, doc, annot_obj):
         mupdf.pdf_dict_put_int( obj, PDF_NAME('I'), nclouds)
 
 
+def make_escape(ch):  # map a code point (int) to text that survives raw-unicode-escape decoding
+    if ch == 92:  # backslash: always escaped so it cannot start an accidental "\u" / "\U"
+        return "\\u005c"
+    elif 32 <= ch <= 127 or ch == 10:  # ASCII range and line feed pass through unchanged
+        return chr(ch)
+    elif 0xd800 <= ch <= 0xdfff:  # orphaned surrogate
+        return "\\ufffd"  # escaped like the C JM_append_rune; a literal U+FFFD would be garbled by the later decode
+    elif ch <= 0xffff:
+        return "\\u%04x" % ch
+    else:
+        return "\\U%08x" % ch
+
+
 def JM_append_rune(buff, ch):
     """
     APPEND non-ascii runes in unicode escape format to fz_buffer.
-    No need for special processing in pure Python.
     """
-    mupdf.fz_append_string(buff, chr(ch))
+    mupdf.fz_append_string(buff, make_escape(ch))
 
 
 def JM_append_word(lines, buff, wbbox, block_n, line_n, word_n):
@@ -14244,9 +14253,10 @@ def JM_compress_buffer(inbuffer):
     return buf;
 
 
+
 def JM_copy_rectangle(page, area):
     need_new_line = 0
-    buffer_ = mupdf.fz_new_buffer(1024)
+    buffer = io.StringIO()
     for block in page:
         if block.m_internal.type != mupdf.FZ_STEXT_BLOCK_TEXT:
             continue
@@ -14257,17 +14267,13 @@ def JM_copy_rectangle(page, area):
                 if JM_rects_overlap(area, r):
                     line_had_text = 1
                     if need_new_line:
-                        mupdf.fz_append_string(buffer_, "\n")
+                        buffer.write("\n")
                         need_new_line = 0
-                    mupdf.fz_append_rune(
-                            buffer_,
-                            FZ_REPLACEMENT_CHARACTER if ch.m_internal.c < 32 else ch.m_internal.c,
-                            )
+                    buffer.write(make_escape(ch.m_internal.c))
             if line_had_text:
                 need_new_line = 1
-    mupdf.fz_terminate_buffer(buffer_)
 
-    s = mupdf.fz_buffer_extract(buffer_)   # take over the data
+    s = buffer.getvalue()   # take over the data
     return s
 
 
@@ -16303,16 +16309,16 @@ def JM_point_from_py(p):
     return mupdf.FzPoint(x, y)
 
 
-def JM_print_stext_page_as_text(out, page):
+def JM_print_stext_page_as_text(res, page):
     '''
     Plain text output. An identical copy of fz_print_stext_page_as_text,
     but lines within a block are concatenated by space instead a new-line
     character (which else leads to 2 new-lines).
     '''
     if 1 and g_use_extra:
-        return extra.JM_print_stext_page_as_text( out, page)
+        return extra.JM_print_stext_page_as_text(res, page)
 
-    assert isinstance(out, mupdf.FzOutput)
+    assert isinstance(res, mupdf.FzBuffer)
     assert isinstance(page, mupdf.FzStextPage)
     rect = mupdf.FzRect(page.m_internal.mediabox)
     last_char = 0
@@ -16340,14 +16346,10 @@ def JM_print_stext_page_as_text(out, page):
                             ):
                         #raw += chr(ch.m_internal.c)
                         last_char = ch.m_internal.c
-                        utf = mupdf.fz_runetochar2(last_char)
                         #log( '{=last_char!r utf!r}')
-                        for c in utf:
-                            assert isinstance(c, int), f'{type(c)=} {c=}'
-                            assert 0 <= c < 256, f'{utf=} {c=}'
-                            mupdf.fz_write_byte(out, c)
+                        JM_append_rune(res, last_char)
                 if last_char != 10 and last_char > 0:
-                    mupdf.fz_write_string(out, "\n")
+                    mupdf.fz_append_string(res, "\n")
 
 
 def JM_put_script(annot_obj, key1, key2, value):
@@ -17153,8 +17155,14 @@ def ENSURE_OPERATION( pdf):
 
 
 def PyUnicode_DecodeRawUnicodeEscape(s, errors='strict'):
-    # fixme: handle escape sequencies
-    ret = s.decode('utf8', errors=errors)
+    # FIXED: handle raw unicode escape sequences
+    if not s:
+        return ""
+    if isinstance(s, str):
+        rc = s.encode("utf8", errors=errors)
+    elif isinstance(s, bytes):
+        rc = s[:]
+    ret = rc.decode('raw_unicode_escape', errors=errors)
     z = ret.find(chr(0))
     if z >= 0:
         ret = ret[:z]

diff --git a/src/extra.i b/src/extra.i
@@ -2554,34 +2554,37 @@ static int JM_rects_overlap(const fz_rect a, const fz_rect b)
 }
 
 //
-void ll_JM_print_stext_page_as_text(fz_output *out, fz_stext_page *page)
+void JM_append_rune(fz_buffer *buff, int ch);
+void ll_JM_print_stext_page_as_text(fz_buffer *res, fz_stext_page *page)
 {
     fz_stext_block *block;
     fz_stext_line *line;
     fz_stext_char *ch;
     fz_rect rect = page->mediabox;
     fz_rect chbbox;
     int last_char = 0;
-    char utf[10];
-    int i, n;
 
-    for (block = page->first_block; block; block = block->next) {
-        if (block->type == FZ_STEXT_BLOCK_TEXT) {
-            for (line = block->u.t.first_line; line; line = line->next) {
+
+    for (block = page->first_block; block; block = block->next)
+    {
+        if (block->type == FZ_STEXT_BLOCK_TEXT)
+        {
+            for (line = block->u.t.first_line; line; line = line->next)
+            {
                 last_char = 0;
-                for (ch = line->first_char; ch; ch = ch->next) {
+                for (ch = line->first_char; ch; ch = ch->next)
+                {
                     chbbox = JM_char_bbox(line, ch);
                     if (mupdf::ll_fz_is_infinite_rect(rect) ||
-                        JM_rects_overlap(rect, chbbox)) {
+                        JM_rects_overlap(rect, chbbox))
+                    {
                         last_char = ch->c;
-                        n = mupdf::ll_fz_runetochar(utf, ch->c);
-                        for (i = 0; i < n; i++) {
-                            mupdf::ll_fz_write_byte(out, utf[i]);
-                        }
+                        JM_append_rune(res, last_char);
                     }
                 }
-                if (last_char != 10 && last_char > 0) {
-                    mupdf::ll_fz_write_string(out, "\n");
+                if (last_char != 10 && last_char > 0)
+                {
+                    mupdf::ll_fz_append_string(res, "\n");
                 }
             }
         }
@@ -2592,11 +2595,11 @@ void ll_JM_print_stext_page_as_text(fz_output *out, fz_stext_page *page)
 // but lines within a block are concatenated by space instead a new-line
 // character (which else leads to 2 new-lines).
 //-----------------------------------------------------------------------------
-void JM_print_stext_page_as_text(mupdf::FzOutput& out, mupdf::FzStextPage& page)
+void JM_print_stext_page_as_text(mupdf::FzBuffer& res, mupdf::FzStextPage& page)
 {
     if (0)
     {
-        return ll_JM_print_stext_page_as_text(out.m_internal, page.m_internal);
+        return ll_JM_print_stext_page_as_text(res.m_internal, page.m_internal);
     }
 
     fz_rect rect = page.m_internal->mediabox;
@@ -2616,14 +2619,12 @@ void JM_print_stext_page_as_text(mupdf::FzOutput& out, mupdf::FzStextPage& page)
                             )
                     {
                         last_char = ch.m_internal->c;
-                        char utf[10];
-                        int n = mupdf::ll_fz_runetochar(utf, ch.m_internal->c);
-                        mupdf::ll_fz_write_data( out.m_internal, utf, n);
+                        JM_append_rune(res.m_internal, last_char);
                     }
                 }
                 if (last_char != 10 && last_char > 0)
                 {
-                    mupdf::fz_write_string( out, "\n");
+                    mupdf::ll_fz_append_string(res.m_internal, "\n");
                 }
             }
         }
@@ -3312,15 +3313,15 @@ static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char
 void JM_append_rune(fz_buffer *buff, int ch)
 {
     char text[32];
-    if (ch == 92)  // prevent accidental "\u", "\U"
+    if (ch == 92)  // prevent accidental "\u", "\U" sequences
     {
         mupdf::ll_fz_append_string(buff, "\\u005c");
     }
     else if ((ch >= 32 && ch <= 255) || ch == 10)
     {
         mupdf::ll_fz_append_byte(buff, ch);
     }
-    else if (ch >= 0xd800 && ch <= 0xdfff)  // surrogate Unicodes prohibited
+    else if (ch >= 0xd800 && ch <= 0xdfff)  // orphaned surrogate Unicodes
     {
         mupdf::ll_fz_append_string(buff, "\\ufffd");
     }
@@ -4363,7 +4364,7 @@ mupdf::FzDevice JM_new_texttrace_device(PyObject* out);
 fz_rect JM_char_bbox(const mupdf::FzStextLine& line, const mupdf::FzStextChar& ch);
 
 static fz_quad JM_char_quad( fz_stext_line *line, fz_stext_char *ch);
-void JM_print_stext_page_as_text(mupdf::FzOutput& out, mupdf::FzStextPage& page);
+void JM_print_stext_page_as_text(mupdf::FzBuffer& res, mupdf::FzStextPage& page);
 
 void set_small_glyph_heights(int on);
 mupdf::FzRect JM_cropbox(mupdf::PdfObj& page_obj);

diff --git a/tests/resources/test_2553.pdf b/tests/resources/test_2553.pdf