From c83385ed95f0105d414f9f523a1df3226f720059 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 13 Nov 2024 22:08:50 +0000 Subject: [PATCH] process-text: improve corrupt PDF handling --- xklb/mediafiles/process_media.py | 3 ++- xklb/mediafiles/process_text.py | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/xklb/mediafiles/process_media.py b/xklb/mediafiles/process_media.py index c555c045..56b80895 100644 --- a/xklb/mediafiles/process_media.py +++ b/xklb/mediafiles/process_media.py @@ -389,7 +389,8 @@ def process_media() -> None: with suppress(processes.UnplayableFile): m["duration"] = processes.FFProbe(new_path).duration - new_free_space += (m.get("compressed_size") or m["size"]) - m["new_size"] + if not os.path.exists(m["path"]): + new_free_space += (m.get("compressed_size") or m["size"]) - m["new_size"] if args.database: with suppress(sqlite3.OperationalError), args.db.conn: diff --git a/xklb/mediafiles/process_text.py b/xklb/mediafiles/process_text.py index 5eb03fd3..dd84b07a 100644 --- a/xklb/mediafiles/process_text.py +++ b/xklb/mediafiles/process_text.py @@ -125,8 +125,13 @@ def process_path(args, path): if args.simulate: log.info("Running OCR on %s", path) else: - if not ocrmypdf.pdfa.file_claims_pdfa(Path(path))["pass"]: - path = convert_to_text_pdf(args, path) + import pikepdf + + try: + if not ocrmypdf.pdfa.file_claims_pdfa(Path(path))["pass"]: + path = convert_to_text_pdf(args, path) + except pikepdf.PdfError: + log.exception("[%s]: could not open as PDF", path) ext = path_utils.ext(path)