Skip to content

Commit

Permalink
process-text: improve corrupt PDF handling
Browse files: browse the repository at this point in the history
  • Loading branch information
chapmanjacobd committed Nov 13, 2024
1 parent bb3e8a9 commit c83385e
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 3 deletions.
3 changes: 2 additions & 1 deletion xklb/mediafiles/process_media.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -389,7 +389,8 @@ def process_media() -> None:
with suppress(processes.UnplayableFile):
m["duration"] = processes.FFProbe(new_path).duration

new_free_space += (m.get("compressed_size") or m["size"]) - m["new_size"]
if not os.path.exists(m["path"]):
new_free_space += (m.get("compressed_size") or m["size"]) - m["new_size"]

if args.database:
with suppress(sqlite3.OperationalError), args.db.conn:
Expand Down
9 changes: 7 additions & 2 deletions xklb/mediafiles/process_text.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -125,8 +125,13 @@ def process_path(args, path):
if args.simulate:
log.info("Running OCR on %s", path)
else:
if not ocrmypdf.pdfa.file_claims_pdfa(Path(path))["pass"]:
path = convert_to_text_pdf(args, path)
import pikepdf

try:
if not ocrmypdf.pdfa.file_claims_pdfa(Path(path))["pass"]:
path = convert_to_text_pdf(args, path)
except pikepdf.PdfError:
log.exception("[%s]: could not open as PDF", path)

ext = path_utils.ext(path)

Expand Down

0 comments on commit c83385e

Please sign in to comment.