-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_ia-compress-ocr_workflow_AGPL-v3.sh
49 lines (47 loc) · 1.97 KB
/
pdf_ia-compress-ocr_workflow_AGPL-v3.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/bin/bash
# This bash script (c) 2024 Tyler D. Davis, Licensed under AGPL-v3
#
# OCR a PDF with ocrmypdf, then recompress the OCR'd result with recode_pdf.
# pdf-metadata-json and pdf-to-hocr produce the intermediate JSON and hOCR
# files that are handed to recode_pdf.
#
# Usage:  pdf_ia-compress-ocr_workflow_AGPL-v3.sh INPUT_PDF
# Output: <directory of INPUT_PDF>/<INPUT_PDF name without extension>-ia-ocr.pdf
# Exit:   0 on success, 1 if any step fails, 2 when no input file is given.

# Print an error message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# The input PDF is the first argument.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s INPUT_PDF\n' "${0##*/}" >&2
  exit 2
fi
INPUT_PDF=$1
# A file name starting with "-" would be mistaken for an option by the
# commands below; anchoring it to the current directory avoids that.
case "$INPUT_PDF" in
  -*) INPUT_PDF="./$INPUT_PDF" ;;
esac
[[ -f "$INPUT_PDF" ]] || die "Input PDF not found: $INPUT_PDF"

INPUT_DIR="$(dirname -- "$INPUT_PDF")/"
BASENAME=$(basename -- "$INPUT_PDF")
PDF_BASE="${BASENAME%.*}"
#DPI=300
OCR_LANG="eng"

# All intermediate files live in a private, unpredictably named directory
# that is removed on every exit path (success, failure, or interruption).
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/pdf-ia-ocr.XXXXXX") \
  || die "Could not create a temporary directory."
cleanup() {
  rm -rf -- "${TMP_DIR:?}"
}
trap cleanup EXIT

# Perform OCR on PDF using ocrmypdf.
LOSS_OCR_TMP="$TMP_DIR/$PDF_BASE-loss-ocr.pdf"
echo "Now performing lossless OCR on input PDF..."
ocrmypdf --pdf-renderer sandwich -l "$OCR_LANG" -O1 --output-type pdf \
  "$INPUT_PDF" "$LOSS_OCR_TMP" \
  || die "OCR Failed. Sorry about that."
echo "OCR completed successfully!"

# Generate necessary json and hocr file from PDF.
TMP_JSON="$TMP_DIR/$PDF_BASE.json"
TMP_HOCR="$TMP_DIR/$PDF_BASE.hocr"

echo "Extracting necessary JSON metadata from OCR'd PDF"
# The redirection creates the output file even when the tool fails, so a
# plain existence test proves nothing: check the exit status and require a
# non-empty file instead.
# stderr is discarded exactly as the script has always done.
# NOTE(review): presumably the tool is noisy on stderr - confirm before
# un-silencing it.
if ! pdf-metadata-json "$LOSS_OCR_TMP" 2>/dev/null >"$TMP_JSON" \
  || [[ ! -s "$TMP_JSON" ]]; then
  die "JSON file not created, for some reason!"
fi
echo "JSON for $LOSS_OCR_TMP generated successfully!"

echo "Generating necessary HOCR file before compressing OCR'd PDF"
if ! pdf-to-hocr -f "$LOSS_OCR_TMP" -J "$TMP_JSON" >"$TMP_HOCR" \
  || [[ ! -s "$TMP_HOCR" ]]; then
  die "HOCR file not created, for some reason!"
fi
echo "HOCR file for $LOSS_OCR_TMP generated successfully!"

# Run recode_pdf on the file given the proper inputs.
# First set variable for compressed PDF output.
PDF_COMP="$INPUT_DIR$PDF_BASE-ia-ocr.pdf"
echo "Compressing OCR'd PDF with IA algorithms..."
# The exit status is checked as well as the file: an output left over from an
# earlier run must not make a failed compression look successful.
# The recode_pdf options are kept exactly as the original author chose them.
if ! recode_pdf -P "$LOSS_OCR_TMP" -T "$TMP_HOCR" -J openjpeg \
  --fg-compression-flags='-r 200' --bg-downsample 1 -m 2 -o "$PDF_COMP" \
  || [[ ! -s "$PDF_COMP" ]]; then
  die "Compressed PDF creation failed :("
fi
echo "PDF compressed successfully!"

# Temporary files are deleted by the EXIT trap installed above.
echo "Enjoy your compressed OCR'd PDF!"
echo "File saved as $PDF_COMP"
exit 0