Skip to content

Commit

Permalink
✨ add support for image & PDF compression (#120)
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastianMindee authored Oct 18, 2024
1 parent ae952d5 commit 1660d46
Show file tree
Hide file tree
Showing 18 changed files with 587 additions and 25 deletions.
10 changes: 10 additions & 0 deletions lib/mindee.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,16 @@ module Source
end
end

# Image processing module.
# Only declares the namespaces here; the bodies are empty in this file.
module Image
# Miscellaneous image operations.
module ImageUtils
end

# Image compressor module to handle image compression.
module ImageCompressor
end
end

# Custom extraction module
module Extraction
end
Expand Down
1 change: 1 addition & 0 deletions lib/mindee/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def parse_queued(
end

# rubocop:disable Metrics/ParameterLists

# Enqueue a document for async parsing and automatically try to retrieve it
#
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
Expand Down
1 change: 0 additions & 1 deletion lib/mindee/extraction/common/extracted_image.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def save_to_file(output_path, file_format = nil)
image = MiniMagick::Image.read(@buffer)
image.format file_format.downcase
image.write resolved_path.to_s
logger.info("File saved successfully to '#{resolved_path}'.")
rescue TypeError
raise 'Invalid path/filename provided.'
rescue StandardError
Expand Down
29 changes: 7 additions & 22 deletions lib/mindee/extraction/common/image_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ module Mindee
# Image Extraction Module.
module Extraction
# Image Extraction wrapper class.
class ImageExtractor
def self.attach_image_as_new_file(input_buffer)
module ImageExtractor
def self.attach_image_as_new_file(input_buffer, format: 'jpg')
# Attaches an image as a new page in a PdfDocument object.
#
# @param [StringIO] input_buffer Input buffer. Only supports JPEG.
Expand All @@ -21,9 +21,9 @@ def self.attach_image_as_new_file(input_buffer)
magick_image = MiniMagick::Image.read(input_buffer)
# NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
# converted.
magick_image.format('jpg')
magick_image.format(format)
original_density = magick_image.resolution
scale_factor = original_density[0].to_f / 4.166666 # No clue why bit the resolution needs to be reduced for
scale_factor = original_density[0].to_f / 4.166666 # No clue why the resolution needs to be reduced for
# the pdf otherwise the resulting image shrinks.
magick_image.format('pdf', 0, { density: scale_factor.to_s })
Origami::PDF.read(StringIO.new(magick_image.to_blob))
Expand All @@ -37,27 +37,12 @@ def self.attach_image_as_new_file(input_buffer)
# to extract.
# @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
def self.extract_multiple_images_from_source(input_source, page_id, polygons)
new_stream = load_doc(input_source, page_id)
new_stream = load_input_source_pdf_page_as_image(input_source, page_id)
new_stream.seek(0)

extract_images_from_polygons(input_source, new_stream, page_id, polygons)
end

# Retrieves a PDF document's page.
#
# @param [Origami::PDF] pdf_doc Origami PDF handle.
# @param [Integer] page_id Page ID.
def self.get_page(pdf_doc, page_id)
stream = StringIO.new
pdf_doc.save(stream)

options = {
page_indexes: [page_id - 1],
}

Mindee::PDF::PdfProcessor.parse(stream, options)
end

# Extracts images from their positions on a file (as polygons).
#
# @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
Expand Down Expand Up @@ -179,10 +164,10 @@ def self.create_extracted_image(buffer, file_name, page_id, element_id)
# @param input_file [LocalInputSource] Local input.
# @param [Integer] page_id Page ID.
# @return [MiniMagick::Image] A valid PdfDocument handle.
def self.load_doc(input_file, page_id)
def self.load_input_source_pdf_page_as_image(input_file, page_id)
input_file.io_stream.rewind
if input_file.pdf?
get_page(Origami::PDF.read(input_file.io_stream), page_id)
Mindee::PDF::PdfProcessor.get_page(Origami::PDF.read(input_file.io_stream), page_id)
else
input_file.io_stream
end
Expand Down
2 changes: 2 additions & 0 deletions lib/mindee/extraction/pdf_extractor/pdf_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def extract_sub_documents(page_indexes)

# rubocop:disable Metrics/CyclomaticComplexity
# rubocop:disable Metrics/PerceivedComplexity

# Extracts invoices as complete PDFs from the document.
# @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
# @param strict [Boolean]
Expand Down Expand Up @@ -99,6 +100,7 @@ def extract_invoices(page_indexes, strict: false)
end
extract_sub_documents(correct_page_indexes)
end

# rubocop:enable Metrics/CyclomaticComplexity
# rubocop:enable Metrics/PerceivedComplexity

Expand Down
1 change: 1 addition & 0 deletions lib/mindee/extraction/tax_extractor/tax_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ def self.extract_horizontal_tax(ocr_result, tax_names)
end
candidates
end

# rubocop:enable Metrics/CyclomaticComplexity
# rubocop:enable Metrics/PerceivedComplexity

Expand Down
3 changes: 2 additions & 1 deletion lib/mindee/geometry/point.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ class Point
# @return [Float]
attr_accessor :y

# rubocop:disable Naming/MethodParameterName

# @param x [Float]
# @param y [Float]
# rubocop:disable Naming/MethodParameterName
def initialize(x, y)
@x = x
@y = y
Expand Down
4 changes: 4 additions & 0 deletions lib/mindee/image.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# frozen_string_literal: true

require_relative 'image/image_compressor'
require_relative 'image/image_utils'
29 changes: 29 additions & 0 deletions lib/mindee/image/image_compressor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# frozen_string_literal: true

module Mindee
  # Image processing module.
  module Image
    # Image compressor module to handle image compression.
    module ImageCompressor
      # Converts an image to JPEG, resizes it to the dimensions computed from the provided bounds,
      # applies the requested quality and returns the result as an in-memory stream.
      # @param image [MiniMagick::Image, StringIO] Input image.
      # @param quality [Integer, nil] Quality of the final file.
      # @param max_width [Integer, nil] Maximum width. If not specified, the horizontal ratio will remain the same.
      # @param max_height [Integer, nil] Maximum height. If not specified, the vertical ratio will remain the same.
      # @return [StringIO]
      def self.compress_image(image, quality: 85, max_width: nil, max_height: nil)
        magick_image = ImageUtils.to_image(image)
        magick_image.format('jpg')

        target_size = ImageUtils.calculate_new_dimensions(magick_image, max_width: max_width, max_height: max_height)
        target_width, target_height = target_size

        # Only resize when at least one target dimension is available.
        ImageUtils.resize_image(magick_image, target_width, target_height) if [target_width, target_height].any?

        ImageUtils.compress_image_quality(magick_image, quality)
        ImageUtils.image_to_stringio(magick_image)
      end
    end
  end
end
104 changes: 104 additions & 0 deletions lib/mindee/image/image_utils.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# frozen_string_literal: true

module Mindee
  # Image processing module.
  module Image
    # Miscellaneous image operations.
    module ImageUtils
      # Resizes a provided MiniMagick Image with the given width & height, if present.
      # Does nothing when neither dimension is provided.
      # @param image [MiniMagick::Image] MiniMagick image handle.
      # @param width [Integer, nil] Width to comply with.
      # @param height [Integer, nil] Height to comply with.
      def self.resize_image(image, width, height)
        if width && height
          image.resize "#{width}x#{height}"
        elsif width
          image.resize width.to_s
        elsif height
          image.resize "x#{height}"
        end
      end

      # Compresses the quality of the provided MiniMagick image.
      # @param image [MiniMagick::Image] MiniMagick image handle.
      # @param quality [Integer] Quality to apply to the image. This is independent from a JPG's base quality.
      def self.compress_image_quality(image, quality)
        image.quality quality.to_s
      end

      # Converts the input into a MiniMagick image. I/O objects are rewound and read; MiniMagick images are
      # returned as-is.
      # Mostly here so that IDEs don't get confused on the type (@type annotation fails sometimes.)
      # @param [MiniMagick::Image, StringIO, File, Tempfile] image The input image
      # @return [MiniMagick::Image]
      # @raise [RuntimeError] If the input is neither an I/O object nor a MiniMagick image.
      def self.to_image(image)
        if image.respond_to?(:read) && image.respond_to?(:rewind)
          image.rewind
          MiniMagick::Image.read(image)
        elsif image.is_a?(MiniMagick::Image)
          image
        else
          raise "Expected an I/O object or a MiniMagick::Image. '#{image.class}' given instead."
        end
      end

      # Converts a MiniMagick image into a StringIO containing the encoded image.
      # Note: the format is applied to the provided image handle itself.
      # @param image [MiniMagick::Image] the input image.
      # @param format [String] Format parameter, left open for the future, but should be JPEG for current use-cases.
      # @return [StringIO] Stream positioned at its start.
      def self.image_to_stringio(image, format = 'JPEG')
        image.format format
        blob = image.to_blob
        stringio = StringIO.new(blob)
        stringio.rewind

        stringio
      end

      # Computes the new dimensions for a given image so that it fits within the provided bounds while
      # keeping its aspect ratio. The image is never scaled up: if it already fits, its current dimensions
      # are returned.
      # @param [MiniMagick::Image] original Input MiniMagick image.
      # @param max_width [Integer, nil] Maximum width. If not specified, only the height bound applies.
      # @param max_height [Integer, nil] Maximum height. If not specified, only the width bound applies.
      # @return [Array<Integer>] New width & height.
      # @raise [RuntimeError] If no image is provided.
      def self.calculate_new_dimensions(original, max_width: nil, max_height: nil)
        raise 'Provided image could not be processed for resizing.' if original.nil?

        return [original.width, original.height] if max_width.nil? && max_height.nil?

        width_ratio = max_width ? max_width.to_f / original.width : Float::INFINITY
        height_ratio = max_height ? max_height.to_f / original.height : Float::INFINITY

        # The bounds are maximums: cap the factor at 1.0 so that smaller images are not enlarged.
        scale_factor = [width_ratio, height_ratio, 1.0].min

        # Truncation can produce 0 on extreme aspect ratios; keep every side at least one pixel wide.
        [original.width, original.height].map { |side| [(side * scale_factor).to_i, 1].max }
      end

      # Computes the Height & Width from a page's media box. Falls back to the size of the initial image.
      # @param image [MiniMagick::Image] The initial image that will fit into the page.
      # @param media_box [Array<Integer>, nil]
      # @return [Array<Integer>]
      def self.calculate_dimensions_from_media_box(image, media_box)
        if media_box&.any?
          [
            media_box[2]&.to_i || image[:width].to_i,
            media_box[3]&.to_i || image[:height].to_i,
          ]
        else
          [image[:width].to_i, image[:height].to_i]
        end
      end

      # Transforms a PDF into a MagickImage. This is currently used for single-page PDFs.
      # @param pdf_stream [StringIO] Input stream.
      # @param image_quality [Integer] Quality to apply to the image.
      # @return [MiniMagick::Image]
      def self.pdf_to_magick_image(pdf_stream, image_quality)
        # Read from the start of the stream, as to_image does, so a previously consumed stream still works.
        pdf_stream.rewind if pdf_stream.respond_to?(:rewind)
        compressed_image = MiniMagick::Image.read(pdf_stream.read)
        compressed_image.format('jpg')
        compressed_image.quality image_quality.to_s
        compressed_image
      end
    end
  end
end
36 changes: 36 additions & 0 deletions lib/mindee/input/sources.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
require 'marcel'

require_relative '../pdf'
require_relative '../image'

module Mindee
module Input
Expand Down Expand Up @@ -126,6 +127,41 @@ def count_pdf_pages
pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
pdf_processor.pages.size
end

# Compresses the file in place: the current stream is replaced by the compressed output, which is
# then rewound.
# @param [Integer] quality Quality of the output file.
# @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
# @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
# @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
#   This will attempt to re-render PDF text over the rasterized original. If disabled, the operation is
#   ignored.
#   WARNING: this operation is strongly discouraged.
# @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
#   not. Needs force_source_text to work.
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
  @io_stream =
    if pdf?
      Mindee::PDF::PDFCompressor.compress_pdf(
        @io_stream,
        quality: quality,
        force_source_text_compression: force_source_text,
        disable_source_text: disable_source_text
      )
    else
      Mindee::Image::ImageCompressor.compress_image(
        @io_stream,
        quality: quality,
        max_width: max_width,
        max_height: max_height
      )
    end
  @io_stream.rewind
end

# Checks whether the file is a PDF that has source text.
# Non-PDF inputs are never handed to the PDF tooling and always yield false.
# @return [Boolean] True if the file is a PDF and has source text, false otherwise.
def source_text?
  return false unless pdf?

  Mindee::PDF::PDFTools.source_text?(@io_stream)
end
end

# Load a document from a path.
Expand Down
3 changes: 3 additions & 0 deletions lib/mindee/parsing/standard/position_field.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def initialize(prediction, page_id)

# rubocop:disable Metrics/CyclomaticComplexity
# rubocop:disable Metrics/PerceivedComplexity

# String representation.
# @return [String]
def to_s
return "Polygon with #{@polygon.size} points." if @polygon&.size&.positive?
Expand All @@ -40,6 +42,7 @@ def to_s

''
end

# rubocop:enable Metrics/CyclomaticComplexity
# rubocop:enable Metrics/PerceivedComplexity

Expand Down
4 changes: 3 additions & 1 deletion lib/mindee/pdf.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require_relative 'pdf/pdf_processing'
require_relative 'pdf/pdf_compressor'
require_relative 'pdf/pdf_processor'
require_relative 'pdf/pdf_tools'
Loading

0 comments on commit 1660d46

Please sign in to comment.