Skip to content

Commit

Permalink
Specify language for text extraction
Browse files Browse the repository at this point in the history
For uniformity with Tesseract execution, Tika extraction will raise
if the language extension is not available

Close #302
  • Loading branch information
tagliala committed Sep 23, 2024
1 parent c2c1ba6 commit ac1fac8
Show file tree
Hide file tree
Showing 12 changed files with 149 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ tmp/*

# Docker
docker/colore/variables.env

# Generated Tika configurations for language-specific OCR
config/tika/ocr
3 changes: 3 additions & 0 deletions config/app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ libreoffice_path: <%= ENV['LIBREOFFICE_PATH'] %>
tesseract_path: <%= ENV['TESSERACT_PATH'] %>
tika_path: <%= ENV['TIKA_PATH'] %>
wkhtmltopdf_path: <%= ENV['WKHTMLTOPDF_PATH'] %>

# Other settings
tika_config_directory: <%= ENV['TIKA_CONFIG_DIRECTORY'] %>
wkhtmltopdf_params: '-d 100 --encoding UTF-8'
1 change: 1 addition & 0 deletions lib/colore.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
require_relative 'document'
require_relative 'heathen'
require_relative 'sidekiq_workers'
require_relative 'tika_config'
5 changes: 5 additions & 0 deletions lib/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ class C_
attr_accessor :tika_path
# Path to the wkhtmltopdf binary
attr_accessor :wkhtmltopdf_path

# Path to the tika config directory
attr_accessor :tika_config_directory
# Params for wkhtmltopdf
attr_accessor :wkhtmltopdf_params

Expand All @@ -65,6 +68,8 @@ def self.config
c.tesseract_path = yaml['tesseract_path'] || 'tesseract'
c.tika_path = yaml['tika_path'] || 'tika'
c.wkhtmltopdf_path = yaml['wkhtmltopdf_path'] || 'wkhtmltopdf'

c.tika_config_directory = yaml['tika_config_directory'] || '../config/tika'
c.wkhtmltopdf_params = yaml['wkhtmltopdf_params'] || ''

c
Expand Down
1 change: 1 addition & 0 deletions lib/heathen/processor_methods/libreoffice.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def libreoffice(format:)
if to_suffix == 'txt'
executioner.execute(
Colore::C_.tika_path,
"--config=#{Colore::TikaConfig.path_for(job.language)}",
'--text',
job.content_file,
binary: true
Expand Down
1 change: 1 addition & 0 deletions lib/heathen/processor_methods/pdftotext.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def pdftotext

executioner.execute(
Colore::C_.tika_path,
"--config=#{Colore::TikaConfig.path_for(job.language)}",
'--text',
job.content_file,
binary: true
Expand Down
61 changes: 61 additions & 0 deletions lib/tika_config.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# frozen_string_literal: true

require 'fileutils'
require 'pathname'

module Colore
  # Locates, and lazily generates, the Tika configuration files used to run
  # OCR-backed text extraction in a specific language.
  #
  # Generated files are stored as
  # +<tika_config_directory>/ocr/<VERSION>/tika.<alpha3>.xml+.
  module TikaConfig
    # Version of the configuration template. It is part of the generated
    # file path (see .path_for).
    VERSION = 'v1'

    # Language code used when the requested language cannot be resolved
    DEFAULT_LANGUAGE = 'eng'

    # Tika configuration template. The +language_alpha3+ reference is
    # substituted through Kernel#format when a file is generated.
    TEMPLATE = <<~XML
      <?xml version="1.0" encoding="UTF-8"?>
      <properties>
      <parsers>
      <parser class="org.apache.tika.parser.DefaultParser"></parser>
      <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
      <params>
      <param name="language" type="string">%<language_alpha3>s</param>
      </params>
      </parser>
      </parsers>
      </properties>
    XML

    # Returns the path of the Tika configuration file to use for OCR in the
    # given language, generating the file first when it does not exist yet.
    #
    # @param [String] language The language code, passed as-is to
    #   Colore::Utils.language_alpha3 to obtain the three-letter code.
    #
    # @return [Pathname] The configuration file for the resolved language, or
    #   the one for DEFAULT_LANGUAGE when the lookup returns nil/false.
    def self.path_for(language)
      path_for!(Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE)
    end

    # Absolute location of the configured Tika config directory. A relative
    # setting is resolved against the directory containing this file.
    #
    # @return [Pathname]
    def self.tika_config_path
      Pathname.new(File.expand_path(Colore::C_.tika_config_directory, __dir__))
    end

    # Returns the configuration file for an already-resolved three-letter
    # language code, writing it from TEMPLATE when it is missing. An existing
    # file is never overwritten.
    #
    # NOTE(review): the existence check and the write are two separate steps,
    # so concurrent callers may both write the file — confirm this is
    # acceptable for the workers that call it.
    #
    # @param [String] language_alpha3 Three-letter language code
    # @return [Pathname]
    def self.path_for!(language_alpha3)
      versioned_directory = tika_config_path.join('ocr', VERSION)
      config_file = versioned_directory.join("tika.#{language_alpha3}.xml")

      unless config_file.file?
        FileUtils.mkdir_p(versioned_directory)
        File.write(config_file, format(TEMPLATE, language_alpha3: language_alpha3))
      end

      config_file
    end

    private_class_method :tika_config_path, :path_for!
  end
end
Binary file added spec/fixtures/heathen/quickfox.ar.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added spec/fixtures/heathen/quickfox.ar.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions spec/fixtures/heathen/quickfox.ar.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
الثعلب البني السريع مفتون بالكلاب الكسولة
15 changes: 14 additions & 1 deletion spec/heathen/processor_methods/pdftotext_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

RSpec.describe Heathen::Processor do
let(:content) { fixture('heathen/quickfox.pdf').read }
let(:job) { Heathen::Job.new 'foo', content, 'en' }
let(:job) { Heathen::Job.new 'foo', content, language }
let(:language) { 'en' }
let(:processor) { described_class.new job: job, logger: spec_logger }

after do
Expand All @@ -14,7 +15,19 @@
describe '#pdftotext' do
it 'converts PDF to TXT' do
processor.pdftotext
expect(job.content).to eq 'The quick brown fox jumps lazily over the dog'
expect(job.content.mime_type).to eq 'text/plain; charset=us-ascii'
end

context 'with Arabic files' do
let(:content) { fixture('heathen/quickfox.ar.pdf').read }
let(:language) { 'ar' }

it 'extracts Arabic text from images' do
processor.pdftotext
expect(job.content).to eq fixture('heathen/quickfox.ar.txt').read.strip.force_encoding(Encoding::ASCII_8BIT)
expect(job.content.mime_type).to eq 'text/plain; charset=utf-8'
end
end
end
end
59 changes: 59 additions & 0 deletions spec/lib/tika_config_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# frozen_string_literal: true

require 'spec_helper'
require 'fileutils'
require 'pathname'

# Specs for Colore::TikaConfig.path_for: path resolution per language, fallback
# to the default language, and lazy generation of the configuration file.
RSpec.describe Colore::TikaConfig do
# Value returned by the stubbed tika_config_directory setting.
let(:tika_config_directory) { '../tmp/tika' }
# Absolute path the examples expect the stubbed setting to resolve to
# (two directory levels above this spec's directory, then tmp/tika).
# NOTE(review): presumably the same directory TikaConfig derives from
# '../tmp/tika' relative to its own location — confirm.
let(:tika_test_config_path) { Pathname.new(File.expand_path('../../tmp/tika', __dir__)) }

before do
# Point the configuration at the test directory for every example.
allow(Colore::C_.config).to receive(:tika_config_directory).and_return tika_config_directory
# Create, then remove, the test directory so each example starts without it.
FileUtils.mkdir_p tika_test_config_path
FileUtils.rm_rf tika_test_config_path
end

after do
# Remove whatever configuration files the example generated.
FileUtils.rm_rf tika_test_config_path
end

describe '.path_for' do
subject(:path_for) { described_class.path_for(language) }

context 'when the language is found' do
let(:language) { 'fr' }

before do
# Stub the language lookup so the example does not depend on the real one.
allow(Colore::Utils).to receive(:language_alpha3).with('fr').and_return('fra')
end

it 'returns the correct configuration file path' do
expect(path_for).to eq tika_test_config_path.join('ocr', described_class::VERSION, 'tika.fra.xml')
end
end

context 'when the language is not found' do
# No stub here: this relies on the real Colore::Utils.language_alpha3
# returning nil/false for 'unknown' — TODO confirm.
let(:language) { 'unknown' }

it 'returns the default configuration file path' do
expect(path_for).to eq tika_test_config_path.join('ocr', described_class::VERSION, "tika.#{described_class::DEFAULT_LANGUAGE}.xml")
end
end

context 'when the configuration file is already present' do
# NOTE(review): expects the real language lookup to map 'en' to 'eng',
# as the file name matched below shows — confirm.
let(:language) { 'en' }

before do
# Spy on File.write while still performing the real write, so the file
# exists on disk by the time path_for is called again.
allow(File).to receive(:write)
.with(tika_test_config_path.join('ocr', described_class::VERSION, 'tika.eng.xml'), an_instance_of(String))
.and_call_original
end

it 'does not overwrite it' do
# The first call generates the file; the second must find it and not write.
2.times { described_class.path_for(language) }
expect(File).to have_received(:write).once
end
end
end
end

0 comments on commit ac1fac8

Please sign in to comment.