From a0fc8e0190b6a326d660864f88985715c1bcbad0 Mon Sep 17 00:00:00 2001 From: Sriranga Veeraraghavan Date: Mon, 25 Apr 2022 09:42:45 -0700 Subject: [PATCH] add fast mode, which may be useful for non-English text, such as German text --- README.txt | 21 ++++---- vocr.1 | 8 +++- vocr.m | 138 ++++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 122 insertions(+), 45 deletions(-) diff --git a/README.txt b/README.txt index dbe929d..3997de3 100644 --- a/README.txt +++ b/README.txt @@ -11,18 +11,22 @@ and derives its names from, the Vision framework (v for [V]ision). Usage: - vocr [-i [no|tab]] [-p] [-v] [-l [lang]] [files] + vocr [-v] [-f] [-p] [-i [no|tab]] [-l [lang]] [files] - If -i is specified with the 'no' option, vocr will not attempt - to indent any text that is OCR'ed. If -i is specified with the - 'tab' option, vocr will indent using tabs instead of spaces (by - default vocr indents using spaces). + If -v is specified, vocr runs in [v]erbose mode and outputs + errors and informational messages. + + If -f is specified, vocr uses the fast algorithm. This may be + useful when recognizing text in non-English languages, such as + German. If -p is specified, when OCR'ing a PDF, a page break (^L) will be inserted at the end of each page. - If -v is specified, vocr runs in [v]erbose mode and outputs - errors and informational messages. + If -i is specified with the 'no' option, vocr will not attempt + to indent any text that is OCR'ed. If -i is specified with the + 'tab' option, vocr will indent using tabs instead of spaces (by + default vocr indents using spaces). If -l is specified, on MacOSX 11.x (BigSur) and newer, vocr will ask the Vision framework to recognize the text in the @@ -34,8 +38,7 @@ Usage: 'it' - Italian 'pt' - Portuguese 'es' - Spanish - 'zh' - Simplified Chinese - 'zt' - Traditional Chinese + 'zh' - Chinese Build: diff --git a/vocr.1 b/vocr.1 index ac7f41f..c89a6fd 100644 --- a/vocr.1 +++ b/vocr.1 @@ -3,7 +3,7 @@ vocr - a simple utility for performing optical character recognition on images and PDFs .SH SYNOPSIS -vocr [-p] [-v] [-i [no|tab]] [-l [lang]] [files] +vocr [-p] [-v] [-f] [-i [no|tab]] [-l [lang]] [files] .SH DESCRIPTION vocr is a simple utility for performing optical character recognition (OCR) on images and PDFs. It prints any text found in the specified @@ -21,12 +21,16 @@ spaces). When OCR'ing a PDF, tells vocr to insert a page break (^L) at the end of each page. .TP +.B \-f +Use the [f]ast algorithm for ocr. This may be useful when recognizing +text in non-English languages, such as German. +.TP .B \-l [lang] If -l is specified, on MacOSX 11.x (BigSur) and newer, vocr will ask the Vision framework to recognize the text in the specified language. The supported language options are: 'de' (German), 'en' (English), 'fr' (French), 'it' (Italian), 'pt' (Portuguese), 'es' (Spanish), 'zh' -(Simplified Chinese), and 'zt' (Traditional Chinese). +(Chinese). .TP .B \-v Enables verbose mode - vocr will print out informational and/or error diff --git a/vocr.m b/vocr.m index 698f2fe..17a0b85 100644 --- a/vocr.m +++ b/vocr.m @@ -55,17 +55,19 @@ a copy of this software and associated documentation files (the /* command line options: + -f - use the [f]ast recognition algorithm -h - print usage / [h]elp -i [mode] - set the [i]ndent mode: - 'no' disables indenting - 'tab' indents with tabs (default is to use 4 spaces) - -l - specify the [l]anguage that the input is in (TODO) + 'no' - disables indenting + 'tab' - indents with tabs (default is to use 4 spaces) + -l - specify the [l]anguage to use for recognition -p - add a page break / [l]ine feed between pages -v - be [v]erbose */ enum { + gPgmOptFast = 'f', gPgmOptHelp = 'h', gPgmOptIndent = 'i', gPgmOptLang = 'l', @@ -75,35 +77,34 @@ a copy of this software and associated documentation files (the enum { - gLangChineseSimplified = 'c', /* zh-Hans */ - gLangGerman = 'd', /* de-DE */ - gLangEnglish = 'e', /* en-US */ - gLangFrench = 'f', /* fr-FR */ - gLangItalian = 'i', /* it-IT */ - gLangPortuguese = 'p', /* pt-BR */ - gLangSpanish = 's', /* es-ES */ - gLangChineseTraditional = 't' /* zh-Hant */ + gLangGerman = 'd', /* de-DE */ + gLangEnglish = 'e', /* en-US */ + gLangFrench = 'f', /* fr-FR */ + gLangItalian = 'i', /* it-IT */ + gLangPortuguese = 'p', /* pt-BR */ + gLangSpanish = 's', /* es-ES */ + gLangChinese = 'z', /* zh-Hans and zh-Hant */ }; -static const char *gPgmOpts = "hpvi:l:"; +static const char *gPgmOpts = "fhpvi:l:"; static const char *gPgmIndentNo = "no"; static const char *gPgmIndentTab = "tab"; static BOOL gQuiet = YES; -static const char *gPgmLangGerman = "de"; -static const char *gPgmLangEnglish = "en"; -static const char *gPgmLangFrench = "fr"; -static const char *gPgmLangItalian = "it"; -static const char *gPgmLangPortuguese = "pt"; -static const char *gPgmLangSpanish = "es"; -static const char *gPgmLangChineseSimplified = "zh"; -static const char *gPgmLangChineseTraditional = "zt"; +static const char *gPgmLangGerman = "de"; +static const char *gPgmLangEnglish = "en"; +static const char *gPgmLangFrench = "fr"; +static const char *gPgmLangItalian = "it"; +static const char *gPgmLangPortuguese = "pt"; +static const char *gPgmLangSpanish = "es"; +static const char *gPgmLangChinese = "zh"; /* ocr options */ typedef struct { BOOL addPageBreak; + BOOL fast; BOOL indent; BOOL indentWithTabs; int lang; @@ -135,14 +136,16 @@ static BOOL ocrImage(CGImageRef cgImage, static void printUsage(void) { fprintf(stderr, - "Usage: %s [-%c] | [-%c] [-%c] [-%c [%s|%s]] [files]\n", + "Usage: %s [-%c] | [-%c] [-%c] [-%c] [-%c [%s|%s]] [-%c [lang]] [files]\n", gPgmName, gPgmOptHelp, gPgmOptVerbose, + gPgmOptFast, gPgmOptPageBreak, gPgmOptIndent, gPgmIndentNo, - gPgmIndentTab); + gPgmIndentTab, + gPgmOptLang); } /* printError - print an error message */ @@ -339,8 +342,9 @@ static BOOL ocrImage(CGImageRef cgImage, unsigned int indentLevel = 0, k = 0; double prevStart = 0.0, prevEnd = 0.0; double curStart = 0.0, curEnd = 0.0; - BOOL indent = YES; + BOOL indent = YES, fast = NO, langCorrect = YES; NSString *indentStr = gIndentStr; + NSArray *langs = nil; #ifdef VOCR_IMG2TXT if (text == nil) @@ -352,11 +356,63 @@ static BOOL ocrImage(CGImageRef cgImage, if (opts != NULL) { + + /* is fast mode requested? */ + + fast = opts->fast; + + /* desired indent */ + indent = opts->indent; if (opts->indentWithTabs) { indentStr = @"\t"; } + + /* + on BigSur (11.x) and newer, try to set the + recognition language + */ + + if (@available(macos 11, *)) + { + switch(opts->lang) + { + case gLangGerman: + langs = [NSArray arrayWithObjects: @"de-DE", nil]; + break; + case gLangEnglish: + break; + case gLangFrench: + langs = [NSArray arrayWithObjects: @"fr-FR", nil]; + break; + case gLangItalian: + langs = [NSArray arrayWithObjects: @"it-IT", nil]; + break; + case gLangPortuguese: + langs = [NSArray arrayWithObjects: @"pt-BR", nil]; + break; + case gLangSpanish: + langs = [NSArray arrayWithObjects: @"es-ES", nil]; + break; + case gLangChinese: + langs = [NSArray arrayWithObjects: @"zh-Hans", + @"zh-Hant", + @"en-US", + nil]; + + /* + disable language correction for Chinese, see: + https://developer.apple.com/documentation/vision/recognizing_text_in_images + */ + + langCorrect = NO; + break; + default: + langs = nil; + break; + } + } } #ifdef VOCR_IMG2TXT @@ -402,18 +458,28 @@ static BOOL ocrImage(CGImageRef cgImage, } /* - enable accurate recognition and language correction + enable fast/accurate recognition and language correction https://developer.apple.com/documentation/vision/vnrequesttextrecognitionlevel?language=objc https://developer.apple.com/documentation/vision/vnrecognizetextrequest/3166773-useslanguagecorrection?language=objc */ - [request setRecognitionLevel: - VNRequestTextRecognitionLevelAccurate]; - [request setUsesLanguageCorrection: YES]; + if (fast) + { + [request setRecognitionLevel: + VNRequestTextRecognitionLevelFast]; + } + else + { + [request setRecognitionLevel: + VNRequestTextRecognitionLevelAccurate]; + } + + [request setUsesLanguageCorrection: langCorrect]; /* use the version 2 algorithm on MacOSX 11+, which supports - multiple languages: + multiple languages, and, if an alternate language is requested + set that as well: https://developer.apple.com/documentation/vision/vnrecognizetextrequestrevision2?language=objc https://stackoverflow.com/questions/63813709 @@ -422,6 +488,10 @@ static BOOL ocrImage(CGImageRef cgImage, if (@available(macos 11, *)) { [request setRevision: VNRecognizeTextRequestRevision2]; + if (langs != nil) + { + [request setRecognitionLanguages: langs]; + } } else { @@ -893,6 +963,7 @@ int main(int argc, char * const argv[]) return 1; } + options.fast = NO; options.addPageBreak = NO; options.indent = YES; options.indentWithTabs = NO; @@ -905,6 +976,9 @@ int main(int argc, char * const argv[]) case gPgmOptHelp: optHelp = YES; break; + case gPgmOptFast: + options.fast = YES; + break; case gPgmOptPageBreak: options.addPageBreak = YES; break; @@ -952,13 +1026,9 @@ int main(int argc, char * const argv[]) { options.lang = gLangSpanish; } - else if (strcmp(optarg, gPgmLangChineseSimplified) == 0) - { - options.lang = gLangChineseSimplified; - } - else if (strcmp(optarg, gPgmLangChineseTraditional) == 0) + else if (strcmp(optarg, gPgmLangChinese) == 0) { - options.lang = gLangChineseTraditional; + options.lang = gLangChinese; } else {