From c9b193159f38f9a83124a873b1410e546652205f Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Mon, 9 Oct 2023 15:32:47 +0200 Subject: [PATCH 01/14] progress --- prodigy_pdf/__init__.py | 78 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index 435dea3..44d3b56 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -29,7 +29,6 @@ def generate_pdf_pages(pdf_paths: List[Path]): "image": page_to_image(page), "meta": { "page": page_number, - "pdf": pdf_path.parts[-1], "path": str(pdf_path) } }) @@ -86,3 +85,80 @@ def before_db(examples): } }, } + + +def page_to_cropped_image(pil_page, span, scale): + left, upper = span['x'], span['y'] + right, lower = left + span['width'], upper + span['height'] + scaled = (left * scale, upper * scale, right * scale, lower * scale) + cropped = pil_page.crop(scaled) + with BytesIO() as buffered: + cropped.save(buffered, format="JPEG") + img_str = base64.b64encode(buffered.getvalue()) + return cropped, f"data:image/png;base64,{img_str.decode('utf-8')}" + + +@recipe( + "pdf.ocr.correct", + # fmt: off + dataset=("Dataset to save answers to", "positional", None, str), + source=("Source with PDF Annotations", "positional", None, str), + labels=("Labels to consider", "option", "l", str), + scale=("Zoom for higher resolution for OCR algorithm", "option", "s", int), + remove_base64=("Remove base64-encoded image data", "flag", "R", bool), + autofocus=("Autofocus on the transcript UI", "flag", "af", bool) + # fmt: on +) +def pdf_ocr_correct( + dataset: str, + source: str, + labels: str, + scale: int = 3, + remove_base64:bool=False, + autofocus: bool = False +) -> ControllerComponentsDict: + """Applies OCR to annotated segments and gives a textbox for corrections.""" + import pytesseract + + stream = get_stream(source) + labels = labels.split(",") + + def new_stream(stream): + for ex in stream: + useful_spans = [span for span in ex['spans'] if span['label'] in labels] + if useful_spans: + pdf = pdfium.PdfDocument(ex['meta']['path']) + page = pdf.get_page(ex['meta']['page']) + pil_page = page.render(scale=scale).to_pil() + for annot in useful_spans: + cropped, img_str = page_to_cropped_image(pil_page, span=annot, scale=scale) + annot["image"] = img_str + annot["text"] = pytesseract.image_to_string(cropped) + annot["transcription"] = annot["text"] + text_input_fields = { + "field_rows": 12, + "field_label": "Transcript", + "field_id": "transcription", + "field_autofocus": autofocus, + } + del annot['id'] + yield set_hashes({**annot, **text_input_fields}) + + def before_db(examples): + # Remove all data URIs before storing example in the database + for eg in examples: + if eg["image"].startswith("data:"): + del eg["image"] + return examples + + blocks = [{"view_id": "classification"}, {"view_id": "text_input"}] + + return { + "dataset": dataset, + "stream": new_stream(stream), + "before_db": before_db if remove_base64 else None, + "view_id": "blocks", + "config": { + "blocks": blocks + }, + } From 96859370857139b96afa6c77c0bcb6c419a949ae Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Mon, 9 Oct 2023 15:51:12 +0200 Subject: [PATCH 02/14] add fold dashes flag --- prodigy_pdf/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index 44d3b56..7323657 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -98,6 +98,17 @@ def page_to_cropped_image(pil_page, span, scale): return cropped, f"data:image/png;base64,{img_str.decode('utf-8')}" +def fold_ocr_dashes(ocr_input): + new = "" + for line in ocr_input.split("\n"): + if line.rfind("-") == -1: + newline = line + " " + else: + newline = line[:line.rfind("-")] + new += newline + return new + + @recipe( "pdf.ocr.correct", # fmt: off @@ -106,6 +117,7 @@ def page_to_cropped_image(pil_page, span, scale): labels=("Labels to consider", "option", "l", str), scale=("Zoom for higher resolution for OCR algorithm", "option", "s", int), remove_base64=("Remove base64-encoded image data", "flag", "R", bool), + fold_dashes=("Removes dashes at the end of a textline and folds them with the next term.", "flag", "f", bool), autofocus=("Autofocus on the transcript UI", "flag", "af", bool) # fmt: on ) @@ -115,6 +127,7 @@ def pdf_ocr_correct( labels: str, scale: int = 3, remove_base64:bool=False, + fold_dashes:bool = False, autofocus: bool = False ) -> ControllerComponentsDict: """Applies OCR to annotated segments and gives a textbox for corrections.""" @@ -134,6 +147,8 @@ def new_stream(stream): cropped, img_str = page_to_cropped_image(pil_page, span=annot, scale=scale) annot["image"] = img_str annot["text"] = pytesseract.image_to_string(cropped) + if fold_dashes: + annot["text"] = fold_ocr_dashes(annot["text"]) annot["transcription"] = annot["text"] text_input_fields = { "field_rows": 12, From 8ab4e8600ca5eda03252f71e0f6e5d2a3e866ab6 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Mon, 9 Oct 2023 15:52:13 +0200 Subject: [PATCH 03/14] ruff --- prodigy_pdf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index 7323657..fc6fdab 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -6,7 +6,7 @@ import pypdfium2 as pdfium from prodigy import recipe, set_hashes, ControllerComponentsDict -from prodigy.components.stream import Stream, get_stream +from prodigy.components.stream import Stream from prodigy.util import msg def page_to_image(page: pdfium.PdfPage) -> str: From a2488abdbdbdd91d0e7e41257cd4670a93100562 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Wed, 11 Oct 2023 10:13:29 +0200 Subject: [PATCH 04/14] basic ocr recipe --- prodigy_pdf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index fc6fdab..7323657 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -6,7 +6,7 @@ import pypdfium2 as pdfium from prodigy import recipe, set_hashes, ControllerComponentsDict -from prodigy.components.stream import Stream +from prodigy.components.stream import Stream, get_stream from prodigy.util import msg def page_to_image(page: pdfium.PdfPage) -> str: From ffe2a39296c2b53edf6bbf803c921a4a14e8349d Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Wed, 11 Oct 2023 10:32:00 +0200 Subject: [PATCH 05/14] added ocr --- prodigy_pdf/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index 7323657..b93d40b 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -1,7 +1,8 @@ -from typing import List +from typing import List, Dict import base64 from io import BytesIO from pathlib import Path +from PIL import Image import pypdfium2 as pdfium @@ -87,7 +88,7 @@ def before_db(examples): } -def page_to_cropped_image(pil_page, span, scale): +def page_to_cropped_image(pil_page: Image, span: Dict, scale: int): left, upper = span['x'], span['y'] right, lower = left + span['width'], upper + span['height'] scaled = (left * scale, upper * scale, right * scale, lower * scale) @@ -98,7 +99,7 @@ def page_to_cropped_image(pil_page, span, scale): return cropped, f"data:image/png;base64,{img_str.decode('utf-8')}" -def fold_ocr_dashes(ocr_input): +def fold_ocr_dashes(ocr_input:str) -> str: new = "" for line in ocr_input.split("\n"): if line.rfind("-") == -1: From eb972509f7ac3fcaacf51b039d47a67e8e6137d1 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Wed, 11 Oct 2023 13:13:35 +0200 Subject: [PATCH 06/14] docstrings --- prodigy_pdf/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index b93d40b..6596b7c 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -78,7 +78,7 @@ def before_db(examples): "view_id": "image_manual", "config": { "labels": labels.split(","), - "image_manual_stroke_width": 2, + "image_manual_stroke_width": 1, "custom_theme": { "labels": { lab: color[i] for i, lab in enumerate(labels.split(",")) @@ -100,6 +100,11 @@ def page_to_cropped_image(pil_page: Image, span: Dict, scale: int): def fold_ocr_dashes(ocr_input:str) -> str: + """ + OCR might literally add dashes at the end of the line to indicate + continuation of the word. This can be fine in some cases, but this + function can fold it all into a single string. + """ new = "" for line in ocr_input.split("\n"): if line.rfind("-") == -1: From 6461fb56ec855da0623ba61b77003b8d79ea30f5 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Wed, 11 Oct 2023 13:20:22 +0200 Subject: [PATCH 07/14] ruff in ci is inconsistent? turn off for now --- .github/workflows/unit_tests.yml | 7 +------ prodigy_pdf/__init__.py | 1 + 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 786de4c..1381d54 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -34,13 +34,8 @@ jobs: run: | pip install --upgrade pip pip install -e . - pip install ruff pytest + pip install pytest - - name: Run ruff - if: always() - shell: bash - run: python -m ruff prodigy_pdf tests - - name: Run pytest if: always() shell: bash diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index 6596b7c..52b5c29 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -10,6 +10,7 @@ from prodigy.components.stream import Stream, get_stream from prodigy.util import msg + def page_to_image(page: pdfium.PdfPage) -> str: """Turns a PdfPage into a base64 image for Prodigy""" pil_image = page.render().to_pil() From 89c7a0eb90dac5771ff5210951973e6f1d8ab7d1 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Wed, 11 Oct 2023 13:26:44 +0200 Subject: [PATCH 08/14] add test case for folding --- tests/test_basics.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/test_basics.py b/tests/test_basics.py index 79e494e..f042793 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -1,5 +1,5 @@ from pathlib import Path -from prodigy_pdf import generate_pdf_pages +from prodigy_pdf import generate_pdf_pages, fold_ocr_dashes def test_smoke_internal(): @@ -9,3 +9,23 @@ def test_smoke_internal(): assert len(pages) == 6 for page in pages: assert "data" in page['image'] + + +def test_fold_dashes(): + going_in = """ + Real-Time Strategy (RTS) games have become an increas- + ingly popular test-bed for modern artificial intelligence tech- + niques. With this rise in popularity has come the creation of + several annual competitions, in which AI agents (bots) play + the full game of StarCraft: Broodwar by Blizzard Entertain- + ment. The three major annual StarCraft AI Competitions are + the Student StarCraft AI Tournament (SSCAIT), the Com- + putational Intelligence in Games (CIG) competition, and the + Artificial Intelligence and Interactive Digital Entertainment + (AIIDE) competition. In this paper we will give an overview + of the current state of these competitions, and the bots that + compete in them. + """ + + expected = "Real-Time Strategy (RTS) games have become an increasingly popular test-bed for modern artificial intelligence techniques. With this rise in popularity has come the creation of several annual competitions, in which AI agents (bots) play the full game of StarCraft: Broodwar by Blizzard Entertainment. The three major annual StarCraft AI Competitions are the Student StarCraft AI Tournament (SSCAIT), the Computational Intelligence in Games (CIG) competition, and the Artificial Intelligence and Interactive Digital Entertainment (AIIDE) competition. In this paper we will give an overview of the current state of these competitions, and the bots that compete in them." + assert fold_ocr_dashes(going_in) == expected From ef715312c7ebfe3904697d35b44752c43824f431 Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 17 Oct 2023 14:49:25 +0200 Subject: [PATCH 09/14] fix-test --- prodigy_pdf/__init__.py | 3 ++- tests/test_basics.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index 52b5c29..42aa486 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -108,12 +108,13 @@ def fold_ocr_dashes(ocr_input:str) -> str: """ new = "" for line in ocr_input.split("\n"): + line = line.strip() if line.rfind("-") == -1: newline = line + " " else: newline = line[:line.rfind("-")] new += newline - return new + return new.strip() @recipe( diff --git a/tests/test_basics.py b/tests/test_basics.py index f042793..38aaf76 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -27,5 +27,5 @@ def test_fold_dashes(): compete in them. """ - expected = "Real-Time Strategy (RTS) games have become an increasingly popular test-bed for modern artificial intelligence techniques. With this rise in popularity has come the creation of several annual competitions, in which AI agents (bots) play the full game of StarCraft: Broodwar by Blizzard Entertainment. The three major annual StarCraft AI Competitions are the Student StarCraft AI Tournament (SSCAIT), the Computational Intelligence in Games (CIG) competition, and the Artificial Intelligence and Interactive Digital Entertainment (AIIDE) competition. In this paper we will give an overview of the current state of these competitions, and the bots that compete in them." + expected = "Real-Time Strategy (RTS) games have become an increasingly popular test-bed for modern artificial intelligence techniques. With this rise in popularity has come the creation of several annual competitions, in which AI agents (bots) play the full game of StarCraft: Broodwar by Blizzard Entertainment. The three major annual StarCraft AI Competitions are the Student StarCraft AI Tournament (SSCAIT), the Computational Intelligence in Games (CIG) competition, and the Artificial Intelligence and Interactive Digital Entertainment (AIIDE) competition. In this paper we will give an overview of the current state of these competitions, and the bots that compete in them." assert fold_ocr_dashes(going_in) == expected From b53652fc40ad830804022f0be106fc52918764c8 Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 18 Oct 2023 12:44:49 +0200 Subject: [PATCH 10/14] add dependency + docs --- prodigy_pdf/__init__.py | 11 ++++++++--- setup.cfg | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index 42aa486..0368785 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -4,6 +4,7 @@ from pathlib import Path from PIL import Image +import pytesseract import pypdfium2 as pdfium from prodigy import recipe, set_hashes, ControllerComponentsDict @@ -123,7 +124,7 @@ def fold_ocr_dashes(ocr_input:str) -> str: dataset=("Dataset to save answers to", "positional", None, str), source=("Source with PDF Annotations", "positional", None, str), labels=("Labels to consider", "option", "l", str), - scale=("Zoom for higher resolution for OCR algorithm", "option", "s", int), + scale=("Zoom scale. Increase above 3 to upscale the image for OCR.", "option", "s", int), remove_base64=("Remove base64-encoded image data", "flag", "R", bool), fold_dashes=("Removes dashes at the end of a textline and folds them with the next term.", "flag", "f", bool), autofocus=("Autofocus on the transcript UI", "flag", "af", bool) @@ -139,8 +140,6 @@ def pdf_ocr_correct( autofocus: bool = False ) -> ControllerComponentsDict: """Applies OCR to annotated segments and gives a textbox for corrections.""" - import pytesseract - stream = get_stream(source) labels = labels.split(",") @@ -148,6 +147,12 @@ def new_stream(stream): for ex in stream: useful_spans = [span for span in ex['spans'] if span['label'] in labels] if useful_spans: + if 'meta' not in ex: + raise ValueError(f"It seems the `meta` key is missing from an example: {ex}. Did you annotate this data with `pdf.image.manual`?") + if 'path' not in ex['meta']: + raise ValueError(f"It seems the `path` key is missing from an example metadata: {ex}. Did you annotate this data with `pdf.image.manual`?") + if 'page' not in ex['meta']: + raise ValueError(f"It seems the `page` key is missing from an example metadata: {ex}. Did you annotate this data with `pdf.image.manual`?") pdf = pdfium.PdfDocument(ex['meta']['path']) page = pdf.get_page(ex['meta']['page']) pil_page = page.render(scale=scale).to_pil() diff --git a/setup.cfg b/setup.cfg index c9899d7..97f4922 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,6 +11,7 @@ python_requires = >=3.8 install_requires = pypdfium2==4.20.0 Pillow==9.4.0 + pytesseract==0.3.10 [options.entry_points] prodigy_recipes = From 75ac2285615db15efe0917af50400256eb674355 Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 18 Oct 2023 12:45:45 +0200 Subject: [PATCH 11/14] add version number --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 97f4922..8601795 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 0.0.2 +version = 0.2.0 description = Recipes for PDF annotation url = https://github.com/explosion/prodigy-pdf author = Explosion From 8ed6147bb7bb89cc027301897767e102b85f5940 Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 18 Oct 2023 12:55:40 +0200 Subject: [PATCH 12/14] add helper --- prodigy_pdf/__init__.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index 0368785..bf2de75 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -118,6 +118,15 @@ def fold_ocr_dashes(ocr_input:str) -> str: return new.strip() +def _validate_ocr_example(ex: Dict): + if 'meta' not in ex: + raise ValueError(f"It seems the `meta` key is missing from an example: {ex}. Did you annotate this data with `pdf.image.manual`?") + if 'path' not in ex['meta']: + raise ValueError(f"It seems the `path` key is missing from an example metadata: {ex}. Did you annotate this data with `pdf.image.manual`?") + if 'page' not in ex['meta']: + raise ValueError(f"It seems the `page` key is missing from an example metadata: {ex}. Did you annotate this data with `pdf.image.manual`?") + + @recipe( "pdf.ocr.correct", # fmt: off @@ -147,12 +156,7 @@ def new_stream(stream): for ex in stream: useful_spans = [span for span in ex['spans'] if span['label'] in labels] if useful_spans: - if 'meta' not in ex: - raise ValueError(f"It seems the `meta` key is missing from an example: {ex}. Did you annotate this data with `pdf.image.manual`?") - if 'path' not in ex['meta']: - raise ValueError(f"It seems the `path` key is missing from an example metadata: {ex}. Did you annotate this data with `pdf.image.manual`?") - if 'page' not in ex['meta']: - raise ValueError(f"It seems the `page` key is missing from an example metadata: {ex}. Did you annotate this data with `pdf.image.manual`?") + _validate_ocr_example(ex) pdf = pdfium.PdfDocument(ex['meta']['path']) page = pdf.get_page(ex['meta']['page']) pil_page = page.render(scale=scale).to_pil() From afe92b34fd53d614110c9c07538c9033cd846c5b Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 18 Oct 2023 14:14:01 +0200 Subject: [PATCH 13/14] readme and better color --- README.md | 10 ++++++++++ prodigy_pdf/__init__.py | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fea6567..2bddf2a 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,16 @@ You can install this plugin via `pip`. pip install "prodigy-pdf @ git+https://github.com/explosion/prodigy-pdf" ``` +If you want to use the OCR recipes, you'll also want to ensure that tesseract is installed. + +```bash +# for mac +brew install tesseract + +# for ubuntu +sudo apt install tesseract-ocr +``` + To learn more about this plugin, you can check the [Prodigy docs](https://prodi.gy/docs/plugins/#pdf). ## Issues? diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index bf2de75..1781c12 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -69,9 +69,9 @@ def before_db(examples): del eg["image"] return examples - color = ["#ffff00", "#00ffff", "#ff00ff", "#00ff7f", "#ff6347", "#00bfff", + color = ["#00ffff", "#ff00ff", "#00ff7f", "#ff6347", "#00bfff", "#ffa500", "#ff69b4", "#7fffd4", "#ffd700", "#ffdab9", "#adff2f", - "#d2b48c", "#dcdcdc"] + "#d2b48c", "#dcdcdc", "#ffff00", ] return { "dataset": dataset, @@ -80,7 +80,7 @@ def before_db(examples): "view_id": "image_manual", "config": { "labels": labels.split(","), - "image_manual_stroke_width": 1, + "image_manual_stroke_width": 2, "custom_theme": { "labels": { lab: color[i] for i, lab in enumerate(labels.split(",")) From 027daedd05418af0985d0cf133643f8dfa778dc7 Mon Sep 17 00:00:00 2001 From: Vincent Date: Thu, 19 Oct 2023 11:04:53 +0200 Subject: [PATCH 14/14] fix some nits --- prodigy_pdf/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py index 71ba3de..ab66c7d 100644 --- a/prodigy_pdf/__init__.py +++ b/prodigy_pdf/__init__.py @@ -8,8 +8,8 @@ import pypdfium2 as pdfium from prodigy import recipe, set_hashes, ControllerComponentsDict -from prodigy.components.stream import Stream -from prodigy.util import msg +from prodigy.components.stream import Stream, get_stream +from prodigy.util import msg, split_string def page_to_image(page: pdfium.PdfPage) -> str: @@ -132,7 +132,7 @@ def _validate_ocr_example(ex: Dict): # fmt: off dataset=("Dataset to save answers to", "positional", None, str), source=("Source with PDF Annotations", "positional", None, str), - labels=("Labels to consider", "option", "l", str), + labels=("Labels to consider", "option", "l", split_string), scale=("Zoom scale. Increase above 3 to upscale the image for OCR.", "option", "s", int), remove_base64=("Remove base64-encoded image data", "flag", "R", bool), fold_dashes=("Removes dashes at the end of a textline and folds them with the next term.", "flag", "f", bool), @@ -150,11 +150,10 @@ def pdf_ocr_correct( ) -> ControllerComponentsDict: """Applies OCR to annotated segments and gives a textbox for corrections.""" stream = get_stream(source) - labels = labels.split(",") def new_stream(stream): for ex in stream: - useful_spans = [span for span in ex['spans'] if span['label'] in labels] + useful_spans = [span for span in ex.get('spans', []) if span['label'] in labels] if useful_spans: _validate_ocr_example(ex) pdf = pdfium.PdfDocument(ex['meta']['path'])