diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml index 402a2642a..737234bdf 100644 --- a/data-connector-lib/pyproject.toml +++ b/data-connector-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_connector" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10" keywords = [ "data", diff --git a/data-connector-lib/src/dpk_connector/core/crawler.py b/data-connector-lib/src/dpk_connector/core/crawler.py index f024e63b1..491806398 100644 --- a/data-connector-lib/src/dpk_connector/core/crawler.py +++ b/data-connector-lib/src/dpk_connector/core/crawler.py @@ -74,6 +74,7 @@ def async_crawl( user_agent: str = "", headers: dict[str, str] = {}, allow_domains: Collection[str] = (), + subdomain_focus: bool = False, path_focus: bool = False, allow_mime_types: Collection[str] = ( "application/pdf", @@ -96,6 +97,7 @@ def async_crawl( user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)". headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary. allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs. + subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified. path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted. allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain". disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection. 
@@ -140,6 +142,7 @@ def async_crawl( seed_urls=seed_urls, callback=on_downloaded, allow_domains=allow_domains, + subdomain_focus=subdomain_focus, path_focus=path_focus, allow_mime_types=allow_mime_types, disallow_mime_types=disallow_mime_types, @@ -155,6 +158,7 @@ def crawl( user_agent: str = "", headers: dict[str, str] = {}, allow_domains: Collection[str] = (), + subdomain_focus: bool = False, path_focus: bool = False, allow_mime_types: Collection[str] = ( "application/pdf", @@ -177,6 +181,7 @@ def crawl( user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)". headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary. allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs. + subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified. path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted. allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain". disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection. 
@@ -198,6 +203,7 @@ def on_completed(result: Any): user_agent, headers, allow_domains, + subdomain_focus, path_focus, allow_mime_types, disallow_mime_types, diff --git a/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py b/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py index f24d4088b..de18ab596 100644 --- a/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py +++ b/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py @@ -28,6 +28,7 @@ get_content_type, get_etld1, get_focus_path, + get_fqdn, is_allowed_path, urlparse_cached, ) @@ -42,6 +43,7 @@ def __init__( self, seed_urls: Collection[str], allow_domains: Collection[str] = (), + subdomain_focus: bool = False, path_focus: bool = False, allow_mime_types: Collection[str] = (), disallow_mime_types: Collection[str] = (), @@ -88,11 +90,15 @@ def __init__( self.focus_paths.add(path) # Domains and mime types filtering - self.allowed_domains = set( - allow_domains - if len(allow_domains) > 0 - else [get_etld1(url) for url in seed_urls] - ) + if allow_domains: + self.allowed_domains = set(allow_domains) + elif subdomain_focus: + self.allowed_domains = set() + for url in seed_urls: + if fqdn := get_fqdn(url): + self.allowed_domains.add(fqdn) + else: + self.allowed_domains = set(get_etld1(url) for url in seed_urls) self.allow_mime_types = set( [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else () ) @@ -155,7 +161,9 @@ def start_requests(self): ) def _parse_sitemap(self, response: Response): - yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True) + yield ConnectorItem( + dropped=False, downloaded=False, system_request=True, sitemap=True + ) seed_url = response.meta["seed_url"] diff --git a/data-connector-lib/src/dpk_connector/core/utils.py b/data-connector-lib/src/dpk_connector/core/utils.py index d2dfa760d..50a9c9981 100644 --- a/data-connector-lib/src/dpk_connector/core/utils.py +++ 
b/data-connector-lib/src/dpk_connector/core/utils.py @@ -57,6 +57,11 @@ def get_etld1(url: str) -> str: return f"{ext.domain}.{ext.suffix}" +def get_fqdn(url: str) -> str: + ext = tldextract.extract(url) + return ext.fqdn + + def get_focus_path(url: str) -> str | None: parts = urlparse_cached(url) if len(parts.path.split("/")) > 2: diff --git a/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py index a93785a82..337e67791 100644 --- a/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py +++ b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py @@ -33,6 +33,21 @@ def crawler() -> Crawler: return crawler +def test_init_subdomain_focus(): + spider = BaseSitemapSpider( + seed_urls=( + "http://blog.example.com/", + "http://contents.example.com/", + ), + subdomain_focus=True, + ) + assert spider.seed_urls == { + "http://blog.example.com/", + "http://contents.example.com/", + } + assert spider.allowed_domains == {"blog.example.com", "contents.example.com"} + + def test_init_path_focus(): spider = BaseSitemapSpider( seed_urls=( diff --git a/data-connector-lib/test/dpk_connector/core/test_utils.py b/data-connector-lib/test/dpk_connector/core/test_utils.py index 7043084d3..009b37f98 100644 --- a/data-connector-lib/test/dpk_connector/core/test_utils.py +++ b/data-connector-lib/test/dpk_connector/core/test_utils.py @@ -19,6 +19,7 @@ get_content_type, get_etld1, get_focus_path, + get_fqdn, get_header_value, get_mime_type, is_allowed_path, @@ -93,6 +94,21 @@ def test_get_etld1(url: str, expected: str): assert get_etld1(url) == expected +@pytest.mark.parametrize( + "url,expected", + [ + ("http://www.example.com", "www.example.com"), + ("https://www.example.co.uk", "www.example.co.uk"), + ("http://www.example.com/path?query=string#fragment", "www.example.com"), + ("http://localhost:8080/", ""), + ("http://www.example.com:8080/", "www.example.com"), + 
("http://www.sub.example.com:8080/", "www.sub.example.com"), + ], +) +def test_get_fqdn(url: str, expected: str): + assert get_fqdn(url) == expected + + @pytest.mark.parametrize( "url,expected", [ diff --git a/examples/notebooks/intro/.gitignore b/examples/notebooks/intro/.gitignore new file mode 100644 index 000000000..89b9e565b --- /dev/null +++ b/examples/notebooks/intro/.gitignore @@ -0,0 +1,10 @@ +output*/ + +## File system artifacts +.directory +.DS_Store + + +## Python output +__pycache__ +.ipynb_checkpoints/ \ No newline at end of file diff --git a/examples/notebooks/intro/README.md b/examples/notebooks/intro/README.md new file mode 100644 index 000000000..4a45cbbad --- /dev/null +++ b/examples/notebooks/intro/README.md @@ -0,0 +1,36 @@ +# Data Prep Kit Introduction + +This example showcases some of the features of Data Prep Kit. + +## Running the code + +The code can be run on either + +1. Google colab: very easy to run; no local setup needed. +2. On your local Python environment. Here is a quick guide. You can find instructions for the latest version [here](../../../README.md#-getting-started) + +```bash +conda create -n data-prep-kit -y python=3.11 +conda activate data-prep-kit + +# install the following in 'data-prep-kit' environment +pip3 install data-prep-toolkit==0.2.1 +pip3 install data-prep-toolkit-transforms==0.2.1 +pip3 install data-prep-toolkit-transforms-ray==0.2.1 +pip3 install jupyterlab ipykernel ipywidgets + +## install custom kernel +## Important: Use this kernel when running example notebooks! 
+python -m ipykernel install --user --name=data-prep-kit --display-name "dataprepkit" + +# start jupyter and run the notebooks with this jupyter +jupyter lab +``` + +## Intro + +This notebook will demonstrate processing PDFs + +`PDFs ---> text ---> chunks ---> exact dedupe ---> fuzzy dedupe ---> embeddings` + +[python version](dpk_intro_1_python.ipynb)   |   [ray version](dpk_intro_1_ray.ipynb) diff --git a/examples/notebooks/intro/dpk_intro_1_python.ipynb b/examples/notebooks/intro/dpk_intro_1_python.ipynb new file mode 100644 index 000000000..f3659afcf --- /dev/null +++ b/examples/notebooks/intro/dpk_intro_1_python.ipynb @@ -0,0 +1,3667 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", + "metadata": { + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" + }, + "source": [ + "# Data Prep Kit Demo 1 - Python version\n", + "\n", + "This notebook will introduce DPK and showcase some of it's capabilities.\n", + "\n", + "Here is the workflow\n", + "\n", + "![](https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n" + ] + }, + { + "cell_type": "markdown", + "id": "b15976e3", + "metadata": { + "id": "b15976e3" + }, + "source": [ + "## How to run this notebook\n", + "\n", + "Two options:\n", + "\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/dpk_intro_1_python.ipynb)\n", + "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", + "\n", + "The notebook will work as-is in both environments" + ] + }, + { + "cell_type": "markdown", + "id": "eb8b0d5c", + "metadata": { + "id": "eb8b0d5c" + }, + "source": [ + "## Step-1: Inspect the Data\n", + "\n", + "We will use simple PDFs about the Solar System. The files are [here](https://github.com/sujee/data-prep-kit/tree/intro-example1/examples/notebooks/intro/input/solar-system)\n", + "\n", + "- [earth.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf)\n", + "- [mars.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf)\n" + ] + }, + { + "cell_type": "markdown", + "id": "39a0ab6e", + "metadata": { + "id": "39a0ab6e" + }, + "source": [ + "## Step-2: Figure out Runtime Environment\n", + "\n", + "### 2.1 - Determine runtime\n", + "\n", + "Determine if we are running on Google Colab or in a local Python environment" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1fe354b7", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1fe354b7", + "outputId": "5c153f72-08ed-4d6e-ccc7-dae851e7fd8b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "markdown", + "id": "8e7c104b", + "metadata": { + "id": "8e7c104b" + }, + "source": [ + "### 2.2 - Download Data if 
running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3309799e", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3309799e", + "outputId": "99530315-6dd5-405d-dbde-61e2332e441b" + }, + "outputs": [], + "source": [ + "if RUNNING_IN_COLAB:\n", + " !mkdir -p 'input/solar-system'\n", + " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf'\n", + " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf'\n", + " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/my_utils.py'" + ] + }, + { + "cell_type": "markdown", + "id": "a5dc2b68", + "metadata": { + "id": "a5dc2b68" + }, + "source": [ + "### 2.3 - Install dependencies if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1fcec577", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "1fcec577", + "outputId": "0f77fc39-ffeb-48da-ce6f-1750d8d3ad62" + }, + "outputs": [], + "source": [ + "if RUNNING_IN_COLAB:\n", + " ! 
pip install --default-timeout=100 \\\n", + " data-prep-toolkit==0.2.1 \\\n", + " data-prep-toolkit-transforms==0.2.1 \\\n", + " deepsearch-toolkit\n" + ] + }, + { + "cell_type": "markdown", + "id": "243322b8", + "metadata": { + "id": "243322b8" + }, + "source": [ + "### 2.4 - Restart Runtime\n", + "\n", + "After installing dependencies, be sure to restart the runtime so the libraries will be loaded\n", + "\n", + "You do this by going to **`Runtime --> Restart Session`**\n", + "\n", + "Then you can continue to the next step (no need to re-run the notebook)" + ] + }, + { + "cell_type": "markdown", + "id": "e8b10be1", + "metadata": { + "id": "e8b10be1" + }, + "source": [ + "## Step-2: Configuration" + ] + }, + { + "cell_type": "markdown", + "id": "356c66f7", + "metadata": { + "id": "356c66f7" + }, + "source": [ + "### 2.1 - Basic Config" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e4YMZrBuFycl", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e4YMZrBuFycl", + "outputId": "d7ee9449-4f21-4c9a-fa54-14b7f28d764a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "33345487", + "metadata": { + "id": "33345487" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "## Configuration\n", + "class MyConfig:\n", + " pass\n", + "\n", + "MY_CONFIG = MyConfig ()\n", + "\n", + "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", + "\n", + "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", + "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", + "\n", + "## Embedding model\n", + "MY_CONFIG.EMBEDDING_MODEL = 
'sentence-transformers/all-MiniLM-L6-v2'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b15e6827", + "metadata": { + "id": "b15e6827" + }, + "outputs": [], + "source": [ + "## Add parent dir to path\n", + "import os,sys\n", + "\n", + "this_dir = os.path.abspath('')\n", + "parent_dir = os.path.dirname(this_dir)\n", + "sys.path.append (os.path.abspath (parent_dir))" + ] + }, + { + "cell_type": "markdown", + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", + "metadata": { + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" + }, + "source": [ + "### 2.2 - Setup input/outpur directories" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "outputId": "4d5511fb-1c6f-47df-e5ea-2c1b354d262f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Cleared output directory\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", + " raise Exception (f\"โŒ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", + "\n", + "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", + "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", + "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", + "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_embeddings_out')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", + "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", + "\n", + "print (\"โœ… Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", + "metadata": { + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" + }, + "source": [ + "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", + "\n", + "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", + "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", + "metadata": { + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" + }, + "source": [ + "### 3.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "outputId": "c50847d4-f2c7-4559-f5f7-d6a3d025027d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" + ] + } + ], + "source": [ + "STAGE = 1\n", + "\n", + "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", + "output_folder = output_parquet_dir\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", + "metadata": { + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" + }, + "source": [ + "### 3.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 657, + "referenced_widgets": [ + "97b603697cfa4b4ea4e6735b6768ca35", + "e87e8d3262c54cfaaa8768505edacda3", + "b78aa40816e44f7fbebcb24ca68818b3", + "7053c9606a414e978636a7e241909504", + 
"da0787b239764847a731083997780a85", + "553f3c16839a49d79591d0fc4862bed6", + "c0eb5bc8f6ee427ca42204b3c56f9a4e", + "9d184ed175f0403fb03c2e13dfd04e0a", + "724778729161445c98b187031ae4f67c", + "1cb3bbf7d724411cbe9831543a4aecc0", + "06f9b33494984e4885d5aad813d1d2bc" + ] + }, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "outputId": "01d207fb-983d-40b2-e5f6-e38e3789110a" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:34:39 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", + "13:34:39 INFO - pipeline id pipeline_id\n", + "13:34:39 INFO - code location None\n", + "13:34:39 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n", + "13:34:39 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:34:39 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "13:34:39 INFO - orchestrator pdf2parquet started at 2024-10-18 13:34:39\n", + "13:34:39 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", + "13:34:39 INFO - Initializing models\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "750f3b6951094b2eb68490c7f5f98148", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 10 files: 0%| | 0/10 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_idexthashsizedate_acquiredpdf_convert_timesource_filename
0mars.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...10116e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf
1earth.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...1011efbdbcb9-f0af-42f0-b191-2f14ce3ddc7cpdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdf
\n", + "" + ], + "text/plain": [ + " filename contents num_pages \\\n", + "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", + "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", + "\n", + " num_tables num_doc_elements document_id ext \\\n", + "0 0 11 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 pdf \n", + "1 0 11 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "1 2024-10-18T13:34:43.410297 0.794765 earth.pdf " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(5)\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "e5058a21", + "metadata": { + "id": "e5058a21" + }, + "source": [ + "\n", + "### 3.4 - Understand the output\n", + "\n", + "Here are some interesting attributes to note:\n", + "\n", + "- **filename** : original filename\n", + "- **contents** : text\n", + "- **document_id**: unique id (UUID) assigned to this document\n", + "- **hash** : hash of document\n", + "- **pdf_convert_time** : time to convert this pdf in seconds\n", + "\n", + "Let's inspect the **contents** column. See how the text is being divided up!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f870e624", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f870e624", + "outputId": "0b4c054f-3a8a-4db3-f32f-17bd1466b102" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_name': '',\n", + " 'description': {'logs': []},\n", + " 'equations': [],\n", + " 'figures': [],\n", + " 'file-info': {'#-pages': 1,\n", + " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", + " 'filename': 'mars.pdf',\n", + " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", + " 'model': 'default',\n", + " 'page': 1}]},\n", + " 'footnotes': [],\n", + " 'main-text': [{'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.35137939,\n", + " 654.45184326,\n", + " 169.88169861,\n", + " 667.98492432],\n", + " 'page': 1,\n", + " 'span': [0, 4]}],\n", + " 'text': 'Mars',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.09541321,\n", + " 630.68127441,\n", + " 210.66503906,\n", + " 642.34405518],\n", + " 'page': 1,\n", + " 'span': [0, 12]}],\n", + " 'text': 'Solar System',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.84518433,\n", + " 588.96014404,\n", + " 479.40917969,\n", + " 623.02520752],\n", + " 'page': 1,\n", + " 'span': [0, 205]}],\n", + " 'text': 'Our solar system is a vast and fascinating expanse, '\n", + " 'comprising eight planets, five dwarf planets, '\n", + " 'numerous moons, asteroids, comets, and other '\n", + " 'celestial bodies. 
At its center lies the star we call '\n", + " 'the Sun.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.18510437,\n", + " 570.83258057,\n", + " 374.99838257,\n", + " 581.07043457],\n", + " 'page': 1,\n", + " 'span': [0, 54]}],\n", + " 'text': 'For more details about the Solar system see Chapter '\n", + " '1.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.22866821,\n", + " 542.98168945,\n", + " 163.86282349,\n", + " 554.45288086],\n", + " 'page': 1,\n", + " 'span': [0, 4]}],\n", + " 'text': 'Mars',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.87440491,\n", + " 500.84011841,\n", + " 477.48345947,\n", + " 534.55810547],\n", + " 'page': 1,\n", + " 'span': [0, 196]}],\n", + " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", + " 'desert world with a thin atmosphere composed '\n", + " 'primarily of carbon dioxide. Its reddish hue comes '\n", + " 'from iron oxide, or rust, prevalent on its surface.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.2026062,\n", + " 482.90710449,\n", + " 237.04431152,\n", + " 493.07443237],\n", + " 'page': 1,\n", + " 'span': [0, 23]}],\n", + " 'text': 'Basic facts about Mars:',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 453.019104,\n", + " 477.48171997,\n", + " 474.9703064],\n", + " 'page': 1,\n", + " 'span': [0, 78]}],\n", + " 'text': 'ยท Distance from the Sun: Average of 228 million '\n", + " 'kilometers (142 million miles)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 440.79351807,\n", + " 431.73287964,\n", + " 451.2142334],\n", + " 'page': 1,\n", + " 'span': [0, 64]}],\n", + " 'text': 'ยท Rotation Period: 24.6 hours (one Martian day - '\n", + " 'called a \"sol\")',\n", + " 'type': 'paragraph'},\n", + " 
{'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 429.10913086,\n", + " 365.9559021,\n", + " 438.83737183],\n", + " 'page': 1,\n", + " 'span': [0, 44]}],\n", + " 'text': 'ยท Moons: Two small moons, Phobos and Deimos.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Page-footer',\n", + " 'prov': [{'bbox': [303.13299561,\n", + " 87.20314026,\n", + " 308.11428833,\n", + " 96.51646423],\n", + " 'page': 1,\n", + " 'span': [0, 1]}],\n", + " 'text': '1',\n", + " 'type': 'page-footer'}],\n", + " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", + " 'page-footers': [],\n", + " 'page-headers': [],\n", + " 'tables': [],\n", + " 'type': 'pdf-document'}\n" + ] + } + ], + "source": [ + "import pprint\n", + "import json\n", + "\n", + "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", + "# json.loads(output_df.iloc[0, ]['contents'])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e1a10c2d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e1a10c2d", + "outputId": "c1d992c2-faa8-40cd-c375-857970201daa" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_name': '',\n", + " 'description': {'logs': []},\n", + " 'equations': [],\n", + " 'figures': [],\n", + " 'file-info': {'#-pages': 1,\n", + " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", + " 'filename': 'earth.pdf',\n", + " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", + " 'model': 'default',\n", + " 'page': 1}]},\n", + " 'footnotes': [],\n", + " 'main-text': [{'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.30961609,\n", + " 654.45184326,\n", + " 174.04208374,\n", + " 667.93347168],\n", + " 'page': 1,\n", + " 'span': [0, 5]}],\n", + " 'text': 'Earth',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.12528992,\n", + " 
630.69073486,\n", + " 210.66503906,\n", + " 642.27935791],\n", + " 'page': 1,\n", + " 'span': [0, 12]}],\n", + " 'text': 'Solar System',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.87112427,\n", + " 588.96014404,\n", + " 479.40917969,\n", + " 623.04595947],\n", + " 'page': 1,\n", + " 'span': [0, 205]}],\n", + " 'text': 'Our solar system is a vast and fascinating expanse, '\n", + " 'comprising eight planets, five dwarf planets, '\n", + " 'numerous moons, asteroids, comets, and other '\n", + " 'celestial bodies. At its center lies the star we call '\n", + " 'the Sun.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.20942688,\n", + " 570.81555176,\n", + " 375.57919312,\n", + " 581.08459473],\n", + " 'page': 1,\n", + " 'span': [0, 54]}],\n", + " 'text': 'For more details about our Solar system see Chapter '\n", + " '1.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.15542603,\n", + " 542.98168945,\n", + " 167.32983398,\n", + " 554.36669922],\n", + " 'page': 1,\n", + " 'span': [0, 5]}],\n", + " 'text': 'Earth',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.91053772,\n", + " 512.46295166,\n", + " 477.84887695,\n", + " 534.48431396],\n", + " 'page': 1,\n", + " 'span': [0, 107]}],\n", + " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", + " 'planet. 
Earth is the only place we know of with life.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.30151367,\n", + " 494.86206055,\n", + " 240.17156982,\n", + " 505.07229614],\n", + " 'page': 1,\n", + " 'span': [0, 24]}],\n", + " 'text': 'Basic facts about Earth:',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 464.97409058,\n", + " 477.47979736,\n", + " 487.02810669],\n", + " 'page': 1,\n", + " 'span': [0, 79]}],\n", + " 'text': 'ยท Distance from the Sun: Average of 149.6 million '\n", + " 'kilometers (93 million miles)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 452.86901855,\n", + " 317.90722656,\n", + " 463.24041748],\n", + " 'page': 1,\n", + " 'span': [0, 37]}],\n", + " 'text': 'ยท Rotation Period: 24 hours (one day)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 440.71496582,\n", + " 396.66357422,\n", + " 451.19915771],\n", + " 'page': 1,\n", + " 'span': [0, 52]}],\n", + " 'text': 'ยท Moons: One moon, called Luna or simply \"the Moon\".',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Page-footer',\n", + " 'prov': [{'bbox': [303.13299561,\n", + " 87.20314026,\n", + " 308.11428833,\n", + " 96.53633118],\n", + " 'page': 1,\n", + " 'span': [0, 1]}],\n", + " 'text': '1',\n", + " 'type': 'page-footer'}],\n", + " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", + " 'page-footers': [],\n", + " 'page-headers': [],\n", + " 'tables': [],\n", + " 'type': 'pdf-document'}\n" + ] + } + ], + "source": [ + "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" + ] + }, + { + "cell_type": "markdown", + "id": "72274586", + "metadata": { + "id": "72274586" + }, + "source": [ + "## Step-4: Doc chunks\n", + "\n", + "In the previous step, we have extracted text from oru PDFs. 
But we have the content of entire file as 'one row' in our parquet output.\n", + "\n", + "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", + "\n", + "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", + "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", + "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", + "which provides the required JSON structure." + ] + }, + { + "cell_type": "markdown", + "id": "96198fa6", + "metadata": { + "id": "96198fa6" + }, + "source": [ + "### 4.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "305f00a3", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "305f00a3", + "outputId": "dd511f34-bab3-4dde-d938-493debb02e5e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" + ] + } + ], + "source": [ + "STAGE = 2\n", + "\n", + "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_chunk_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "369f2cd1", + "metadata": { + "id": "369f2cd1" + }, + "source": [ + "### 4.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5b7b18d5", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + 
}, + "id": "5b7b18d5", + "outputId": "e0b87171-9d66-473f-e66a-e4b6ae3c3f66" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:34:45 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", + "13:34:45 INFO - pipeline id pipeline_id\n", + "13:34:45 INFO - code location None\n", + "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", + "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:34:45 INFO - orchestrator doc_chunk started at 2024-10-18 13:34:45\n", + "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", + "13:34:45 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "13:34:45 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "13:34:45 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:34:45 INFO - done flushing in 0.0 sec\n", + "13:34:45 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:2 completed successfully\n", + "CPU times: user 826 ms, sys: 101 ms, total: 928 ms\n", + "Wall time: 923 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from doc_chunk_transform_python import 
DocChunkPythonTransformConfiguration\n", + "\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # doc_chunk arguments\n", + " # ...\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(DocChunkPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "213afdf6", + "metadata": { + "id": "213afdf6" + }, + "source": [ + "### 4.3 - Inspect Generated output\n", + "\n", + "We would see documents are split into many chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d8138d43", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 897 + }, + "id": "d8138d43", + "outputId": "fd01e0cb-899e-4c73-d50e-5f4e6f5ff802" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Files processed : 2\n", + "Chunks created : 8\n", + "Input data dimensions (rows x columns)= (2, 12)\n", + "Output data dimensions (rows x columns)= (8, 16)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 mars.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "7 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "3 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "7 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "3 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "7 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", + "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 
" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (f\"Files processed : {input_df.shape[0]:,}\")\n", + "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "9e9ca75c", + "metadata": { + "id": "9e9ca75c" + }, + "source": [ + "### 4.4 - Understanding the Output\n", + "\n", + "Here we see 2 PDF files are split into 8 chunks. Basically we see the documents are being split along 'natural boundaries' - paragraphs and bullet points\n", + "\n", + "See how **source_document_id** is carried throughout. This helps us identify the original documents.\n", + "\n", + "Also note **contents** is now plain text (not JSON as before)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3090c950", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "3090c950", + "outputId": "0f4b6771-8d38-4a27-c756-21f916b23a4f" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfSolar System\\nFor more details about the Solar...
2mars.pdfMars\\nMars, the fourth planet from the Sun, is...
3mars.pdfBasic facts about Mars:\\nยท Distance from the S...
4earth.pdfSolar System\\nOur solar system is a vast and f...
5earth.pdfSolar System\\nFor more details about our Solar...
6earth.pdfEarth\\nEarth is the third planet from the Sun....
7earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" + ], + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", + "1 mars.pdf Solar System\\nFor more details about the Solar...\n", + "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "3 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", + "5 earth.pdf Solar System\\nFor more details about our Solar...\n", + "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "7 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d5f151ae", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d5f151ae", + "outputId": "a4c491b2-53db-4d71-da24-4479de8d1d65" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about the Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 3------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 3------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "7ad1c60d", + "metadata": { + "id": "7ad1c60d" + }, + "source": [ + "## Step-5: DOC ID generation of Chunks\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. 
The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", + "\n", + "**This is a pre-requisite for fuzzy dedup** in the pipeline." + ] + }, + { + "cell_type": "markdown", + "id": "1afaa0fd", + "metadata": { + "id": "1afaa0fd" + }, + "source": [ + "### 5.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6ffd6f54", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6ffd6f54", + "outputId": "1784c80d-6309-4913-9f55-c018b978968f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" + ] + } + ], + "source": [ + "\n", + "# Input for this stage is the output of the doc chunk component\n", + "# output of this component makes it possible for fdedup component to run on data.\n", + "\n", + "STAGE = 3\n", + "\n", + "input_folder = output_chunk_dir\n", + "output_folder = output_docid_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "f78a51b7", + "metadata": { + "id": "f78a51b7" + }, + "source": [ + "### 5.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5fc77557", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5fc77557", + "outputId": "db2b8670-543e-4073-9c7d-3f9ef5f4317e" + }, + "outputs": [ + { +
"name": "stderr", + "output_type": "stream", + "text": [ + "13:34:45 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", + "13:34:45 INFO - pipeline id pipeline_id\n", + "13:34:45 INFO - code location None\n", + "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", + "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:34:45 INFO - orchestrator doc_id started at 2024-10-18 13:34:45\n", + "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", + "13:34:45 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "13:34:45 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "13:34:45 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:34:45 INFO - done flushing in 0.0 sec\n", + "13:34:45 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:3 completed successfully\n", + "CPU times: user 12.8 ms, sys: 3.7 ms, total: 16.5 ms\n", + "Wall time: 13.1 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from doc_id_transform_python import DocIDPythonTransformRuntimeConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " # doc id configuration\n", + " \"doc_id_doc_column\": \"contents\",\n", + " \"doc_id_hash_column\": \"chunk_hash\",\n", + " \"doc_id_int_column\": \"chunk_id\",\n", + "}\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# launch\n", + "\n", + "launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"❌ Job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "a9a8c1fa", + "metadata": { + "id": "a9a8c1fa" + }, + "source": [ + "### 5.3 - Inspect Generated output\n", + "\n", + "You will notice we have two extra columns\n", + "\n", + "- **chunk_hash** (the hash column)\n", + "- **chunk_id** (the integer id column)\n", + "\n", + "But still the same number of rows as before" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "da9adede", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 860 + }, + "id": "da9adede", + "outputId": "036db4ca-12f6-4b3e-9d7f-fa70e494870d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 16)\n", + "Output data dimensions (rows x columns)= (8, 18)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 mars.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "7 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "3 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "7 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "3 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "7 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", + "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \\\n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + "\n", + " chunk_hash chunk_id \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 
2 \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53", + "metadata": { + "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53" + }, + "source": [ + "## Step-6: Exact Dedup\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", + "metadata": { + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" + }, + "source": [ + "### 6.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "4c7a1b94", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4c7a1b94", + "outputId": "2f6f05bc-f6fd-4d66-ea01-ed89cd5b80f3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" + ] + } + ], + "source": [ + "STAGE = 4\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_exact_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", + "metadata": { + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" + }, + "source": [ + "### 6.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": 
"a624b2b2-faad-4325-ac7d-53a840f564ef", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "outputId": "74dc0b75-58b5-4c97-9965-91315e8a98a5" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:34:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "13:34:45 INFO - pipeline id pipeline_id\n", + "13:34:45 INFO - code location None\n", + "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", + "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:34:45 INFO - orchestrator ededup started at 2024-10-18 13:34:45\n", + "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", + "13:34:45 INFO - Starting from the beginning\n", + "13:34:45 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "13:34:45 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "13:34:45 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:34:45 INFO - done flushing in 0.0 sec\n", + "13:34:45 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:4 completed successfully\n", + "CPU times: user 17.6 ms, sys: 997 ฮผs, total: 18.6 ms\n", + "Wall time: 15.2 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from ededup_transform_python import EdedupPythonTransformRuntimeConfiguration\n", + "\n", + "\n", + "# Prepare 
the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # ededup parameters\n", + " \"ededup_doc_column\": \"contents\",\n", + " \"ededup_doc_id_column\": \"chunk_hash\",\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(EdedupPythonTransformRuntimeConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "eaf1c3c3", + "metadata": { + "id": "eaf1c3c3" + }, + "source": [ + "### 6.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d824ebf6", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 815 + }, + "id": "d824ebf6", + "outputId": "68f55770-c750-4607-a205-ba183603019d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 18)\n", + "Output data dimensions (rows x columns)= (7, 19)\n", + "Input chunks before exact dedupe : 8\n", + "Output chunks after exact dedupe : 7\n", + "Duplicate chunks removed : 1\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremoved
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[]
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[]
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[]
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[]
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[]
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[]
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "3 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "3 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nOur solar system is a vast and f... 
$.main-text[2] \n", + "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + "\n", + " chunk_hash chunk_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", + "\n", + " removed \n", + "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... 
\n", + "1 [] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + "5 [] \n", + "6 [] " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "82cc9bb0", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "id": "82cc9bb0", + "outputId": "46d9e91d-c470-4e3e-e5c8-508c534dbceb" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nFor more details about the Solar...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nOur solar system is a vast and f...
4earth.pdfSolar System\\nFor more details about our Solar...
5earth.pdfEarth\\nEarth is the third planet from the Sun....
6earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" + ], + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nFor more details about the Solar...\n", + "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", + "4 earth.pdf Solar System\\nFor more details about our Solar...\n", + "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "6 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "cc61dffa", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cc61dffa", + "outputId": "7fb26043-8538-48b6-80b7-16ceb818c1a8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "For more details about the Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 1------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 2------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 3------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "383f40ba", + "metadata": { + "id": "383f40ba" + }, + "source": [ + "### 6.4 - Understanding the output\n", + "\n", + "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", + "\n", + "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! Pretty neat, eh!\n", + "\n", + "```text\n", + "## Solar System\n", + "\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "85309751-8556-41c6-ac32-84acc941bc8d", + "metadata": { + "id": "85309751-8556-41c6-ac32-84acc941bc8d" + }, + "source": [ + " ## Step-7: Fuzzy Dedup\n", + "\n", + "And fuzzy dedupe is only available in RAY version. 
So we will skip it here\n", + "\n", + "See this file [dpk_intro_1_ray.ipynb](dpk_intro_1_ray.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "5370950a-2a3a-4143-8218-f9b4808099ba", + "metadata": { + "id": "5370950a-2a3a-4143-8218-f9b4808099ba" + }, + "source": [ + "## Step-8: Text encoding\n", + "\n", + "Encode text for the vector storage." + ] + }, + { + "cell_type": "markdown", + "id": "85aba685", + "metadata": { + "id": "85aba685" + }, + "source": [ + "### 8.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "outputId": "41d268f5-7cc6-432e-d56e-2ba882fbdba6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-6: Processing input='output/04_exact_dedupe_out' --> output='output/05_embeddings_out'\n" + ] + } + ], + "source": [ + "STAGE = 6\n", + "\n", + "input_folder = output_exact_dedupe_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_embeddings_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "c97545f4", + "metadata": { + "id": "c97545f4" + }, + "source": [ + "### 8.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "outputId": "b2119b07-0654-45cd-f729-1396e18b24b1" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:34:45 INFO - text_encoder parameters are : {'content_column_name': 'contents', 
'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", + "13:34:45 INFO - pipeline id pipeline_id\n", + "13:34:45 INFO - code location None\n", + "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/04_exact_dedupe_out output_folder - output/05_embeddings_out\n", + "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:34:45 INFO - orchestrator text_encoder started at 2024-10-18 13:34:45\n", + "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.010450363159179688, 'min_file_size': 0.010318756103515625, 'total_file_size': 0.020769119262695312}\n", + "13:34:47 INFO - Completed 1 files (50.0%) in 0.004 min\n", + "13:34:47 INFO - Completed 2 files (100.0%) in 0.005 min\n", + "13:34:47 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:34:47 INFO - done flushing in 0.0 sec\n", + "13:34:47 INFO - Completed execution in 0.034 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:6 completed successfully\n", + "CPU times: user 615 ms, sys: 146 ms, total: 761 ms\n", + "Wall time: 2.24 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from text_encoder_local_python import TextEncoderPythonTransformConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # text_encoder\n", + " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", + "}\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(TextEncoderPythonTransformConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "b734852c", + "metadata": { + "id": "b734852c" + }, + "source": [ + "### 8.3 - Inspect Generated output\n", + "\n", + "You will see a column called `embeddings` added at the end. This is the text content converted into vectors or embeddings. We used the model `sentence-transformers/all-MiniLM-L6-v2`" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "7b1c1d09", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 760 + }, + "id": "7b1c1d09", + "outputId": "018daa18-e5db-4483-d8d5-30aded80d5e3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (7, 19)\n", + "Output data dimensions (rows x columns)= (7, 20)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremovedembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...[-0.051861435, 0.0035226212, 0.030617002, 0.04...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[][0.07728295, 0.024970993, -0.043180738, 0.0580...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[][0.10598018, 0.025460618, 0.023627337, 0.03905...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[][0.0077404436, -0.02055944, 0.026426593, 0.011...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[][-0.062105548, -0.0053322907, 0.031277698, 0.0...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[][0.072435796, -0.058001805, -0.019771898, -0.0...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[][0.091821924, 0.015197902, 0.07716932, 0.01711...
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", + "3 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", + "3 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nOur solar system is a vast and f... 
$.main-text[2] \n", + "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + "\n", + " chunk_hash chunk_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", + "\n", + " removed \\\n", + "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... \n", + "1 [] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + "5 [] \n", + "6 [] \n", + "\n", + " embeddings \n", + "0 [-0.051861435, 0.0035226212, 0.030617002, 0.04... \n", + "1 [0.07728295, 0.024970993, -0.043180738, 0.0580... 
\n", + "2 [0.10598018, 0.025460618, 0.023627337, 0.03905... \n", + "3 [0.0077404436, -0.02055944, 0.026426593, 0.011... \n", + "4 [-0.062105548, -0.0053322907, 0.031277698, 0.0... \n", + "5 [0.072435796, -0.058001805, -0.019771898, -0.0... \n", + "6 [0.091821924, 0.015197902, 0.07716932, 0.01711... " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "f5e12630-be6b-4188-a925-77117155617b", + "metadata": { + "id": "f5e12630-be6b-4188-a925-77117155617b" + }, + "source": [ + "## Step-9: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "outputId": "31f09b58-7b2d-48bb-9dac-bc0ba9625c01" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Copied output from 'output/05_embeddings_out' --> 'output/output_final'\n" + ] + } + ], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", + "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", + "\n", + "print (f\"โœ… Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "dpk-2-basic-021-py311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "06f9b33494984e4885d5aad813d1d2bc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1cb3bbf7d724411cbe9831543a4aecc0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "553f3c16839a49d79591d0fc4862bed6": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7053c9606a414e978636a7e241909504": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1cb3bbf7d724411cbe9831543a4aecc0", + "placeholder": "โ€‹", + "style": "IPY_MODEL_06f9b33494984e4885d5aad813d1d2bc", + "value": "โ€‡10/10โ€‡[00:00<00:00,โ€‡349.38it/s]" + } + }, + "724778729161445c98b187031ae4f67c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "97b603697cfa4b4ea4e6735b6768ca35": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e87e8d3262c54cfaaa8768505edacda3", + "IPY_MODEL_b78aa40816e44f7fbebcb24ca68818b3", + "IPY_MODEL_7053c9606a414e978636a7e241909504" + ], + "layout": "IPY_MODEL_da0787b239764847a731083997780a85" + } + }, + "9d184ed175f0403fb03c2e13dfd04e0a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b78aa40816e44f7fbebcb24ca68818b3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9d184ed175f0403fb03c2e13dfd04e0a", + "max": 10, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_724778729161445c98b187031ae4f67c", + "value": 10 + } + }, + "c0eb5bc8f6ee427ca42204b3c56f9a4e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "da0787b239764847a731083997780a85": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + 
"display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e87e8d3262c54cfaaa8768505edacda3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_553f3c16839a49d79591d0fc4862bed6", + "placeholder": "โ€‹", + "style": "IPY_MODEL_c0eb5bc8f6ee427ca42204b3c56f9a4e", + "value": "Fetchingโ€‡10โ€‡files:โ€‡100%" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/intro/dpk_intro_1_ray.ipynb b/examples/notebooks/intro/dpk_intro_1_ray.ipynb new file mode 100644 index 000000000..da33a3499 --- /dev/null +++ b/examples/notebooks/intro/dpk_intro_1_ray.ipynb @@ -0,0 +1,4358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", + "metadata": { + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" + }, + "source": [ + "# Data Prep Kit Demo 1 - Ray Version\n", + "\n", + "This notebook will introduce DPK and showcase some of it's 
capabilities.\n", + "\n", + "Here is the workflow\n", + "\n", + "![](https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "b15976e3", + "metadata": { + "id": "b15976e3" + }, + "source": [ + "## How to run this notebook\n", + "\n", + "Two options:\n", + "\n", + "- **Option 1 - Google Colab:** Easiest option; no setup required. Click this link to open the notebook in Google Colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/dpk_intro_1_ray.ipynb)\n", + "- **Option 2 - Local python dev environment:** Set up using this [guide](../../../README.md#-getting-started)\n", + "\n", + "The notebook will work in both environments" + ] + }, + { + "cell_type": "markdown", + "id": "eb8b0d5c", + "metadata": { + "id": "eb8b0d5c" + }, + "source": [ + "## Step-1: Inspect the Data\n", + "\n", + "We will use simple PDFs about the Solar System. 
The files are [here](https://github.com/sujee/data-prep-kit/tree/intro-example1/examples/notebooks/intro/input/solar-system)\n", + "\n", + "- [earth.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf)\n", + "- [mars.pdf](https://github.com/sujee/data-prep-kit/blob/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf)\n" + ] + }, + { + "cell_type": "markdown", + "id": "39a0ab6e", + "metadata": { + "id": "39a0ab6e" + }, + "source": [ + "## Step-2: Figure out Runtime Environment\n", + "\n", + "### 2.1 - Determine runtime\n", + "\n", + "Determine if we are running on Google colab or local python environment" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1fe354b7", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1fe354b7", + "outputId": "6665c654-baa5-46dc-d370-9931e0e9eed3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "markdown", + "id": "8e7c104b", + "metadata": { + "id": "8e7c104b" + }, + "source": [ + "### 2.2 -Download Data if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3309799e", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3309799e", + "outputId": "00d7362e-d675-4aaf-8c87-d99027d9a06c" + }, + "outputs": [], + "source": [ + "if RUNNING_IN_COLAB:\n", + " !mkdir -p 'input/solar-system'\n", + " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/earth.pdf'\n", + " !wget -O 'input/solar-system/mars.pdf' 
'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/input/solar-system/mars.pdf'\n", + " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/sujee/data-prep-kit/intro-example1/examples/notebooks/intro/my_utils.py'" + ] + }, + { + "cell_type": "markdown", + "id": "a5dc2b68", + "metadata": { + "id": "a5dc2b68" + }, + "source": [ + "### 2.3 - Install dependencies if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1fcec577", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "1fcec577", + "outputId": "48cf233b-f04e-4b9b-9605-423f87693f10" + }, + "outputs": [], + "source": [ + "if RUNNING_IN_COLAB:\n", + " ! pip install --default-timeout=100 \\\n", + " data-prep-toolkit==0.2.1 \\\n", + " data-prep-toolkit-transforms==0.2.1 \\\n", + " data-prep-toolkit-transforms-ray==0.2.1 \\\n", + " deepsearch-toolkit" + ] + }, + { + "cell_type": "markdown", + "id": "243322b8", + "metadata": { + "id": "243322b8" + }, + "source": [ + "### 2.4 - Restart Runtime\n", + "\n", + "After installing dependencies, be sure to restart the runtime so that the newly installed libraries are loaded\n", + "\n", + "You do this by going to **`Runtime --> Restart Session`**\n", + "\n", + "Then you can continue to the next step (no need to re-run the notebook)" + ] + }, + { + "cell_type": "markdown", + "id": "e8b10be1", + "metadata": { + "id": "e8b10be1" + }, + "source": [ + "## Step-2: Configuration" + ] + }, + { + "cell_type": "markdown", + "id": "356c66f7", + "metadata": { + "id": "356c66f7" + }, + "source": [ + "### 2.1 - Basic Config" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e4YMZrBuFycl", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e4YMZrBuFycl", + "outputId": "1a1d5f01-0856-40b6-8b1c-8187b0c38d64" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + 
"source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "33345487", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "33345487", + "outputId": "f3e71a25-4864-4f8f-dfce-4af3d7e08a8a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MY_CONFIG.RAY_RUNTIME_WORKERS: 2\n", + "MY_CONFIG.RAY_NUM_CPUS: 0.8\n", + "MY_CONFIG.RAY_MEMORY_GB: 2\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "## Configuration\n", + "class MyConfig:\n", + " pass\n", + "\n", + "MY_CONFIG = MyConfig ()\n", + "\n", + "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", + "\n", + "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", + "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", + "\n", + "## Embedding model\n", + "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n", + "\n", + "## RAY CONFIGURATION\n", + "### For local runs, we can use more parallelism\n", + "### For google colab, be conservative\n", + "\n", + "if RUNNING_IN_COLAB:\n", + " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", + " MY_CONFIG.RAY_NUM_CPUS = 0.3\n", + " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", + "else: # local run\n", + " num_cpus_available = os.cpu_count()\n", + " # print (num_cpus_available)\n", + "\n", + " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", + " MY_CONFIG.RAY_NUM_CPUS = 0.8\n", + " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", + " # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3\n", + "\n", + "print ('MY_CONFIG.RAY_RUNTIME_WORKERS:', MY_CONFIG.RAY_RUNTIME_WORKERS)\n", + "print ('MY_CONFIG.RAY_NUM_CPUS:', MY_CONFIG.RAY_NUM_CPUS)\n", + "print ('MY_CONFIG.RAY_MEMORY_GB:', MY_CONFIG.RAY_MEMORY_GB)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": 
"b15e6827", + "metadata": { + "id": "b15e6827" + }, + "outputs": [], + "source": [ + "## Add parent dir to path\n", + "import os,sys\n", + "\n", + "this_dir = os.path.abspath('')\n", + "parent_dir = os.path.dirname(this_dir)\n", + "sys.path.append (os.path.abspath (parent_dir))" + ] + }, + { + "cell_type": "markdown", + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", + "metadata": { + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" + }, + "source": [ + "### 2.2 - Setup input/output directories" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "outputId": "ec5beb05-027a-49eb-9a96-271471619d81" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Cleared output directory\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", + " raise Exception (f\"❌ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", + "\n", + "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", + "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", + "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n", + "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", + "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", + "\n", + "print (\"✅ Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", + 
"metadata": { + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" + }, + "source": [ + "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", + "\n", + "This step reads the input folder containing all PDF files and ingests them into a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", + "The documents are converted into a JSON format, which makes them easy to chunk in the later steps.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", + "metadata": { + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" + }, + "source": [ + "### 3.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "outputId": "f8383739-a4fb-450c-dc37-5df32aab8212" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" + ] + } + ], + "source": [ + "STAGE = 1\n", + "\n", + "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", + "output_folder = output_parquet_dir\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", + "metadata": { + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" + }, + "source": [ + "### 3.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "outputId": "14a36e73-a186-4431-a755-f46ccb691130" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:30:44 INFO - pdf2parquet parameters are : {'artifacts_path': None, 
'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", + "13:30:44 INFO - pipeline id pipeline_id\n", + "13:30:44 INFO - code location None\n", + "13:30:44 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n", + "13:30:44 INFO - actor creation delay 0\n", + "13:30:44 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:30:44 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n", + "13:30:44 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:44 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "13:30:44 INFO - Running locally\n", + "2024-10-18 13:30:47,436\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - orchestrator started at 2024-10-18 13:30:50\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.872821807861328, 'object_store': 7.436410903930664}\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m 13:30:53 INFO - Initializing models\n", + "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 110376.42it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - Completed processing 2 files in 0.145 min\n", + "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m 13:30:53 INFO - Initializing models\n", + "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 73713.60it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "13:31:09 INFO - Completed execution in 0.421 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:1 completed successfully\n", + "CPU times: user 4.41 s, sys: 1.39 s, total: 5.8 s\n", + "Wall time: 31.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from pdf2parquet_transform import (\n", + " pdf2parquet_contents_type_cli_param,\n", + " pdf2parquet_contents_types,\n", + ")\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n", + "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n", + "\n", + "from data_processing.utils import GB, ParamsUtils\n", + "\n", + "\n", + "# create parameters\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS, \"memory\": 
MY_CONFIG.RAY_MEMORY_GB * GB}\n", + "ingest_config = {\n", + " pdf2parquet_contents_type_cli_param: pdf2parquet_contents_types.JSON,\n", + "}\n", + "\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + "}\n", + "\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n", + "# create launcher\n", + "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n", + "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "5ca790e0", + "metadata": { + "id": "5ca790e0" + }, + "source": [ + "### 3.3 - Inspect Generated output\n", + "\n", + "Here we should see one entry per input file processed." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fe59563d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 255 + }, + "id": "fe59563d", + "outputId": "d10c022d-524f-4a13-ebf8-6431114e9172" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output dimensions (rows x columns)= (2, 12)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_idexthashsizedate_acquiredpdf_convert_timesource_filename
0mars.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...101162e5639f-f922-4ccc-a041-3cb02f1cfd83pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf
1earth.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...1011f3c0ac2e-1de2-472b-8216-2043f3b3e9d1pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdf
\n", + "
" + ], + "text/plain": [ + " filename contents num_pages \\\n", + "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", + "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", + "\n", + " num_tables num_doc_elements document_id ext \\\n", + "0 0 11 62e5639f-f922-4ccc-a041-3cb02f1cfd83 pdf \n", + "1 0 11 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.494027 2.015123 earth.pdf " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(5)\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "e5058a21", + "metadata": { + "id": "e5058a21" + }, + "source": [ + "\n", + "### 3.4 - Understand the output\n", + "\n", + "Here are some interesting attributes to note:\n", + "\n", + "- **filename** : original filename\n", + "- **contents** : text\n", + "- **document_id**: unique id (UUID) assigned to this document\n", + "- **hash** : hash of document\n", + "- **pdf_convert_time** : time to convert this pdf in seconds\n", + "\n", + "Let's inspect the **contents** column. See how the text is being divided up!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f870e624", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f870e624", + "outputId": "9142246b-988c-4674-99d7-e2f3fffbaaf4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_name': '',\n", + " 'description': {'logs': []},\n", + " 'equations': [],\n", + " 'figures': [],\n", + " 'file-info': {'#-pages': 1,\n", + " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", + " 'filename': 'mars.pdf',\n", + " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", + " 'model': 'default',\n", + " 'page': 1}]},\n", + " 'footnotes': [],\n", + " 'main-text': [{'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.35137939,\n", + " 654.45184326,\n", + " 169.88169861,\n", + " 667.98492432],\n", + " 'page': 1,\n", + " 'span': [0, 4]}],\n", + " 'text': 'Mars',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.09541321,\n", + " 630.68127441,\n", + " 210.66503906,\n", + " 642.34405518],\n", + " 'page': 1,\n", + " 'span': [0, 12]}],\n", + " 'text': 'Solar System',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.84518433,\n", + " 588.96014404,\n", + " 479.40917969,\n", + " 623.02520752],\n", + " 'page': 1,\n", + " 'span': [0, 205]}],\n", + " 'text': 'Our solar system is a vast and fascinating expanse, '\n", + " 'comprising eight planets, five dwarf planets, '\n", + " 'numerous moons, asteroids, comets, and other '\n", + " 'celestial bodies. 
At its center lies the star we call '\n", + " 'the Sun.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.18510437,\n", + " 570.83258057,\n", + " 374.99838257,\n", + " 581.07043457],\n", + " 'page': 1,\n", + " 'span': [0, 54]}],\n", + " 'text': 'For more details about the Solar system see Chapter '\n", + " '1.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.22866821,\n", + " 542.98168945,\n", + " 163.86282349,\n", + " 554.45288086],\n", + " 'page': 1,\n", + " 'span': [0, 4]}],\n", + " 'text': 'Mars',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.87440491,\n", + " 500.84011841,\n", + " 477.48345947,\n", + " 534.55810547],\n", + " 'page': 1,\n", + " 'span': [0, 196]}],\n", + " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", + " 'desert world with a thin atmosphere composed '\n", + " 'primarily of carbon dioxide. Its reddish hue comes '\n", + " 'from iron oxide, or rust, prevalent on its surface.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.2026062,\n", + " 482.90710449,\n", + " 237.04431152,\n", + " 493.07443237],\n", + " 'page': 1,\n", + " 'span': [0, 23]}],\n", + " 'text': 'Basic facts about Mars:',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 453.019104,\n", + " 477.48171997,\n", + " 474.9703064],\n", + " 'page': 1,\n", + " 'span': [0, 78]}],\n", + " 'text': 'ยท Distance from the Sun: Average of 228 million '\n", + " 'kilometers (142 million miles)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 440.79351807,\n", + " 431.73287964,\n", + " 451.2142334],\n", + " 'page': 1,\n", + " 'span': [0, 64]}],\n", + " 'text': 'ยท Rotation Period: 24.6 hours (one Martian day - '\n", + " 'called a \"sol\")',\n", + " 'type': 'paragraph'},\n", + " 
{'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 429.10913086,\n", + " 365.9559021,\n", + " 438.83737183],\n", + " 'page': 1,\n", + " 'span': [0, 44]}],\n", + " 'text': 'ยท Moons: Two small moons, Phobos and Deimos.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Page-footer',\n", + " 'prov': [{'bbox': [303.13299561,\n", + " 87.20314026,\n", + " 308.11428833,\n", + " 96.51646423],\n", + " 'page': 1,\n", + " 'span': [0, 1]}],\n", + " 'text': '1',\n", + " 'type': 'page-footer'}],\n", + " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", + " 'page-footers': [],\n", + " 'page-headers': [],\n", + " 'tables': [],\n", + " 'type': 'pdf-document'}\n" + ] + } + ], + "source": [ + "import pprint\n", + "import json\n", + "\n", + "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", + "# json.loads(output_df.iloc[0, ]['contents'])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e1a10c2d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e1a10c2d", + "outputId": "ca74113e-6fd3-488b-836a-60bd58299fb1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_name': '',\n", + " 'description': {'logs': []},\n", + " 'equations': [],\n", + " 'figures': [],\n", + " 'file-info': {'#-pages': 1,\n", + " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", + " 'filename': 'earth.pdf',\n", + " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", + " 'model': 'default',\n", + " 'page': 1}]},\n", + " 'footnotes': [],\n", + " 'main-text': [{'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.30961609,\n", + " 654.45184326,\n", + " 174.04208374,\n", + " 667.93347168],\n", + " 'page': 1,\n", + " 'span': [0, 5]}],\n", + " 'text': 'Earth',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.12528992,\n", + " 
630.69073486,\n", + " 210.66503906,\n", + " 642.27935791],\n", + " 'page': 1,\n", + " 'span': [0, 12]}],\n", + " 'text': 'Solar System',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.87112427,\n", + " 588.96014404,\n", + " 479.40917969,\n", + " 623.04595947],\n", + " 'page': 1,\n", + " 'span': [0, 205]}],\n", + " 'text': 'Our solar system is a vast and fascinating expanse, '\n", + " 'comprising eight planets, five dwarf planets, '\n", + " 'numerous moons, asteroids, comets, and other '\n", + " 'celestial bodies. At its center lies the star we call '\n", + " 'the Sun.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.20942688,\n", + " 570.81555176,\n", + " 375.57919312,\n", + " 581.08459473],\n", + " 'page': 1,\n", + " 'span': [0, 54]}],\n", + " 'text': 'For more details about our Solar system see Chapter '\n", + " '1.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Section-header',\n", + " 'prov': [{'bbox': [133.15542603,\n", + " 542.98168945,\n", + " 167.32983398,\n", + " 554.36669922],\n", + " 'page': 1,\n", + " 'span': [0, 5]}],\n", + " 'text': 'Earth',\n", + " 'type': 'subtitle-level-1'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [132.91053772,\n", + " 512.46295166,\n", + " 477.84887695,\n", + " 534.48431396],\n", + " 'page': 1,\n", + " 'span': [0, 107]}],\n", + " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", + " 'planet. 
Earth is the only place we know of with life.',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Text',\n", + " 'prov': [{'bbox': [133.30151367,\n", + " 494.86206055,\n", + " 240.17156982,\n", + " 505.07229614],\n", + " 'page': 1,\n", + " 'span': [0, 24]}],\n", + " 'text': 'Basic facts about Earth:',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 464.97409058,\n", + " 477.47979736,\n", + " 487.02810669],\n", + " 'page': 1,\n", + " 'span': [0, 79]}],\n", + " 'text': 'ยท Distance from the Sun: Average of 149.6 million '\n", + " 'kilometers (93 million miles)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 452.86901855,\n", + " 317.90722656,\n", + " 463.24041748],\n", + " 'page': 1,\n", + " 'span': [0, 37]}],\n", + " 'text': 'ยท Rotation Period: 24 hours (one day)',\n", + " 'type': 'paragraph'},\n", + " {'name': 'List-item',\n", + " 'prov': [{'bbox': [145.94500732,\n", + " 440.71496582,\n", + " 396.66357422,\n", + " 451.19915771],\n", + " 'page': 1,\n", + " 'span': [0, 52]}],\n", + " 'text': 'ยท Moons: One moon, called Luna or simply \"the Moon\".',\n", + " 'type': 'paragraph'},\n", + " {'name': 'Page-footer',\n", + " 'prov': [{'bbox': [303.13299561,\n", + " 87.20314026,\n", + " 308.11428833,\n", + " 96.53633118],\n", + " 'page': 1,\n", + " 'span': [0, 1]}],\n", + " 'text': '1',\n", + " 'type': 'page-footer'}],\n", + " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", + " 'page-footers': [],\n", + " 'page-headers': [],\n", + " 'tables': [],\n", + " 'type': 'pdf-document'}\n" + ] + } + ], + "source": [ + "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" + ] + }, + { + "cell_type": "markdown", + "id": "72274586", + "metadata": { + "id": "72274586" + }, + "source": [ + "## Step-4: Doc chunks\n", + "\n", + "In the previous step, we have extracted text from our PDFs.
But we have the content of entire file as 'one row' in our parquet output.\n", + "\n", + "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", + "\n", + "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", + "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", + "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", + "which provides the required JSON structure." + ] + }, + { + "cell_type": "markdown", + "id": "96198fa6", + "metadata": { + "id": "96198fa6" + }, + "source": [ + "### 4.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "305f00a3", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "305f00a3", + "outputId": "689f1531-7007-49d9-9a27-39c39f8f2c50" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" + ] + } + ], + "source": [ + "STAGE = 2\n", + "\n", + "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_chunk_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "369f2cd1", + "metadata": { + "id": "369f2cd1" + }, + "source": [ + "### 4.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5b7b18d5", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + 
}, + "id": "5b7b18d5", + "outputId": "0146bd91-2ccb-4e56-c649-f415a38bfcf8" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:31:12 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", + "13:31:12 INFO - pipeline id pipeline_id\n", + "13:31:12 INFO - code location None\n", + "13:31:12 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:12 INFO - actor creation delay 0\n", + "13:31:12 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:12 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", + "13:31:12 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:12 INFO - Running locally\n", + "2024-10-18 13:31:14,121\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - orchestrator started at 2024-10-18 13:31:16\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.963891602121294, 'object_store': 7.4819458005949855}\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - Completed processing 2 files in 0.032 min\n", + "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - done flushing in 0.001 sec\n", + "13:31:28 INFO - Completed execution in 0.269 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:2 completed successfully\n", + "CPU times: user 982 ms, sys: 291 ms, total: 1.27 s\n", + "Wall time: 18.9 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from doc_chunk_transform_ray import DocChunkRayTransformConfiguration\n", + "\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # doc_chunk arguments\n", + " # ...\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = RayTransformLauncher(DocChunkRayTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "213afdf6", + "metadata": { + "id": "213afdf6" + }, + "source": [ + "### 4.3 - Inspect Generated output\n", + "\n", + "We would see documents are split into many chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d8138d43", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 897 + }, + "id": "d8138d43", + "outputId": "e1758b0c-5f22-4368-c3e6-ff778fc9ae82" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Files processed : 2\n", + "Chunks created : 8\n", + "Input data dimensions (rows x columns)= (2, 12)\n", + "Output data dimensions (rows x columns)= (8, 16)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 mars.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "7 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", + "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 
" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (f\"Files processed : {input_df.shape[0]:,}\")\n", + "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "9e9ca75c", + "metadata": { + "id": "9e9ca75c" + }, + "source": [ + "### 4.4 - Understanding the Output\n", + "\n", + "Here we see 2 PDF files are split into 6 chunks. Basically we see the documents are being split along 'natural boundaris' - paragraphs and bullet points\n", + "\n", + "See how **document_id** is carried throughout. This helps us identify original documents.\n", + "\n", + "Also note **contents** is now plain text (not JSON as before)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3090c950", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "3090c950", + "outputId": "3f542446-2cfa-404c-c642-3732f7b74568" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfSolar System\\nFor more details about the Solar...
2mars.pdfMars\\nMars, the fourth planet from the Sun, is...
3mars.pdfBasic facts about Mars:\\nยท Distance from the S...
4earth.pdfSolar System\\nOur solar system is a vast and f...
5earth.pdfSolar System\\nFor more details about our Solar...
6earth.pdfEarth\\nEarth is the third planet from the Sun....
7earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" + ], + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", + "1 mars.pdf Solar System\\nFor more details about the Solar...\n", + "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "3 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", + "5 earth.pdf Solar System\\nFor more details about our Solar...\n", + "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "7 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d5f151ae", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d5f151ae", + "outputId": "4616d648-0852-4ecb-cef8-f5940e176de0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about the Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 3------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 3------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "20217298", + "metadata": { + "id": "20217298" + }, + "source": [ + "## Step-5: DOC ID generation\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. 
The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", + "\n", + "**This is a pre-requisite for fuzzy dedup** in the pipeline." + ] + }, + { + "cell_type": "markdown", + "id": "66811f5b", + "metadata": { + "id": "66811f5b" + }, + "source": [ + "### 5.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "1f747c0d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1f747c0d", + "outputId": "e42500b7-5d1e-41fd-b53b-34d3393f36f4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" + ] + } + ], + "source": [ + "\n", + "# Input for this stage is the output of exact dedeup component\n", + "# output of this component makes it possible for fdedup component to run on data.\n", + "\n", + "STAGE = 3\n", + "\n", + "input_folder = output_chunk_dir\n", + "output_folder = output_docid_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "18aa0fe1", + "metadata": { + "id": "18aa0fe1" + }, + "source": [ + "### 5.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "f6e9e145", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f6e9e145", + "outputId": "2add5f0c-3ab6-4336-8a7b-ac8b1b76ab73" + }, + "outputs": [ + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "13:31:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", + "13:31:29 INFO - pipeline id pipeline_id\n", + "13:31:29 INFO - code location None\n", + "13:31:29 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:29 INFO - actor creation delay 0\n", + "13:31:29 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:29 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", + "13:31:29 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:29 INFO - Running locally\n", + "2024-10-18 13:31:31,792\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - orchestrator started at 2024-10-18 13:31:32\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.033103181049228, 'object_store': 7.516551589593291}\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - Completed processing 2 files in 0.012 min\n", + "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", + "13:31:43 INFO - Completed execution in 0.228 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:3 completed successfully\n", + "CPU times: user 123 ms, sys: 145 ms, total: 267 ms\n", + "Wall time: 15.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # doc id configuration\n", + " \"doc_id_doc_column\": \"contents\",\n", + " \"doc_id_hash_column\": \"chunk_hash\",\n", + " \"doc_id_int_column\": \"chunk_id\",\n", + "}\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# launch\n", + "\n", + "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "4954402f", + "metadata": { + "id": "4954402f" + }, + "source": [ + "### 5.3 - Inspect Generated output\n", + "\n", + "You will notice we have two 
extra columns\n", + "\n", + "- **hash_column**\n", + "- **int_id_column**\n", + "\n", + "But still the same number of rows as before" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1911179a", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 860 + }, + "id": "1911179a", + "outputId": "45e83e2a-1f70-46b9-e311-c50f025419be" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 16)\n", + "Output data dimensions (rows x columns)= (8, 18)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 mars.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "7 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", + "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \\\n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + "\n", + " chunk_hash chunk_id \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", + "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", + "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", + "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", + "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", + "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", + "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 
2 \n", + "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "852829dc", + "metadata": { + "id": "852829dc" + }, + "source": [ + "## Step-6: Exact Dedup\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", + "metadata": { + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" + }, + "source": [ + "### 6.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "4c7a1b94", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4c7a1b94", + "outputId": "40a119b4-44fc-483d-9ad0-da178a2a8eb1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" + ] + } + ], + "source": [ + "STAGE = 4\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_exact_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", + "metadata": { + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" + }, + "source": [ + "### 6.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/" + }, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "outputId": "bd0f3f94-8c48-4c6b-b911-858e389243f4" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:31:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "13:31:45 INFO - pipeline id pipeline_id\n", + "13:31:45 INFO - code location None\n", + "13:31:45 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:45 INFO - actor creation delay 0\n", + "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", + "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:45 INFO - Running locally\n", + "2024-10-18 13:31:47,001\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - orchestrator started at 2024-10-18 13:31:48\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.010423279367387, 'object_store': 7.505211639218032}\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed processing 2 files in 0.013 min\n", + "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - done flushing in 0.001 sec\n", + "13:31:58 INFO - Completed execution in 0.228 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:4 completed successfully\n", + "CPU times: user 136 ms, sys: 154 ms, total: 289 ms\n", + "Wall time: 15.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n", + "\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # ededup parameters\n", + " \"ededup_hash_cpu\": 0.5,\n", + " \"ededup_num_hashes\": 2,\n", + " \"ededup_doc_column\": \"contents\",\n", + " \"ededup_doc_id_column\": \"chunk_hash\",\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "eaf1c3c3", + "metadata": { + "id": "eaf1c3c3" + }, + "source": [ + "### 6.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d824ebf6", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 815 + }, + "id": "d824ebf6", + "outputId": "9173efb6-1b95-4a7e-b531-1a611841a4d0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 18)\n", + "Output data dimensions (rows x columns)= (7, 19)\n", + "Input chunks before exact dedupe : 8\n", + "Output chunks after exact dedupe : 7\n", + "Duplicate chunks removed : 1\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremoved
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[]
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[]
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[]
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[]
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[]
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[]
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "6 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nOur solar system is a vast and f... 
$.main-text[2] \n", + "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", + "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + "\n", + " chunk_hash chunk_id \\\n", + "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", + "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", + "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", + "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", + "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", + "\n", + " removed \n", + "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... 
\n", + "1 [] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + "5 [] \n", + "6 [] " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "82cc9bb0", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "id": "82cc9bb0", + "outputId": "e043fa01-ceca-49ae-b764-8154219c7b6c" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nFor more details about the Solar...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nOur solar system is a vast and f...
4earth.pdfSolar System\\nFor more details about our Solar...
5earth.pdfEarth\\nEarth is the third planet from the Sun....
6earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" + ], + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nFor more details about the Solar...\n", + "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", + "4 earth.pdf Solar System\\nFor more details about our Solar...\n", + "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "6 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "cc61dffa", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cc61dffa", + "outputId": "aff7a0d9-a791-42a5-d5b7-ad643f59f261" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "For more details about the Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 1------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 2------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 3------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "383f40ba", + "metadata": { + "id": "383f40ba" + }, + "source": [ + "### 6.4 - Understanding the output\n", + "\n", + "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", + "\n", + "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! Pretty neat, eh!\n", + "\n", + "```text\n", + "## Solar System\n", + "\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "85309751-8556-41c6-ac32-84acc941bc8d", + "metadata": { + "id": "85309751-8556-41c6-ac32-84acc941bc8d" + }, + "source": [ + "## Step-7: Fuzzy Dedup\n", + "\n", + "Post exact deduplication, fuzzy deduplication is applied with the goal of removing code files that may have **slight variations** and thereby unbiasing\n", + "the data further.\n", + "\n", + "Small variations are quite commonly seen in code data in the form of variations in the values of variables, addition of logging statements, etc." + ] + }, + { + "cell_type": "markdown", + "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", + "metadata": { + "id": "fcf574a3-b287-419c-9c86-07b828b41ca6" + }, + "source": [ + "### 7.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", + "outputId": "d53a92d2-0f1c-465f-f11c-b9bc2931f651" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-5: Processing input='output/03_docid_out' --> output='output/05_fuzzy_dedupe_out'\n" + ] + } + ], + "source": [ + "## Input to this component is the output of doc_id generator component.\n", + "\n", + "STAGE = 5\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_fuzzy_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", + "metadata": { + "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3" + }, + "source": [ + "### 7.2 - Execute" + ] + }, + { + "cell_type": 
"code", + "execution_count": 27, + "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", + "outputId": "1e63d364-3944-465a-ff7c-6e1dc750b2de" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:32:00 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n", + "13:32:00 INFO - pipeline id pipeline_id\n", + "13:32:00 INFO - code location None\n", + "13:32:00 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:32:00 INFO - actor creation delay 0\n", + "13:32:00 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:32:00 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/05_fuzzy_dedupe_out\n", + "13:32:00 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:00 INFO - Running locally\n", + "2024-10-18 13:32:02,246\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - orchestrator started at 2024-10-18 13:32:03\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.000544739887118, 'object_store': 7.500272369012237}\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - starting run from the beginning\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - continuing from the very beginning\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Fuzzy: num buckets 8, bucket length 8\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 bucket actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 minhash actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Table preprocessing uses 1 readers\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 table processor actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files in 0.064 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files (50.0%) in 0.064 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - Completed processing 2 files in 0.197 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - creating minhash snapshots\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - minhash snapshots created\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - creating bucket snapshots\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - bucket snapshots created\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 document actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 bucket processor actors\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created bucket processor invoker\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - added invoker to bucket collectors\n", + "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - processing buckets 0 long, 53 short\n", + "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - Done submitting long buckets\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - Done processing buckets in 0.01 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - creating document snapshots\n", + "\u001b[36m(BucketsHashProcessorInvoker pid=16602)\u001b[0m 13:32:17 INFO - Waiting bucket processing completion. Submitted requests 1\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - document snapshots created\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - Completed processing 2 files in 0.113 min\n", + "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - done flushing in 0.005 sec\n", + "13:32:35 INFO - Completed execution in 0.588 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:5 completed successfully\n", + "CPU times: user 270 ms, sys: 200 ms, total: 470 ms\n", + "Wall time: 36.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "\n", + "# create parameters\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # Orchestration parameters\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # columns used\n", + " \"fdedup_doc_column\": \"contents\",\n", + " \"fdedup_id_column\": \"chunk_id\",\n", + " \"fdedup_cluster_column\": \"chunk_hash\",\n", + " # infrastructure\n", + " \"fdedup_bucket_cpu\": 0.3,\n", + " \"fdedup_doc_cpu\": 0.3,\n", + " \"fdedup_mhash_cpu\": 0.3,\n", + " \"fdedup_num_doc_actors\": 1,\n", + " \"fdedup_num_bucket_actors\": 1,\n", + " \"fdedup_num_minhash_actors\": 1,\n", + " \"fdedup_num_preprocessors\": 1,\n", + " # fuzzy parameters\n", + " \"fdedup_num_permutations\": 64,\n", + " \"fdedup_threshold\": 0.7, # (default 0.8)\n", + " \"fdedup_shingles_size\": 5,\n", + " \"fdedup_delimiters\": \" \"\n", + "}\n", + "\n", + "# Pass commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# launch\n", + "\n", + "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "a6f8cd11", + "metadata": { + "id": "a6f8cd11" + }, + "source": [ + "### 7.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "e899ad60", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 677 + }, + "id": "e899ad60", + "outputId": "fcfda84c-ebbf-490f-f478-ceef7ca9e83b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (8, 18)\n", + "Output data dimensions (rows x columns)= (6, 18)\n", + "Duplicate chunks removed by fuzzy-dedupe: 2\n" + ] + 
}, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hash
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... 
$.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id chunk_id chunk_hash \n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", + "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", + "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", + "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "ab7ea52b", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 + }, + "id": "ab7ea52b", + "outputId": "e38754ee-777f-4ed7-ebc0-9299ee122662" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nFor more details about our Solar...
4earth.pdfEarth\\nEarth is the third planet from the Sun....
5earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", + "
" + ], + "text/plain": [ + " filename contents\n", + "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", + "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", + "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", + "3 earth.pdf Solar System\\nFor more details about our Solar...\n", + "4 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", + "5 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df[['filename', 'contents']]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "6bdd3515", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6bdd3515", + "outputId": "e6e3f2c0-5b23-4336-bc95-013921f0724a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "========== mars.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "-------\n", + "-------Chunk 1------\n", + "Mars\n", + "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", + "-------\n", + "-------Chunk 2------\n", + "Basic facts about Mars:\n", + "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", + "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", + "ยท Moons: Two small moons, Phobos and Deimos.\n", + "-------\n", + "========== earth.pdf ===========\n", + "-------Chunk 0------\n", + "Solar System\n", + "For more details about our Solar system see Chapter 1.\n", + "-------\n", + "-------Chunk 1------\n", + "Earth\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "-------\n", + "-------Chunk 2------\n", + "Earth\n", + "Basic facts about Earth:\n", + "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "ยท Rotation Period: 24 hours (one day)\n", + "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "-------\n" + ] + } + ], + "source": [ + "for f in output_df['filename'].unique():\n", + " print ('==========' , f, '===========')\n", + " chunks = output_df[output_df['filename'] == f]['contents']\n", + " for idx , chunk in enumerate(chunks):\n", + " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + ] + }, + { + "cell_type": "markdown", + "id": "2b34d9c6", + "metadata": { + "id": "2b34d9c6" + }, + "source": [ + "### 7.4- Understanding the output\n", + "\n", + "So we started with 8 rows and ended up with 6, i.e. fuzzy dedupe removed 2 chunks. One of them is the following **very similar** chunk.\n", + "\n", + "These are pretty similar chunks except for the words 'the' and 'our'\n", + "\n", + "**earth.pdf**\n", + "\n", + "`For more details about *our* Solar system see Chapter 1.`\n", + "\n", + "**mars.pdf**\n", + "\n", + "`For more details about *the* Solar system see Chapter 1.`\n", + "\n", + "Pretty neat, eh? 
๐Ÿ‘\n", + "\n", + "### Configuring Fuzzy de-dupe\n", + "\n", + "You can tweak fuzzy dedupe by tweaking the following parameters\n", + "\n", + "```python\n", + "# fuzzy parameters\n", + " \"fdedup_num_permutations\": 64,\n", + " \"fdedup_threshold\": 0.7, # (default 0.8)\n", + " \"fdedup_shingles_size\": 5,\n", + " \"fdedup_delimiters\": \" \"\n", + "```\n", + "\n", + "In our case, we set `fdedup_threshold` parameter to 0.7. \n" + ] + }, + { + "cell_type": "markdown", + "id": "5370950a-2a3a-4143-8218-f9b4808099ba", + "metadata": { + "id": "5370950a-2a3a-4143-8218-f9b4808099ba" + }, + "source": [ + "## Step-8: Text encoding\n", + "\n", + "Encode text for the vector storage." + ] + }, + { + "cell_type": "markdown", + "id": "85aba685", + "metadata": { + "id": "85aba685" + }, + "source": [ + "### 8.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "outputId": "530e65c6-7ceb-4c73-cb87-50da46c78add" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n" + ] + } + ], + "source": [ + "STAGE = 6\n", + "\n", + "input_folder = output_fuzzy_dedupe_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_embeddings_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "c97545f4", + "metadata": { + "id": "c97545f4" + }, + "source": [ + "### 8.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/", + "height": 914, + "referenced_widgets": [ + "8b7571c585df431eb901fcdebdf8177e", + "06107a2f48b3491f91bbe84e46e10ba0", + "bd74356eca18423aa0373c808d9097e3", + "7e13e8779a81400f996d4428c74acfaf", + "a75892696be546a3970962bae7bf732a", + "68997339f13240a4824a9e416096bee4", + "919b086abd314077bbff75687392bd91", + "b4c209371e7a403986991a786cfb296d", + "6c08de2dd9a2402c90b1a7a645db9b13", + "91fff81a1de8487c9009e872b751edb0", + "ada62d24cbcf4361acbb21808f334d33" + ] + }, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "outputId": "b10eecc1-cd17-49c1-e3b1-b80e0e1bfa86" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:32:37 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", + "13:32:37 INFO - pipeline id pipeline_id\n", + "13:32:37 INFO - code location None\n", + "13:32:37 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:32:37 INFO - actor creation delay 0\n", + "13:32:37 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:32:37 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n", + "13:32:37 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:37 INFO - Running locally\n", + "2024-10-18 13:32:39,609\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - orchestrator started at 2024-10-18 13:32:42\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of files is 2, source profile {'max_file_size': 0.009654045104980469, 'min_file_size': 0.00907135009765625, 'total_file_size': 0.01872539520263672}\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.943363189697266, 'object_store': 7.471681594848633}\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - Completed processing 2 files in 0.087 min\n", + "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - done flushing in 0.001 sec\n", + "13:32:57 INFO - Completed execution in 0.333 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:6 completed successfully\n", + "CPU times: user 607 ms, sys: 226 ms, total: 833 ms\n", + "Wall time: 22.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from text_encoder_transform_ray import TextEncoderRayTransformConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # where to run\n", + " \"run_locally\": True,\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # orchestrator\n", + " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", + " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", + " # text_encoder\n", + " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", + "}\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "# create launcher\n", + "launcher = RayTransformLauncher(TextEncoderRayTransformConfiguration())\n", + "# Launch the ray actor(s) to process the input\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"โŒ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "b734852c", + "metadata": { + "id": "b734852c" + }, + "source": [ + "### 8.3 - Inspect Generated output\n", + "\n", + "You will see a column called `embeddings` added at the end. This is the text content converted into vectors or embeddings. We used the model `sentence-transformers/all-MiniLM-L6-v2`" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "7b1c1d09", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 659 + }, + "id": "7b1c1d09", + "outputId": "70612634-b336-4ad5-ddb3-782ca0676bae" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (6, 18)\n", + "Output data dimensions (rows x columns)= (6, 19)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hashembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1[0.0077404897, -0.020559434, 0.026426662, 0.01...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1[0.07728298, 0.024971062, -0.04318075, 0.05809...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1[0.1059802, 0.025460616, 0.02362733, 0.0390564...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15[-0.062105577, -0.0053322953, 0.03127779, 0.04...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1[0.0724358, -0.058001805, -0.01977186, -0.0243...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1[0.091821924, 0.015197907, 0.07716932, 0.01711...
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 mars.pdf 1 0 11 pdf \n", + "1 mars.pdf 1 0 11 pdf \n", + "2 mars.pdf 1 0 11 pdf \n", + "3 earth.pdf 1 0 11 pdf \n", + "4 earth.pdf 1 0 11 pdf \n", + "5 earth.pdf 1 0 11 pdf \n", + "\n", + " hash size \\\n", + "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", + "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", + "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + "\n", + " source_document_id \\\n", + "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", + "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + "\n", + " contents doc_jsonpath \\\n", + "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", + "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", + "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", + "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", + "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", + "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... 
$.main-text[6] \n", + "\n", + " page_number bbox \\\n", + "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", + "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", + "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", + "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", + "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", + "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + "\n", + " document_id chunk_id chunk_hash \\\n", + "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", + "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", + "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", + "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", + "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", + "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 \n", + "\n", + " embeddings \n", + "0 [0.0077404897, -0.020559434, 0.026426662, 0.01... \n", + "1 [0.07728298, 0.024971062, -0.04318075, 0.05809... \n", + "2 [0.1059802, 0.025460616, 0.02362733, 0.0390564... \n", + "3 [-0.062105577, -0.0053322953, 0.03127779, 0.04... \n", + "4 [0.0724358, -0.058001805, -0.01977186, -0.0243... \n", + "5 [0.091821924, 0.015197907, 0.07716932, 0.01711... 
" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from my_utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "f5e12630-be6b-4188-a925-77117155617b", + "metadata": { + "id": "f5e12630-be6b-4188-a925-77117155617b" + }, + "source": [ + "## Step-9: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "outputId": "d151e618-6528-40b5-fdbd-1c67291a7279" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n" + ] + } + ], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", + "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", + "\n", + "print (f\"โœ… Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "dc0a6728", + "metadata": { + "id": "dc0a6728" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "dpk-2-basic-021-py311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + }, + "widgets": { + 
"application/vnd.jupyter.widget-state+json": { + "06107a2f48b3491f91bbe84e46e10ba0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_68997339f13240a4824a9e416096bee4", + "placeholder": "โ€‹", + "style": "IPY_MODEL_919b086abd314077bbff75687392bd91", + "value": "" + } + }, + "68997339f13240a4824a9e416096bee4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6c08de2dd9a2402c90b1a7a645db9b13": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7e13e8779a81400f996d4428c74acfaf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_91fff81a1de8487c9009e872b751edb0", + "placeholder": "โ€‹", + "style": "IPY_MODEL_ada62d24cbcf4361acbb21808f334d33", + "value": "โ€‡0/0โ€‡[00:00<?,โ€‡?it/s]" + } + }, + "8b7571c585df431eb901fcdebdf8177e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_06107a2f48b3491f91bbe84e46e10ba0", + "IPY_MODEL_bd74356eca18423aa0373c808d9097e3", + "IPY_MODEL_7e13e8779a81400f996d4428c74acfaf" + ], + "layout": "IPY_MODEL_a75892696be546a3970962bae7bf732a" + } + }, + "919b086abd314077bbff75687392bd91": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "91fff81a1de8487c9009e872b751edb0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a75892696be546a3970962bae7bf732a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ada62d24cbcf4361acbb21808f334d33": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b4c209371e7a403986991a786cfb296d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "bd74356eca18423aa0373c808d9097e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b4c209371e7a403986991a786cfb296d", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6c08de2dd9a2402c90b1a7a645db9b13", + "value": 0 + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/intro/images/data-prep-kit-3-workflow.excalidraw b/examples/notebooks/intro/images/data-prep-kit-3-workflow.excalidraw new file mode 100644 index 000000000..c0525c556 --- /dev/null +++ b/examples/notebooks/intro/images/data-prep-kit-3-workflow.excalidraw @@ -0,0 +1,2832 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "type": "image", + "version": 128, + "versionNonce": 146671843, + "index": "b45", + "isDeleted": false, + "id": "nQdFTOsh8Rjwn3poFcnOO", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 258.1818181818182, + "y": 213.63636363636363, + "strokeColor": 
"transparent", + "backgroundColor": "transparent", + "width": 64, + "height": 64, + "seed": 222183398, + "groupIds": [ + "4aSnKsxGoqeqA7eYu4s2e" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726186954844, + "link": null, + "locked": false, + "status": "saved", + "fileId": "83ba3062a1490699e3ccc129acb25b1f4ec5534d", + "scale": [ + 1, + 1 + ] + }, + { + "type": "image", + "version": 240, + "versionNonce": 2054222979, + "index": "b46", + "isDeleted": false, + "id": "hlPJZs7lUbLYhuRbSmYHs", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 260.90909090909093, + "y": 285.4545454545455, + "strokeColor": "transparent", + "backgroundColor": "transparent", + "width": 64, + "height": 64, + "seed": 961787386, + "groupIds": [ + "4aSnKsxGoqeqA7eYu4s2e" + ], + "frameId": null, + "roundness": null, + "boundElements": [ + { + "id": "FVhCmDYbWjGck9rgcESwp", + "type": "arrow" + }, + { + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + } + ], + "updated": 1726186954844, + "link": null, + "locked": false, + "status": "saved", + "fileId": "83ba3062a1490699e3ccc129acb25b1f4ec5534d", + "scale": [ + 1, + 1 + ] + }, + { + "type": "arrow", + "version": 2550, + "versionNonce": 1240871476, + "index": "b47", + "isDeleted": false, + "id": "FVhCmDYbWjGck9rgcESwp", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 823.5583207607388, + "y": 273.73602641681657, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "width": 154.2895204048931, + "height": 2.3372664247598323, + "seed": 1954615226, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1726708776348, + "link": null, + "locked": false, + "startBinding": { + "elementId": "Wxv71stEiYRpNjyhzzXgO", + "focus": 1.202109076005182, + "gap": 9.103775306193256, + "fixedPoint": null + }, + 
"endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 154.2895204048931, + 2.3372664247598323 + ] + ] + }, + { + "type": "text", + "version": 324, + "versionNonce": 1281521869, + "index": "b4M", + "isDeleted": false, + "id": "zSJvmm-7DrsR5-qRb96Kl", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 595.4118679291607, + "y": 242.27481706603328, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "width": 141.51840079198635, + "height": 59.453152259008114, + "seed": 409665722, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [ + { + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + }, + { + "id": "0wYqjwjKHCGbx7CfmDR__", + "type": "arrow" + } + ], + "updated": 1726186894805, + "link": null, + "locked": false, + "fontSize": 23.781260903603247, + "fontFamily": 1, + "text": "2. split into\nchunks", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "2. 
split into\nchunks", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "arrow", + "version": 848, + "versionNonce": 138401069, + "index": "b4N", + "isDeleted": false, + "id": "JMprrs8mNVD4CrqUlVm7i", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 329.1268602850381, + "y": 278.24885892455757, + "strokeColor": "#2f9e44", + "backgroundColor": "#b2f2bb", + "width": 185.2530890548909, + "height": 2.823455039174007, + "seed": 1319994682, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1726186962183, + "link": null, + "locked": false, + "startBinding": { + "elementId": "hlPJZs7lUbLYhuRbSmYHs", + "focus": -1.189794049219074, + "gap": 7.205686529987929, + "fixedPoint": null + }, + "endBinding": { + "elementId": "YFlD_rDw6IwCctPG9BjYf", + "focus": 1.1403432588201572, + "gap": 6.460959750980123, + "fixedPoint": null + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 185.2530890548909, + -2.823455039174007 + ] + ] + }, + { + "type": "text", + "version": 757, + "versionNonce": 361576332, + "index": "b4O", + "isDeleted": false, + "id": "G0k27V_VE7lyh7YGr_fts", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 1128.9917648038, + "y": 212.9780740734803, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 110.85037231445312, + "height": 58.225670034857664, + "seed": 970452474, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [ + { + "id": "FVhCmDYbWjGck9rgcESwp", + "type": "arrow" + } + ], + "updated": 1726708803406, + "link": null, + "locked": false, + "fontSize": 23.290268013943066, + "fontFamily": 1, + "text": "4. dedupe\n(exact)", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "4. 
dedupe\n(exact)", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 598, + "versionNonce": 1689279715, + "index": "b4g", + "isDeleted": false, + "id": "XUbC5cWQCm-GEFrdqZW7g", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 333.94038113680745, + "y": 243.15978750685963, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "width": 173.54608154296875, + "height": 28.457738187179977, + "seed": 1458850132, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [ + { + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + } + ], + "updated": 1726187078639, + "link": null, + "locked": false, + "fontSize": 22.766190549743982, + "fontFamily": 1, + "text": "1. extract text", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1. extract text", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "image", + "version": 145, + "versionNonce": 1461008621, + "index": "b4h", + "isDeleted": false, + "id": "XH-Rt0Q5-K2g4tM9reh76", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 520.8409090909091, + "y": 209.88636363636368, + "strokeColor": "transparent", + "backgroundColor": "transparent", + "width": 64, + "height": 64, + "seed": 1159948140, + "groupIds": [ + "KKvJ56bTHwzAbN8YXYU0-" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726186894805, + "link": null, + "locked": false, + "status": "saved", + "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", + "scale": [ + 1, + 1 + ] + }, + { + "type": "image", + "version": 193, + "versionNonce": 1127846733, + "index": "b4i", + "isDeleted": false, + "id": "YFlD_rDw6IwCctPG9BjYf", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 520.8409090909091, + "y": 
279.8863636363637, + "strokeColor": "transparent", + "backgroundColor": "transparent", + "width": 64, + "height": 64, + "seed": 1369151980, + "groupIds": [ + "KKvJ56bTHwzAbN8YXYU0-" + ], + "frameId": null, + "roundness": null, + "boundElements": [ + { + "id": "0wYqjwjKHCGbx7CfmDR__", + "type": "arrow" + }, + { + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + } + ], + "updated": 1726186894805, + "link": null, + "locked": false, + "status": "saved", + "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", + "scale": [ + 1, + 1 + ] + }, + { + "type": "arrow", + "version": 753, + "versionNonce": 1832909987, + "index": "b4j", + "isDeleted": false, + "id": "0wYqjwjKHCGbx7CfmDR__", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 587.6995151292258, + "y": 276.08728311464677, + "strokeColor": "#2f9e44", + "backgroundColor": "#b2f2bb", + "width": 160.10395921482052, + "height": 0.6238794650969908, + "seed": 1397245780, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1726186894829, + "link": null, + "locked": false, + "startBinding": { + "elementId": "YFlD_rDw6IwCctPG9BjYf", + "focus": -1.1101505124640194, + "gap": 3.799080521716917, + "fixedPoint": null + }, + "endBinding": { + "elementId": "zSJvmm-7DrsR5-qRb96Kl", + "focus": -0.1259939432648205, + "gap": 10.873205622899263, + "fixedPoint": null + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 160.10395921482052, + -0.6238794650969908 + ] + ] + }, + { + "type": "text", + "version": 19, + "versionNonce": 1725165603, + "index": "b4t", + "isDeleted": false, + "id": "56KAsZE3Fub50OzL9XJ35", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 344.7055268721148, + "y": 290.01136363636374, + "strokeColor": "#1e1e1e", + "backgroundColor": 
"transparent", + "width": 137.6798553466797, + "height": 25, + "seed": 961622755, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726187031887, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "(pdf2parquet)", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "(pdf2parquet)", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 89, + "versionNonce": 1217800429, + "index": "b4u", + "isDeleted": false, + "id": "GEwyTqhl4LrSwcaOeKRT5", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 514.7055268721148, + "y": 356.01136363636374, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 74.97993469238281, + "height": 50, + "seed": 31755757, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726187172155, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "parquet\nfiles", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "parquet\nfiles", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 273, + "versionNonce": 821721012, + "index": "b5F", + "isDeleted": false, + "id": "ZGkHBN9UBrJLYPIlm-KTj", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1355.555487199263, + "y": 305.51136363636374, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 118.5198974609375, + "height": 50, + "seed": 1591407981, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708923087, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "duplicate 'B'\nis removed", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + 
"originalText": "duplicate 'B'\nis removed", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 747, + "versionNonce": 104645940, + "index": "b5G", + "isDeleted": false, + "id": "DolT9H5aqzEugA7sUfNlx", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 827.643003983931, + "y": 226.3985286189349, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 166.41502380371094, + "height": 29.112835017428832, + "seed": 466678605, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708795102, + "link": null, + "locked": false, + "fontSize": 23.290268013943066, + "fontFamily": 1, + "text": "3. document id", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "3. document id", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "arrow", + "version": 1071, + "versionNonce": 474965812, + "index": "b5U", + "isDeleted": false, + "id": "cXhTkxU13WdQeAv3Z_1mR", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 1318.993474938044, + "y": 401.3233033689122, + "strokeColor": "#2f9e44", + "backgroundColor": "#b2f2bb", + "width": 0.8539592148204065, + "height": 113.62612053490295, + "seed": 605419139, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1726709016812, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 0.8539592148204065, + 113.62612053490295 + ] + ] + }, + { + "type": "text", + "version": 976, + "versionNonce": 988237964, + "index": "b5V", + "isDeleted": false, + "id": "Ba_pxAykcwH_ZsTbAtduc", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, 
+ "opacity": 100, + "angle": 0, + "x": 1218.815207047896, + "y": 429.9549461276493, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 184.07017517089844, + "height": 29.112835017428832, + "seed": 1665190893, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726709020882, + "link": null, + "locked": false, + "fontSize": 23.290268013943066, + "fontFamily": 1, + "text": "5. fuzzy dedupe", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5. fuzzy dedupe", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 580, + "versionNonce": 693951668, + "index": "b5h", + "isDeleted": false, + "id": "XFHbtP2KmiHNNjZhz8ajW", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1299.1022727272725, + "y": 517.40625, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 410701101, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "OdGsWefGyr6uqMl0wC6mH" + } + ], + "updated": 1726708989657, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 323, + "versionNonce": 1216816692, + "index": "b5i", + "isDeleted": false, + "id": "OdGsWefGyr6uqMl0wC6mH", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1315.9786418568, + "y": 522.40625, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 13.519989013671875, + "height": 25, + "seed": 593665933, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708989657, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A", + "textAlign": "center", + 
"verticalAlign": "middle", + "containerId": "XFHbtP2KmiHNNjZhz8ajW", + "originalText": "A", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 573, + "versionNonce": 1856782260, + "index": "b5j", + "isDeleted": false, + "id": "NzWqph0M7tEkeTDKLPGZR", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1301.1931818181815, + "y": 564.5880681818182, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 2053187053, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "K1QK2dyVWiWfd32P8ovQK" + } + ], + "updated": 1726708989657, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 264, + "versionNonce": 334637364, + "index": "b5k", + "isDeleted": false, + "id": "K1QK2dyVWiWfd32P8ovQK", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1317.219552473588, + "y": 569.5880681818182, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 1350557773, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708989657, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "NzWqph0M7tEkeTDKLPGZR", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 680, + "versionNonce": 1002365620, + "index": "b5l", + "isDeleted": false, + "id": "Lf5-FqrnO7iDVhOKUtEnT", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1306.9204545454545, + "y": 619.3267045454547, 
+ "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 999837357, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "cTJ-8HZCMcNbXqDHggxAH" + } + ], + "updated": 1726708989657, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 375, + "versionNonce": 213412916, + "index": "b5m", + "isDeleted": false, + "id": "cTJ-8HZCMcNbXqDHggxAH", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1324.2668248956852, + "y": 624.3267045454547, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 12.579986572265625, + "height": 25, + "seed": 1515450637, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708989657, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "C", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Lf5-FqrnO7iDVhOKUtEnT", + "originalText": "C", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 141, + "versionNonce": 1757726132, + "index": "b5n", + "isDeleted": false, + "id": "LK6nmMo09HhGvAeViRfcK", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1274.397727272727, + "y": 523.3664772727274, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 975980397, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708989657, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "1", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1", + "autoResize": true, + 
"lineHeight": 1.25 + }, + { + "type": "text", + "version": 196, + "versionNonce": 761917108, + "index": "b5o", + "isDeleted": false, + "id": "LbPBuhQ2btuEnjbeSDvuK", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1278.397727272727, + "y": 569.6164772727275, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 2104152525, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708993287, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "2", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "2", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 385, + "versionNonce": 800257204, + "index": "b5p", + "isDeleted": false, + "id": "tEnh5H4Dm1tA62FJY7ZnT", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1279.647727272727, + "y": 629.6164772727275, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 1129349773, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726709003336, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "5", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 307, + "versionNonce": 51819060, + "index": "b5q", + "isDeleted": false, + "id": "TExMhRi4612k0BcybcpHE", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1251.2855058149858, + "y": 678.5113636363637, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", 
+ "width": 143.59986877441406, + "height": 50, + "seed": 2082336653, + "groupIds": [ + "XhxUNIV4RRXanIHzjH6vP" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708989657, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "near duplicate \nA' is removed", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "near duplicate \nA' is removed", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "arrow", + "version": 1039, + "versionNonce": 199529869, + "index": "b5r", + "isDeleted": false, + "id": "KvvwHoDnDT0vBh2bOfiTz", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 1245.243474938044, + "y": 579.5733033689121, + "strokeColor": "#2f9e44", + "backgroundColor": "#b2f2bb", + "width": 192.8960407851796, + "height": 1.126120534903066, + "seed": 1004556899, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1726188444758, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + -192.8960407851796, + 1.126120534903066 + ] + ] + }, + { + "type": "text", + "version": 989, + "versionNonce": 923042467, + "index": "b5s", + "isDeleted": false, + "id": "cPSHqIr9Peb5h5TNxl3Bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 0, + "opacity": 100, + "angle": 0, + "x": 1100.5103669600053, + "y": 536.2049461276495, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 138.99639892578125, + "height": 29.112835017428832, + "seed": 865272429, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726188447614, + "link": null, + "locked": false, + "fontSize": 23.290268013943066, + "fontFamily": 1, + "text": 
"6. vectorize", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "6. vectorize", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "diamond", + "version": 103, + "versionNonce": 679668419, + "index": "b5vV", + "isDeleted": false, + "id": "tPvUjMUp7lW3F8V3H2MGV", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 960.0454545454546, + "y": 515.5113636363637, + "strokeColor": "#1e1e1e", + "backgroundColor": "#d0bfff", + "width": 63.75, + "height": 45, + "seed": 782762477, + "groupIds": [ + "CuM_sg3LC9KTYRVST18pX" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1726188516836, + "link": null, + "locked": false + }, + { + "type": "diamond", + "version": 117, + "versionNonce": 224511779, + "index": "b5w", + "isDeleted": false, + "id": "uOIVUAj_hGKNiZ3NnQm2n", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 961.9204545454546, + "y": 564.5113636363637, + "strokeColor": "#1e1e1e", + "backgroundColor": "#d0bfff", + "width": 63.75, + "height": 45, + "seed": 1245990083, + "groupIds": [ + "CuM_sg3LC9KTYRVST18pX" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1726188516836, + "link": null, + "locked": false + }, + { + "type": "diamond", + "version": 122, + "versionNonce": 1205596301, + "index": "b5x", + "isDeleted": false, + "id": "ylh6O0GmjhRAHndHyuEo2", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 966.9204545454546, + "y": 615.7613636363637, + "strokeColor": "#1e1e1e", + "backgroundColor": "#d0bfff", + "width": 63.75, + "height": 45, + "seed": 499397773, + "groupIds": [ + "CuM_sg3LC9KTYRVST18pX" + ], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1726188516836, 
+ "link": null, + "locked": false + }, + { + "type": "text", + "version": 260, + "versionNonce": 1136192621, + "index": "b5y", + "isDeleted": false, + "id": "ekXIjXxtZ6f2w_A-9CVUV", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 938.2855058149859, + "y": 670.7613636363637, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 107.5399169921875, + "height": 25, + "seed": 1616985635, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726188507123, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "embeddings", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "embeddings", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 381, + "versionNonce": 1618061620, + "index": "b5z", + "isDeleted": false, + "id": "Uv-8TiLeECJuuNx1yJjtv", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 768.5454545454545, + "y": 280.72727272727275, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 637818278, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "0wYqjwjKHCGbx7CfmDR__", + "type": "arrow" + }, + { + "type": "text", + "id": "B8Nj-HzRDl-FA-5UJ2hiw" + } + ], + "updated": 1726708776347, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 140, + "versionNonce": 1472181260, + "index": "b60", + "isDeleted": false, + "id": "B8Nj-HzRDl-FA-5UJ2hiw", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 783.2418233698064, + "y": 285.72727272727275, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 
17.879989624023438, + "height": 25, + "seed": 1971906541, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708776347, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A'", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Uv-8TiLeECJuuNx1yJjtv", + "originalText": "A'", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 391, + "versionNonce": 1280205492, + "index": "b61", + "isDeleted": false, + "id": "l7XMM15Xwzq5xmDF0QvyN", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 764.090909090909, + "y": 186.09090909090912, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1556091898, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "SZp9x_uNQ-65LQPMQ768C" + } + ], + "updated": 1726708776347, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 132, + "versionNonce": 809849484, + "index": "b62", + "isDeleted": false, + "id": "SZp9x_uNQ-65LQPMQ768C", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 780.9672782204367, + "y": 191.09090909090912, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 13.519989013671875, + "height": 25, + "seed": 912377443, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708776347, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "l7XMM15Xwzq5xmDF0QvyN", + "originalText": "A", + "autoResize": true, + "lineHeight": 1.25 + }, + { + 
"type": "rectangle", + "version": 413, + "versionNonce": 1599597620, + "index": "b63", + "isDeleted": false, + "id": "Wxv71stEiYRpNjyhzzXgO", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 767.1818181818182, + "y": 234.27272727272725, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 775085434, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "id": "0wYqjwjKHCGbx7CfmDR__", + "type": "arrow" + }, + { + "id": "FVhCmDYbWjGck9rgcESwp", + "type": "arrow" + }, + { + "type": "text", + "id": "zyU1230-bmsHaQTSoi7Ov" + } + ], + "updated": 1726708776347, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 102, + "versionNonce": 1402151180, + "index": "b64", + "isDeleted": false, + "id": "zyU1230-bmsHaQTSoi7Ov", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 783.2081888372248, + "y": 239.27272727272725, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 1842733667, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708776347, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Wxv71stEiYRpNjyhzzXgO", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 397, + "versionNonce": 997475764, + "index": "b65", + "isDeleted": false, + "id": "IkaeA2i4mlTdmulYEI_na", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 771.3636363636363, + "y": 325.3636363636364, + "strokeColor": 
"#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1839286010, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "IgKDOIQhfqb_x9gQh30eh" + } + ], + "updated": 1726708776347, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 89, + "versionNonce": 421732236, + "index": "b66", + "isDeleted": false, + "id": "IgKDOIQhfqb_x9gQh30eh", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 787.3900070190429, + "y": 330.3636363636364, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 1893385699, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708776347, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "IkaeA2i4mlTdmulYEI_na", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 440, + "versionNonce": 1439264564, + "index": "b67", + "isDeleted": false, + "id": "qGfihx9_lQSyc1F8oQTu0", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 772.909090909091, + "y": 369.01136363636374, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1381062179, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "0DIl-np94wHje4sIubFJp" + } + ], + "updated": 1726708776347, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 133, + "versionNonce": 1496272396, + "index": "b68", + 
"isDeleted": false, + "id": "0DIl-np94wHje4sIubFJp", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 790.2554612593218, + "y": 374.01136363636374, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 12.579986572265625, + "height": 25, + "seed": 1722325443, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708776347, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "C", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "qGfihx9_lQSyc1F8oQTu0", + "originalText": "C", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 70, + "versionNonce": 247294132, + "index": "b69", + "isDeleted": false, + "id": "lkM4ke2d8E4KSisX5yE08", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 762.5454545454546, + "y": 429.51136363636374, + "strokeColor": "#1e1e1e", + "backgroundColor": "#d0bfff", + "width": 64.55995178222656, + "height": 25, + "seed": 1905848653, + "groupIds": [ + "wECUsJGvuBUaz0aXhNgT4" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708776347, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "chunks", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "chunks", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 527, + "versionNonce": 1269467404, + "index": "b698", + "isDeleted": false, + "id": "JNHVvikjirDDllCKotbJC", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1025.9545454545455, + "y": 275.68750000000006, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, 
+ "seed": 848769955, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "8Msc7tXcZdg2UUH2NmUn-" + } + ], + "updated": 1726708934863, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 287, + "versionNonce": 1779271564, + "index": "b69G", + "isDeleted": false, + "id": "8Msc7tXcZdg2UUH2NmUn-", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1040.6509142788973, + "y": 280.68750000000006, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 17.879989624023438, + "height": 25, + "seed": 1297532739, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A'", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "JNHVvikjirDDllCKotbJC", + "originalText": "A'", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 565, + "versionNonce": 1888269836, + "index": "b69O", + "isDeleted": false, + "id": "fkbHGW5tJ-Ay0sh8h-9hJ", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1022.5, + "y": 182.05113636363643, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 2116216547, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "BNiP4zX7PtFTn_e_5vXX3" + } + ], + "updated": 1726708934863, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 308, + "versionNonce": 1814172812, + "index": "b69V", + "isDeleted": false, + "id": "BNiP4zX7PtFTn_e_5vXX3", + "fillStyle": "solid", + "strokeWidth": 2, + 
"strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1039.3763691295276, + "y": 187.05113636363643, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 13.519989013671875, + "height": 25, + "seed": 1804210819, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "fkbHGW5tJ-Ay0sh8h-9hJ", + "originalText": "A", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 558, + "versionNonce": 981967628, + "index": "b69d", + "isDeleted": false, + "id": "QYKbNgibs7-HxaNNr8tfG", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1024.590909090909, + "y": 229.23295454545456, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1716177443, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "C-rwFmAbwI_qgVqpkXy7m" + } + ], + "updated": 1726708934863, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 249, + "versionNonce": 1916232076, + "index": "b69l", + "isDeleted": false, + "id": "C-rwFmAbwI_qgVqpkXy7m", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1040.6172797463155, + "y": 234.23295454545456, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 592678339, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + 
"fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "QYKbNgibs7-HxaNNr8tfG", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 653, + "versionNonce": 1248546828, + "index": "b69t", + "isDeleted": false, + "id": "m2Wj9fp76PKCAhrulCmTa", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1027.318181818182, + "y": 365.97159090909105, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 901963107, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "MNgTOO1UYazXucNSjXZ_z" + } + ], + "updated": 1726708934863, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 348, + "versionNonce": 52260492, + "index": "b6A", + "isDeleted": false, + "id": "MNgTOO1UYazXucNSjXZ_z", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1044.6645521684127, + "y": 370.97159090909105, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 12.579986572265625, + "height": 25, + "seed": 1223112963, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "C", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "m2Wj9fp76PKCAhrulCmTa", + "originalText": "C", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 127, + "versionNonce": 1292352780, + "index": "b6AG", + "isDeleted": false, + "id": "J1KVE_C00rdGo7FWIwu1X", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 
0, + "x": 998.7954545454545, + "y": 188.01136363636374, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 1442121325, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "1", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 181, + "versionNonce": 832846732, + "index": "b6AV", + "isDeleted": false, + "id": "TIEDsM4QhNNDJARAJnvDz", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1001.7954545454545, + "y": 234.26136363636374, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 846611715, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "2", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "2", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 229, + "versionNonce": 2066541068, + "index": "b6Al", + "isDeleted": false, + "id": "tGvqUuD_kCzfMYn-UX8o-", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1004.2954545454545, + "y": 283.01136363636374, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 758667053, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": 
"3", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "3", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 360, + "versionNonce": 479971468, + "index": "b6B", + "isDeleted": false, + "id": "IQM8OVr381UGBDKQtda8U", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1004.0454545454545, + "y": 371.26136363636374, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 618433805, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "5", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 611, + "versionNonce": 430626572, + "index": "b6BV", + "isDeleted": false, + "id": "fJGd6Pf-SaTmbDMUGHhUW", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1028.3972327492456, + "y": 322.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1491526540, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "Ax-8fSsrXvrkMhlGAgJgO" + } + ], + "updated": 1726708934863, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 302, + "versionNonce": 1859392908, + "index": "b6C", + "isDeleted": false, + "id": "Ax-8fSsrXvrkMhlGAgJgO", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1044.423603404652, + "y": 327.2812500000001, + "strokeColor": "#1e1e1e", + 
"backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 1943704076, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "fJGd6Pf-SaTmbDMUGHhUW", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 259, + "versionNonce": 2035385356, + "index": "b6CV", + "isDeleted": false, + "id": "07qZABiLS71UbigBsFpnK", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1002.0335963856091, + "y": 327.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 1965424820, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708934863, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "4", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "4", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "arrow", + "version": 2600, + "versionNonce": 1259679372, + "index": "b6D", + "isDeleted": false, + "id": "M_WCuesgPRdSQ_zqaUtz0", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1113.5321305627851, + "y": 279.97561555378826, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "width": 154.2895204048931, + "height": 2.3372664247598323, + "seed": 1489010356, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1726708895234, + "link": null, + "locked": false, + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + 
"startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 154.2895204048931, + 2.3372664247598323 + ] + ] + }, + { + "type": "text", + "version": 176, + "versionNonce": 14571020, + "index": "b6E", + "isDeleted": false, + "id": "wkavhEPwz2TNGwf8xFeLA", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1263.0335963856091, + "y": 188.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 809955212, + "groupIds": [ + "uHtPh4-PiLJtgc-p_Cdgo" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708942969, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "1", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 538, + "versionNonce": 1071049484, + "index": "b6F", + "isDeleted": false, + "id": "Qaz1byDgzm-0ZrVLBmU4v", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1288.9545454545455, + "y": 273.1875000000001, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 144156909, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "D2HbgzHXdGyxGppwaWbBy" + } + ], + "updated": 1726708966705, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 296, + "versionNonce": 2108300212, + "index": "b6G", + "isDeleted": false, + "id": "D2HbgzHXdGyxGppwaWbBy", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1303.6509142788973, + "y": 278.1875000000001, + "strokeColor": 
"#1e1e1e", + "backgroundColor": "transparent", + "width": 17.879989624023438, + "height": 25, + "seed": 2062418765, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708966705, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A'", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Qaz1byDgzm-0ZrVLBmU4v", + "originalText": "A'", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 569, + "versionNonce": 509454732, + "index": "b6H", + "isDeleted": false, + "id": "-LxVJeZLqj0MgI5FEg_pm", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1281.5, + "y": 179.55113636363643, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1514803629, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "trFDjiJr6cfNlCSEKqNjE" + } + ], + "updated": 1726708966705, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 311, + "versionNonce": 1054115124, + "index": "b6I", + "isDeleted": false, + "id": "trFDjiJr6cfNlCSEKqNjE", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1298.3763691295276, + "y": 184.55113636363643, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 13.519989013671875, + "height": 25, + "seed": 1674925069, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708966705, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A", + "textAlign": "center", + "verticalAlign": 
"middle", + "containerId": "-LxVJeZLqj0MgI5FEg_pm", + "originalText": "A", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 566, + "versionNonce": 713594892, + "index": "b6J", + "isDeleted": false, + "id": "Kxu9owye4gMpRvh7kJ1Nl", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1287.590909090909, + "y": 226.73295454545456, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1938377325, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "UP92rSYiIXnnBFhov6WNx" + } + ], + "updated": 1726708966705, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 256, + "versionNonce": 301317812, + "index": "b6K", + "isDeleted": false, + "id": "UP92rSYiIXnnBFhov6WNx", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1303.6172797463157, + "y": 231.73295454545456, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 707753165, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708966705, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Kxu9owye4gMpRvh7kJ1Nl", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 593, + "versionNonce": 5355148, + "index": "b6L", + "isDeleted": false, + "id": "KMOsOR4pOx-ute2ztnw1k", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 
1293.318181818182, + "y": 361.4715909090911, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 635317229, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "SsRO-f6mzQzf5jQOudz6C" + } + ], + "updated": 1726708966705, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 287, + "versionNonce": 800311348, + "index": "b6M", + "isDeleted": false, + "id": "SsRO-f6mzQzf5jQOudz6C", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1310.6645521684127, + "y": 366.4715909090911, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 12.579986572265625, + "height": 25, + "seed": 1382819405, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708966705, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "C", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "KMOsOR4pOx-ute2ztnw1k", + "originalText": "C", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 206, + "versionNonce": 745735436, + "index": "b6N", + "isDeleted": false, + "id": "US1PK13ekocRlMvOrHSJL", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1265.0335963856091, + "y": 231.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 1525760780, + "groupIds": [ + "bQ__H1TgpJXskAm32UBLZ", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708966705, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": 
"2", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "2", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 241, + "versionNonce": 1274323380, + "index": "b6O", + "isDeleted": false, + "id": "NxUqy-MsYDga_9XDrU9l7", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1267.5335963856091, + "y": 277.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 1116920372, + "groupIds": [ + "4mN8vM1PMjtKHfzWdqXES", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708966705, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "3", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "3", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 240, + "versionNonce": 342262668, + "index": "b6P", + "isDeleted": false, + "id": "lSEPKkiY8if2M9pDun8DS", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1270.5335963856091, + "y": 370.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 932194828, + "groupIds": [ + "Z8bVLPerSCYHViV4Ld1Ed", + "XEHMHITFJTjudNYgVFCPu" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1726708966705, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "5", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5", + "autoResize": true, + "lineHeight": 1.25 + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": { + "83ba3062a1490699e3ccc129acb25b1f4ec5534d": { + "mimeType": 
"image/png", + "id": "83ba3062a1490699e3ccc129acb25b1f4ec5534d", + "dataURL": "", + "created": 1711006482453, + "lastRetrieved": 1726708752969 + }, + "fffa228d79e3bc7053142e0031890d5aaf369b8a": { + "mimeType": "image/png", + "id": "fffa228d79e3bc7053142e0031890d5aaf369b8a", + "dataURL": "", + "created": 1721376622438, + "lastRetrieved": 1726708752969 + } + } +} \ No newline at end of file diff --git a/examples/notebooks/intro/images/data-prep-kit-3-workflow.png b/examples/notebooks/intro/images/data-prep-kit-3-workflow.png new file mode 100644 index 000000000..851adbfeb Binary files /dev/null and b/examples/notebooks/intro/images/data-prep-kit-3-workflow.png differ diff --git a/examples/notebooks/intro/input/solar-system/earth.md b/examples/notebooks/intro/input/solar-system/earth.md new file mode 100644 index 000000000..9e086dae2 --- /dev/null +++ b/examples/notebooks/intro/input/solar-system/earth.md @@ -0,0 +1,17 @@ +# Earth + +## Solar System + +Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun. + +For more details about our Solar system see Chapter 1. + +## Earth + +Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life. + +Basic facts about Earth: + +- Distance from the Sun: Average of 149.6 million kilometers (93 million miles) +- Rotation Period: 24 hours (one day) +- Moons: One moon, called Luna or simply "the Moon". 
\ No newline at end of file diff --git a/examples/notebooks/intro/input/solar-system/earth.pdf b/examples/notebooks/intro/input/solar-system/earth.pdf new file mode 100644 index 000000000..b6bc7edc8 Binary files /dev/null and b/examples/notebooks/intro/input/solar-system/earth.pdf differ diff --git a/examples/notebooks/intro/input/solar-system/mars.md b/examples/notebooks/intro/input/solar-system/mars.md new file mode 100644 index 000000000..f28fc1a30 --- /dev/null +++ b/examples/notebooks/intro/input/solar-system/mars.md @@ -0,0 +1,17 @@ +# Mars + +## Solar System + +Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun. + +For more details about the Solar system see Chapter 1. + +## Mars + +Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface. + +Basic facts about Mars: + +- Distance from the Sun: Average of 228 million kilometers (142 million miles) +- Rotation Period: 24.6 hours (one Martian day - called a "sol") +- Moons: Two small moons, Phobos and Deimos. 
\ No newline at end of file diff --git a/examples/notebooks/intro/input/solar-system/mars.pdf b/examples/notebooks/intro/input/solar-system/mars.pdf new file mode 100644 index 000000000..a48c4365b Binary files /dev/null and b/examples/notebooks/intro/input/solar-system/mars.pdf differ diff --git a/examples/notebooks/intro/my_utils.py b/examples/notebooks/intro/my_utils.py new file mode 100644 index 000000000..9a6477dfc --- /dev/null +++ b/examples/notebooks/intro/my_utils.py @@ -0,0 +1,55 @@ +import os +import requests +from humanfriendly import format_size +import pandas as pd +import glob + + +## Reads parquet files in a folder into a pandas dataframe +def read_parquet_files_as_df (parquet_dir): + parquet_files = glob.glob(f'{parquet_dir}/*.parquet') + + # read each parquet file into a DataFrame and store in a list + dfs = [pd.read_parquet (f) for f in parquet_files] + + # Concatenate all DataFrames into a single DataFrame + data_df = pd.concat(dfs, ignore_index=True) + return data_df + + +def download_file(url, local_file, chunk_size=1024*1024): + """ + Downloads a remote URL to a local file. + + Args: + url (str): The remote URL. + local_filename (str): The name of the local file to save the downloaded content. + chunk_size (int): The size in bytes of each chunk. Defaults to 1024. + + Returns: + None + + Example usage: + download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB + """ + # Check if the local file already exists + if os.path.exists(local_file): + file_size = format_size(os.path.getsize(local_file)) + print(f"Local file '{local_file}' ({file_size}) already exists. 
Skipping download.") + return + + # Create the directory if it doesn't exist + os.makedirs(os.path.dirname(local_file), exist_ok=True) + + # Stream the file download + with requests.get(url, stream=True) as r: + r.raise_for_status() + with open(local_file, 'wb') as f: + for chunk in r.iter_content(chunk_size=chunk_size): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + print() + file_size = format_size(os.path.getsize(local_file)) + print(f"{local_file} ({file_size}) downloaded successfully.") +## --- end: download_file ------ + diff --git a/transforms/code/code_profiler/README.md b/transforms/code/code_profiler/README.md index 6eeed674f..691f6ff4b 100644 --- a/transforms/code/code_profiler/README.md +++ b/transforms/code/code_profiler/README.md @@ -1,6 +1,6 @@ -# Code Profiler Tranform +# Code Profiler Transform -This module extracts the base syntactic concepts from the multi-language source codes and represent these concepts in an unified langauge-agnostic representation that can be further used for multi-lnaguage data profiling. While programming languages expose similar syntactic building blocks to represent programming intent, such as importing packages/libraries, functions, classes, loops, conditionals, comments and others, these concepts are expressed through language-specific grammar, defined by distinct keywords and syntactic form. Our framework abstracts language-specific concepts by transforming them into a unified, language-agnostic representation called universal base syntactic representation (UBSR), referred to as a concept, which is consistently encoded within the proposed schema structure. The current version support the base syntactic concept for importing/including package/libraries, comments, functions. +This module extracts the base syntactic concepts from the multi-language source codes and represent these concepts in a unified langauge-agnostic representation that can be further used for multi-language data profiling. 
While programming languages expose similar syntactic building blocks to represent programming intent, such as importing packages/libraries, functions, classes, loops, conditionals, comments and others, these concepts are expressed through language-specific grammar, defined by distinct keywords and syntactic form. Our framework abstracts language-specific concepts by transforming them into a unified, language-agnostic representation called universal base syntactic representation (UBSR), referred to as a concept, which is consistently encoded within the proposed schema structure. The current version supports the base syntactic concept for importing/including package/libraries, comments, functions. Table 1 outlines the fields of the UBSR, which maps AST nodes to a structured schema. This schema captures syntactic nodes (based on AST node types) and the relationships between those nodes (derived from AST edges). The UBSR framework currently supports 21 languages, grouped according to their syntactic paradigms. @@ -48,7 +48,7 @@ implementation. **Offline Path for Syntactic Rule Generation** -The offline path is critical for expanding and refining the syntactic rule database, enabling the USR framework to adapt to new languages and syntactic constructs. This process leverages LLMs to generate syntactic rules for languages that are not yet included in the rule database. To achieve this, we utilize a Few-shot Chain of Thought prompting technique, guiding the LLM through a step-by-step rule generation process. By providing carefully curated training exemplars and detailed instructions, this method ensures the LLM can accurately generalize from these examples to produce effective syntactic rules for a wide range of languages. This structured approach enhances the flexibility of the UBSR framework, allowing it to seamlessly handle evolving language constructs. 
+The offline path is critical for expanding and refining the syntactic rule database, enabling the UBSR framework to adapt to new languages and syntactic constructs. This process leverages LLMs to generate syntactic rules for languages that are not yet included in the rule database. To achieve this, we utilize a Few-shot Chain of Thought prompting technique, guiding the LLM through a step-by-step rule generation process. By providing carefully curated training exemplars and detailed instructions, this method ensures the LLM can accurately generalize from these examples to produce effective syntactic rules for a wide range of languages. This structured approach enhances the flexibility of the UBSR framework, allowing it to seamlessly handle evolving language constructs. The implementation for UI-based offline customization tool is present [here](python/src/offline-customizations). To run the tool, use the following command. @@ -60,4 +60,4 @@ The high-level system design is as follows: For each new target language, the offline phase is utilized to create deterministic rules by harnessing the capabilities of LLMs and working with exemplar code samples from the target language. In this process, Workflow W1 facilitates the creation of rules around syntactic structures based on exemplar code samples, while Workflow W2 is used to establish semantic dimensions for profiling. Subsequently, we derive rules that connect syntactic constructs to the predefined semantic concepts. These rules are then stored in a rule database, ready to be employed during the online phase. -In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports. 
\ No newline at end of file +In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports. diff --git a/transforms/code/header_cleanser/python/src/header_cleanser_transform.py b/transforms/code/header_cleanser/python/src/header_cleanser_transform.py index 43ac0f2b6..00fa3c892 100644 --- a/transforms/code/header_cleanser/python/src/header_cleanser_transform.py +++ b/transforms/code/header_cleanser/python/src/header_cleanser_transform.py @@ -95,6 +95,7 @@ def check_empty_comment(code, ignore_lines): if max_index <= len(code_list): max_index = max_index + 2 + max_index = min(max_index, len(code_list)) for index in range(min_index, max_index): if all( diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 6c14abfd6..38a829fab 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -68,15 +68,11 @@ def compute_exec_params_func( "repo_lvl_store_ray_cpus": repo_lvl_store_ray_cpus, "repo_lvl_store_ray_nworkers": repo_lvl_store_ray_nworkers, "repo_lvl_sorting_algo": repo_lvl_sorting_algo, + "repo_lvl_stage_one_only": repo_lvl_stage_one_only, + "repo_lvl_sorting_enabled": repo_lvl_sorting_enabled, + "repo_lvl_output_by_langs": repo_lvl_output_by_langs, + "repo_lvl_combine_rows": repo_lvl_combine_rows, } - if repo_lvl_stage_one_only == True: - res["repo_lvl_stage_one_only"] = "" - if repo_lvl_sorting_enabled == True: - res["repo_lvl_sorting_enabled"] = "" - if repo_lvl_output_by_langs == True: - res["repo_lvl_output_by_langs"] = "" - if 
repo_lvl_combine_rows == True: - res["repo_lvl_combine_rows"] = "" return res diff --git a/transforms/code/repo_level_ordering/ray/README.md b/transforms/code/repo_level_ordering/ray/README.md index 42e68f13f..84367636e 100644 --- a/transforms/code/repo_level_ordering/ray/README.md +++ b/transforms/code/repo_level_ordering/ray/README.md @@ -7,14 +7,31 @@ testing and IDE set up. ## Summary +This transform does repository level packing of data and arranging them to prioritise semantic dependancies. This +was done to prepare long context data for [Scaling Granite Code Models to 128K Context](https://arxiv.org/pdf/2407.13739) +. Quoting the paper. + +>To create long-context data, we develop a new approach that packs files from the same +repository together, arranging them to prioritize semantic dependencies. We identify these +dependencies by analyzing file imports and create a directed acyclic graph, where each +file is a node and edges represent API imports between files. After breaking any cycles +in the graph, we perform a topological sort to establish an ordering of files based on their +semantic dependencies. We then organize the files in a repository by placing documentation +and build files first, followed by the ordered set of files with semantic dependencies, and +finally the remaining non-connected files. These non-connected files are arranged according +to their folder structure, using a depth-first search to traverse the repository. Finally, we +determine the dominant programming language of a repository based on file extensions +and presence of build files, to organise repo-ordered files by programming languages + + This transform can group the data by `repo_name` and apply additional transformations like( sorting or output_by_language or combining rows) on the grouped data. This transform requires the input data to have at least the following columns: -- repo name: Name of the repo, it is used for grouping in this transform. 
+- **repo name**: Name of the repo, it is used for grouping in this transform. -- title : Which is usually file path. +- **title** : Which is usually file path. -- language: Programming language of content +- **language**: Programming language of content The input data for this transform should be in parquet format. The input data is expected to have code data arranged in rows such that each row represents a file. The required columns in the input data shoud correspond to a) repository name b) file path @@ -151,10 +168,11 @@ python src/repo_level_order_transform_ray.py \ --run_locally True \ --data_s3_cred "$s3_kreds" \ --data_s3_config "$s3_conf" \ - --repo_lvl_store_type local \ - --repo_lvl_store_backend_dir '/tmp/mystore' \ + --repo_lvl_store_type ray \ --repo_lvl_combine_rows True\ --repo_lvl_sorting_enabled True\ + --repo_lvl_store_ray_cpus 0.2 \ + --repo_lvl_store_ray_nworkers 1 \ --repo_lvl_sorting_algo SORT_SEMANTIC \ --repo_lvl_output_by_langs True ``` diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py index c33637451..1e9a24993 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py @@ -15,6 +15,7 @@ import uuid from typing import Callable +import pandas as pd import pyarrow as pa from dpk_repo_level_order.internal.check_languages import ( get_dominant_language_repo_packing, @@ -32,26 +33,47 @@ SORT_SEMANTIC_NORMALISED = "SORT_SEMANTIC_NORMALISED" -def semantic_sort(df, logger, title_column_name, language_column_name): +def semantic_sort( + df: pd.DataFrame, logger: logging.Logger, title_column_name: str, language_column_name: str +) -> pd.DataFrame: return sort_sem( files_df=df, logger=logger, title_column_name=title_column_name, 
language_column_name=language_column_name ) -def semantic_sort_normalised(df, logger, title_column_name, language_column_name): +def semantic_sort_normalised( + df: pd.DataFrame, logger: logging.Logger, title_column_name: str, language_column_name: str +) -> pd.DataFrame: check_and_update_title(df) return sort_sem( files_df=df, logger=logger, title_column_name=title_column_name, language_column_name=language_column_name ) -def default_sort(df, logger, title_column_name, language_column_name): +def default_sort( + df: pd.DataFrame, logger: logging.Logger, title_column_name: str, language_column_name: str +) -> pd.DataFrame: return sort_by_path(df=df, logger=logger, title_column_name=title_column_name) def get_sorting_func( sorting_algo: str, title_column_name: str, logger: logging.Logger, language_column_name: str ) -> Callable[[pa.Table], pa.Table]: + """Get a sorting function based on the specified algorithm. + + Args: + sorting_algo (str): The sorting algorithm to use. + title_column_name (str): The name of the column containing file + titles. + logger (logging.Logger): A logger object for logging messages. + language_column_name (str): The name of the column containing file + languages. + + Returns: + Callable[[pa.Table, str], pa.Table]: A function that takes a PyArrow Table + and a file name as input and + returns a sorted PyArrow Table. + """ if sorting_algo == SORT_SEMANTIC: sort_by = semantic_sort logger.info("semantic sort enabled") @@ -86,7 +108,26 @@ def sorter(table: pa.Table, file_name: str) -> pa.Table: return sorter -def get_dominant_language_func(language_column_name, title_column_name): +def get_dominant_language_func(language_column_name: str, title_column_name: str) -> Callable[[pa.Table, str], str]: + """ + This function takes two column names as input and returns a function + that can be applied to a pyarrow table. 
+ The returned function determines the dominant programming language in + the pyarrow table and returns the filename with the detected language + prepended. + + Args: + language_column_name (str): Name of the column containing the + programming languages. + title_column_name (str): Name of the column containing the file + titles/paths. + + Returns: + Callable[[pa.Table, str], str]: A function that takes a table as + input and returns a new table with the filenames modified to include the + detected dominant language. + """ + def dominant_lang_per_repo(table: pa.Table, filename: str) -> str: """ This function takes a table whose rows are documents from a repo @@ -149,6 +190,28 @@ def lang_distribution(grouping_column): def get_transforming_func(sorting_func=None, superrows_func=None, filename_func=None, language_column_name="language"): + """ + This function takes three optional functions as input and returns a + function that can be applied to a pyarrow table and file name. + The returned function performs some transformation on the input table + and file name based on the provided functions. + + Args: + sorting_func (Callable[[pa.Table, str], pa.Table]): A function that sorts the + rows of a table based on a column. Defaults to None. + superrows_func (Callable[[pa.Table, str, str], pa.Table]): A + function that creates new rows in a table based on the values of other + columns. Defaults to None. + filename_func (Callable[[pa.Table, str], str]): A function that modifies the + file name. Defaults to None. + language_column_name (str): The name of the column containing the + programming languages. Defaults to "language". + + Returns: + callable: A function that takes a table and file name as input and + returns a list of transformed tables and file names. 
+ """ + def my_transform(table, file_name): out_table = table if sorting_func: diff --git a/transforms/code/repo_level_ordering/ray/src/repo_level_order_s3_ray.py b/transforms/code/repo_level_ordering/ray/src/repo_level_order_s3_ray.py index fb42b6b81..4d65abb76 100644 --- a/transforms/code/repo_level_ordering/ray/src/repo_level_order_s3_ray.py +++ b/transforms/code/repo_level_ordering/ray/src/repo_level_order_s3_ray.py @@ -49,15 +49,14 @@ } repo_level_params = { + "repo_lvl_sorting_enabled": True, "repo_lvl_sorting_algo": "SORT_SEMANTIC", "repo_lvl_store_type": "ray", + "repo_lvl_output_by_langs": True, + "repo_lvl_combine_rows": True, } -repo_level_flags = ["repo_lvl_output_by_langs", "repo_lvl_combine_rows", "repo_lvl_sorting_enabled"] - -d = ParamsUtils.dict_to_req(d=params | repo_level_params) -sys.argv = d + [f"--{flag}" for flag in repo_level_flags] -sys.argv = ParamsUtils.dict_to_req(d=params) +sys.argv = ParamsUtils.dict_to_req(d=params | repo_level_params) # for arg in sys.argv: # print(arg) diff --git a/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py b/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py index b3152c44b..a43feda87 100644 --- a/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py +++ b/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py @@ -18,7 +18,7 @@ import pyarrow as pa from data_processing.data_access import DataAccessFactoryBase from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing.utils import CLIArgumentProvider, get_logger, str2bool from data_processing_ray.runtime.ray import DefaultRayTransformRuntime, RayUtils from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, @@ -27,6 +27,7 @@ create_store, create_store_params, init_store_params, + store_type_value_ray, 
validate_store_params, ) from ray.actor import ActorHandle @@ -108,6 +109,7 @@ def __init__(self, config: dict[str, Any]): self.grouping_column = config.get(grouping_column_key, repo_column_default_value) store_params = config.get(store_params_key) validate_store_params(store_params) + self.store_type = store_params[store_type_key] self.store = create_store(store_params) self.group_batch_size = group_batch_size @@ -126,6 +128,16 @@ def _create_batches(self, data, batch_size=1): batches.append(batch) return batches + def _normalize_file_name_for_store(self, file_name): + if self.store_type == store_type_value_ray: + # we can store full file_name consiting of full path in this store. + return file_name + else: + # since this store type uses filesystem as backend + # can't store full path in store since, + # store is currently flat filesystem. + return os.path.basename(file_name) + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ This step is used to do groupby with respect to `self.grouping_column` and update @@ -145,11 +157,8 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab grp_flow = {} for group in batch: # This supports only flat folder structure, so all - # files should be in the same folder - # since store uses filesystem as backend - # can't store full path in store since, - # store is currently flat filesystem. - file_name = os.path.basename(file_name) + # files should be in the same folder. 
+ file_name = self._normalize_file_name_for_store(file_name) grp_flow[group] = file_name self.logger.debug(f"Updating {group} to store") @@ -286,10 +295,15 @@ def _prepare_mapper_function(self): def _prepare_inputs(self): store = create_store(self.store_params) - files_location = self.input_folder + store_type = self.store_params[store_type_key] + p_input = [] for repo, files in store.items_kv(): - p_input.append((repo, [f"{files_location}/{file}" for file in files])) + if store_type == store_type_value_ray: + p_input.append((repo, [f"{file}" for file in files])) + else: + files_location = self.input_folder + p_input.append((repo, [f"{files_location}/{file}" for file in files])) return p_input def _group_and_sort(self): @@ -361,8 +375,8 @@ def add_input_params(self, parser: ArgumentParser) -> None: # See below for remove_from_metadata addition so that it is not reported. parser.add_argument( f"--{cli_prefix}{stage_one_only_key}", - action="store_true", - help="If this flag is set, transform only builds the repo grouping and doesn't write output", + type=lambda x: bool(str2bool(x)), + help="If this flag is True, transform only builds the repo grouping and doesn't write output", ) parser.add_argument( f"--{cli_prefix}{grouping_column_key}", @@ -402,7 +416,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: parser.add_argument( f"--{cli_prefix}{sorting_enable_key}", default=sort_enable_default, - type=bool, + type=lambda x: bool(str2bool(x)), help=f"Enables sorting of output by algorithm specified using {cli_prefix}{sorting_algo_key}. 
Defaults to SORT_BY_PATH if no algorithm is specified.", ) parser.add_argument( @@ -413,13 +427,13 @@ def add_input_params(self, parser: ArgumentParser) -> None: ) parser.add_argument( f"--{cli_prefix}{output_by_langs_key}", - type=bool, + type=lambda x: bool(str2bool(x)), default=output_by_lang_default, help="If specified, output is grouped into programming language folders.", ) parser.add_argument( f"--{cli_prefix}{output_superrows_key}", - type=bool, + type=lambda x: bool(str2bool(x)), default=superrows_default, help="If specified, output rows per repo are combined to form a single repo", ) diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md index f962717d6..fbacf4ade 100644 --- a/transforms/language/doc_chunk/python/README.md +++ b/transforms/language/doc_chunk/python/README.md @@ -4,7 +4,7 @@ This transform is chunking documents. It supports multiple _chunker modules_ (se When using documents converted to JSON, the transform leverages the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker` to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc. -It relies on documents converted with the Docling library in the [pdf2parquet transform](../pdf2parquet) using the option `contents_type: "application/json"`, +It relies on documents converted with the Docling library in the [pdf2parquet transform](../../pdf2parquet/python/README.md) using the option `contents_type: "application/json"`, which provides the required JSON structure. When using documents converted to Markdown, the transform leverages the [Llama Index](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser) `MarkdownNodeParser`, which is relying on its internal Markdown splitting logic. 
diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index d532510ba..2db4bd1f1 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,3 +1,3 @@ data-prep-toolkit==0.2.2.dev1 -docling-core==1.3.0 +docling-core==1.7.2 llama-index-core>=0.11.0,<0.12.0 diff --git a/transforms/language/doc_chunk/python/test-data/expected/metadata.json b/transforms/language/doc_chunk/python/test-data/expected/metadata.json index f9658c2d8..7eeaaa279 100644 --- a/transforms/language/doc_chunk/python/test-data/expected/metadata.json +++ b/transforms/language/doc_chunk/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "doc_chunk", "job type": "pure python", "job id": "job_id", - "start_time": "2024-09-18 16:05:04", - "end_time": "2024-09-18 16:05:04", + "start_time": "2024-10-18 14:05:09", + "end_time": "2024-10-18 14:05:11", "status": "success" }, "code": { @@ -24,6 +24,8 @@ "output_jsonpath_column_name": "doc_jsonpath", "output_pageno_column_name": "page_number", "output_bbox_column_name": "bbox", + "chunk_size_tokens": 128, + "chunk_overlap_tokens": 30, "checkpointing": false, "max_files": -1, "random_samples": -1, @@ -32,12 +34,19 @@ ], "num_processors": 0 }, + "execution_stats": { + "cpus": 27.9, + "gpus": 0, + "memory": 25.75, + "object_store": 0, + "execution time, min": 0.021 + }, "job_output_stats": { "source_files": 1, "source_size": 50276, "result_files": 1, - "result_size": 31246, - "processing_time": 0.071, + "result_size": 31223, + "processing_time": 1.266, "nfiles": 1, "nrows": 88, "source_doc_count": 1, diff --git a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet b/transforms/language/doc_chunk/python/test-data/expected/test1.parquet index 607bbd213..06089be78 100644 Binary files a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet and 
b/transforms/language/doc_chunk/python/test-data/expected/test1.parquet differ diff --git a/transforms/language/doc_chunk/ray/test-data/expected/metadata.json b/transforms/language/doc_chunk/ray/test-data/expected/metadata.json index f9658c2d8..7eeaaa279 100644 --- a/transforms/language/doc_chunk/ray/test-data/expected/metadata.json +++ b/transforms/language/doc_chunk/ray/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "doc_chunk", "job type": "pure python", "job id": "job_id", - "start_time": "2024-09-18 16:05:04", - "end_time": "2024-09-18 16:05:04", + "start_time": "2024-10-18 14:05:09", + "end_time": "2024-10-18 14:05:11", "status": "success" }, "code": { @@ -24,6 +24,8 @@ "output_jsonpath_column_name": "doc_jsonpath", "output_pageno_column_name": "page_number", "output_bbox_column_name": "bbox", + "chunk_size_tokens": 128, + "chunk_overlap_tokens": 30, "checkpointing": false, "max_files": -1, "random_samples": -1, @@ -32,12 +34,19 @@ ], "num_processors": 0 }, + "execution_stats": { + "cpus": 27.9, + "gpus": 0, + "memory": 25.75, + "object_store": 0, + "execution time, min": 0.021 + }, "job_output_stats": { "source_files": 1, "source_size": 50276, "result_files": 1, - "result_size": 31246, - "processing_time": 0.071, + "result_size": 31223, + "processing_time": 1.266, "nfiles": 1, "nrows": 88, "source_doc_count": 1, diff --git a/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet b/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet index 607bbd213..06089be78 100644 Binary files a/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet and b/transforms/language/doc_chunk/ray/test-data/expected/test1.parquet differ diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index d959b9e38..d90658fc7 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,6 +1,6 
@@ data-prep-toolkit==0.2.2.dev1 -docling-core==1.3.0 -docling-ibm-models==1.1.7 -deepsearch-glm==0.21.0 -docling==1.11.0 +docling-core==1.7.2 +docling-ibm-models==2.0.0 +deepsearch-glm==0.22.0 +docling==1.20.0 filetype >=1.2.0, <2.0.0 diff --git a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet index 7757d57bb..9975c3608 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json index 53b96d075..704a86d8e 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-08-22 16:01:48", - "end_time": "2024-08-22 16:02:07", + "start_time": "2024-10-18 06:02:44", + "end_time": "2024-10-18 06:03:04", "status": "success" }, "code": { @@ -29,12 +29,19 @@ ], "num_processors": 0 }, + "execution_stats": { + "cpus": 29.2, + "gpus": 0, + "memory": 29.7, + "object_store": 0, + "execution time, min": 0.329 + }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 31976, - "processing_time": 12.429, + "result_size": 32086, + "processing_time": 5.981, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet index d109baaba..f70a89278 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet and 
b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet index aa1d5f30e..033452371 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json index 5062a297e..a38a938a3 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-09-10 21:28:38", - "end_time": "2024-09-10 21:28:47", + "start_time": "2024-10-18 06:09:35", + "end_time": "2024-10-18 06:09:44", "status": "success" }, "code": { @@ -29,12 +29,19 @@ ], "num_processors": 0 }, + "execution_stats": { + "cpus": 25.3, + "gpus": 0, + "memory": 29.52, + "object_store": 0, + "execution time, min": 0.138 + }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 33316, - "processing_time": 6.048, + "result_size": 33227, + "processing_time": 5.64, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet index 13b798f06..5032919c5 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet differ diff --git 
a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet index b67a3f5c2..58bcfcf6f 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json index a20db3e30..c276aa899 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-08-22 16:04:27", - "end_time": "2024-08-22 16:04:42", + "start_time": "2024-10-18 06:09:08", + "end_time": "2024-10-18 06:09:12", "status": "success" }, "code": { @@ -29,12 +29,19 @@ ], "num_processors": 0 }, + "execution_stats": { + "cpus": 25.5, + "gpus": 0, + "memory": 27.42, + "object_store": 0, + "execution time, min": 0.066 + }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 28828, - "processing_time": 10.41, + "result_size": 27574, + "processing_time": 3.448, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet index c6ea75b21..52b40288b 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet and 
b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 1577d024f..af70c0354 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,7 +1,7 @@ dpk-pdf2parquet-transform-python==0.2.2.dev1 data-prep-toolkit-ray==0.2.2.dev1 -docling-core==1.3.0 -docling-ibm-models==1.1.7 -deepsearch-glm==0.21.0 -docling==1.11.0 +docling-core==1.7.2 +docling-ibm-models==2.0.0 +deepsearch-glm==0.22.0 +docling==1.20.0 filetype >=1.2.0, <2.0.0 diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet index 7757d57bb..9975c3608 100644 Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json index 53b96d075..704a86d8e 100644 --- a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-08-22 16:01:48", - "end_time": "2024-08-22 16:02:07", + "start_time": "2024-10-18 06:02:44", + "end_time": "2024-10-18 06:03:04", "status": "success" }, "code": { @@ -29,12 +29,19 @@ ], "num_processors": 0 }, + "execution_stats": { + "cpus": 29.2, + "gpus": 0, + "memory": 29.7, + "object_store": 0, + "execution time, min": 0.329 + }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 31976, - "processing_time": 12.429, + "result_size": 32086, + 
"processing_time": 5.981, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet index 8db50118f..f70a89278 100644 Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet differ