Commit
Merge branch 'IBM:dev' into dev-pankaj
pankajskku authored Oct 7, 2024
2 parents bd4d196 + d04454e commit 60f1fcb
Showing 8 changed files with 190 additions and 7 deletions.
4 changes: 3 additions & 1 deletion transforms/language/doc_chunk/python/README.md
@@ -29,10 +29,12 @@ The transform can be tuned with the following parameters.

| Parameter | Default | Description |
|------------|----------|--------------|
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for the [Docling JSON chunking](https://github.com/DS4SD/docling), and `li_token_text` for the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-size windows of tokens. |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the `dl_json` chunker. Setting it to `None` uses the library default, i.e. `min_chunk_len=64`. |
| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. |
| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
| `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
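As a hedged illustration (the launcher script name is an assumption, and other required launcher arguments are omitted; the `doc_chunk_*` flag names come from the CLI additions in `doc_chunk_transform.py` below), the new chunker could be selected on the command line like so:

python src/doc_chunk_transform_python.py \
    --doc_chunk_chunking_type li_token_text \
    --doc_chunk_chunk_size_tokens 128 \
    --doc_chunk_chunk_overlap_tokens 30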
73 changes: 72 additions & 1 deletion transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
@@ -11,9 +11,10 @@
################################################################################

from abc import ABCMeta, abstractmethod
from typing import Iterator, Optional
from typing import Iterator, Optional, Dict, List

from docling_core.types import Document as DLDocument
from llama_index.core.node_parser.text.token import TokenTextSplitter
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser import MarkdownNodeParser
from docling_core.transforms.chunker import HierarchicalChunker
@@ -66,3 +67,73 @@ def chunk(self, content: str) -> Iterator[dict]:
yield {
self.output_chunk_column_name: node.text,
}


class LITokenTextSplitter(ChunkingExecutor):
"""
A text chunker that leverages LlamaIndex's token-based text splitter. This splitter breaks input text into
fixed-size chunks, with each chunk measured in tokens rather than characters.
The chunking process ensures that each chunk contains a specific number of tokens, and an optional overlap between
chunks (also measured in tokens) can be specified to preserve context between the chunks.
Args:
output_chunk_column_name (str): Name of the output column containing the text of each chunk.
output_chunk_column_id (str): Name of the output column containing the ID of each chunk.
chunk_size_tokens (int): Length of each chunk in number of tokens.
chunk_overlap_tokens (int): Number of tokens overlapping between consecutive chunks.
Attributes:
output_chunk_column_name (str)
output_chunk_column_id (str)
chunk_size_tokens (int)
chunk_overlap_tokens (int)
"""

def __init__(
self,
output_chunk_column_name: str,
output_chunk_column_id: str,
chunk_size_tokens: int,
chunk_overlap_tokens: int
):
self.output_chunk_column_name = output_chunk_column_name
self.output_chunk_column_id = output_chunk_column_id
self.chunk_size = chunk_size_tokens
self.chunk_overlap = chunk_overlap_tokens


def _chunk_text(self, text: str) -> List[str]:
"""
Internal method to chunk text using TokenTextSplitter.
Args:
text (str): Input text to be chunked.
Returns:
List[str]: List of chunked text.
"""
text_splitter = TokenTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap
)
return text_splitter.split_text(text)


def chunk(self, text: str) -> Iterator[Dict]:
"""
Chunks input text into fixed-window lengths with token overlap.
Args:
text (str): Input text to be chunked.
Yields:
Dict: Chunked text with ID.
"""
for chunk_id, chunk in enumerate(self._chunk_text(text)):
    yield {
        self.output_chunk_column_id: chunk_id,
        self.output_chunk_column_name: chunk,
    }
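For context, a minimal usage sketch of the new chunker follows (assuming `llama-index-core` is installed; the sample text and column names are illustrative):

# Sketch: exercise LITokenTextSplitter directly; column names are illustrative.
from doc_chunk_chunkers import LITokenTextSplitter

splitter = LITokenTextSplitter(
    output_chunk_column_name="chunk_text",
    output_chunk_column_id="chunk_id",
    chunk_size_tokens=128,
    chunk_overlap_tokens=30,
)
for row in splitter.chunk("some sufficiently long input text ..."):
    print(row["chunk_id"], row["chunk_text"])

Each yielded dict carries one chunk and its sequential ID under the configured column names; note that the underlying TokenTextSplitter rejects an overlap larger than the chunk size.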
@@ -17,11 +17,12 @@
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from doc_chunk_transform_python import DocChunkPythonTransformConfiguration

from doc_chunk_transform import chunking_types

# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_md"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_token_text"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
local_conf = {
"input_folder": input_folder,
@@ -39,6 +40,11 @@
# doc_chunk params
# "doc_chunk_chunking_type": "li_markdown",
"doc_chunk_chunking_type": "dl_json",
# "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT,
# fixed-size params
# "doc_chunk_output_chunk_column_name": "chunk_text",
# "doc_chunk_chunk_size_tokens": 128,
# "doc_chunk_chunk_overlap_tokens": 30
}
if __name__ == "__main__":
# Set the simulated command line args
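For reference, the fully enabled token-text configuration of this sample's params dict would read (a sketch; keys exactly as in the commented lines above):

    "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT,
    "doc_chunk_output_chunk_column_name": "chunk_text",
    "doc_chunk_chunk_size_tokens": 128,
    "doc_chunk_chunk_overlap_tokens": 30,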
37 changes: 34 additions & 3 deletions transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -18,7 +18,7 @@
import pyarrow as pa
from data_processing.transform import AbstractTableTransform, TransformConfiguration
from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, LITokenTextSplitter


short_name = "doc_chunk"
@@ -27,7 +27,10 @@
doc_id_column_name_key = "doc_id_column_name"
chunking_type_key = "chunking_type"
dl_min_chunk_len_key = "dl_min_chunk_len"
chunk_size_tokens_key = "chunk_size_tokens"
chunk_overlap_tokens_key = "chunk_overlap_tokens"
output_chunk_column_name_key = "output_chunk_column_name"
output_chunk_column_id_key = "output_chunk_column_id"
output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
output_jsonpath_column_name_key = "output_jsonpath_column_name"
output_pageno_column_name_key = "output_pageno_column_name"
@@ -41,11 +44,13 @@
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"

chunk_size_tokens_cli_param = f"{cli_prefix}{chunk_size_tokens_key}"
chunk_overlap_tokens_cli_param = f"{cli_prefix}{chunk_overlap_tokens_key}"

class chunking_types(str, enum.Enum):
LI_MARKDOWN = "li_markdown"
DL_JSON = "dl_json"
LI_TOKEN_TEXT = "li_token_text"

def __str__(self):
return str(self.value)
@@ -56,11 +61,13 @@ def __str__(self):
default_chunking_type = chunking_types.DL_JSON
default_dl_min_chunk_len = None
default_output_chunk_column_name = "contents"
default_output_chunk_column_id = "chunk_id"
default_output_source_doc_id_column_name = "source_document_id"
default_output_jsonpath_column_name = "doc_jsonpath"
default_output_pageno_column_name = "page_number"
default_output_bbox_column_name = "bbox"

default_chunk_size_tokens = 128
default_chunk_overlap_tokens = 30

class DocChunkTransform(AbstractTableTransform):
"""
@@ -84,6 +91,7 @@ def __init__(self, config: dict[str, Any]):
self.content_column_name = config.get(content_column_name_key, default_content_column_name)
self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
self.output_chunk_column_id = config.get(output_chunk_column_id_key, default_output_chunk_column_id)
self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

# Parameters for Docling JSON chunking
@@ -96,6 +104,10 @@
)
self.output_bbox_column_name_key = config.get(output_bbox_column_name_key, default_output_bbox_column_name)

# Parameters for Fixed-size with overlap chunking
self.chunk_size_tokens = config.get(chunk_size_tokens_key, default_chunk_size_tokens)
self.chunk_overlap_tokens = config.get(chunk_overlap_tokens_key, default_chunk_overlap_tokens)

# Initialize chunker

self.chunker: ChunkingExecutor
@@ -111,6 +123,13 @@
self.chunker = LIMarkdown(
output_chunk_column_name=self.output_chunk_column_name,
)
elif self.chunking_type == chunking_types.LI_TOKEN_TEXT:
self.chunker = LITokenTextSplitter(
output_chunk_column_name=self.output_chunk_column_name,
output_chunk_column_id=self.output_chunk_column_id,
chunk_size_tokens=self.chunk_size_tokens,
chunk_overlap_tokens=self.chunk_overlap_tokens
)
else:
raise RuntimeError(f"{self.chunking_type=} is not valid.")

@@ -213,6 +232,18 @@ def add_input_params(self, parser: ArgumentParser) -> None:
default=default_output_bbox_column_name,
help="Column name to store the bbox of the chunk",
)
parser.add_argument(
f"--{chunk_size_tokens_cli_param}",
default=default_chunk_size_tokens,
type=int,
help="Size of the chunk in tokens for the fixed-sized chunker",
)
parser.add_argument(
f"--{chunk_overlap_tokens_cli_param}",
default=default_chunk_overlap_tokens,
type=int,
help="Number of tokens overlapping between chunks for the fixed-sized chunker.",
)

def apply_input_params(self, args: Namespace) -> bool:
"""
@@ -0,0 +1,56 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-04 14:00:40",
"end_time": "2024-10-04 14:00:41",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "li_token_text",
"content_column_name": "contents",
"doc_id_column_name": "document_id",
"dl_min_chunk_len": null,
"output_chunk_column_name": "chunk_text",
"output_source_doc_id_column_name": "source_document_id",
"output_jsonpath_column_name": "doc_jsonpath",
"output_pageno_column_name": "page_number",
"output_bbox_column_name": "bbox",
"chunk_size_tokens": 128,
"chunk_overlap_tokens": 30,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".parquet"
],
"num_processors": 0
},
"job_output_stats": {
"source_files": 1,
"source_size": 17749,
"result_files": 1,
"result_size": 8827,
"processing_time": 0.194,
"nfiles": 1,
"nrows": 10,
"source_doc_count": 2,
"result_doc_count": 10
},
"source": {
"name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/test-data/input_token_text",
"type": "path"
},
"target": {
"name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
Binary file not shown.
Binary file not shown.
@@ -16,7 +16,11 @@
from data_processing.test_support.launch.transform_test import (
AbstractTransformLauncherTest,
)
from doc_chunk_transform import chunking_type_cli_param, chunking_types
from doc_chunk_transform import (
chunking_type_cli_param,
output_chunk_column_name_cli_param,
chunking_types
)
from doc_chunk_transform_python import DocChunkPythonTransformConfiguration


@@ -55,4 +59,17 @@ def get_test_transform_fixtures(self) -> list[tuple]:
basedir + "/expected_md",
)
)

# Run with fixed size token chunker
fixtures.append(
(
launcher,
{
chunking_type_cli_param: chunking_types.LI_TOKEN_TEXT,
output_chunk_column_name_cli_param: "chunk_text"
},
basedir + "/input_token_text",
basedir + "/expected_token_text",
)
)
return fixtures
