diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py index e0fdfa871..0c830ee98 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py @@ -38,6 +38,7 @@ "runtime_job_id": "job_id", "runtime_code_location": ParamsUtils.convert_to_ast(code_location), # doc_chunk params + # "doc_chunk_dl_min_chunk_len": 10, # for testing the usage of the deprecated argument # "doc_chunk_chunking_type": "li_markdown", "doc_chunk_chunking_type": "dl_json", # "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT, diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py index da5540cba..e64a7c1d1 100644 --- a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py +++ b/transforms/language/doc_chunk/python/src/doc_chunk_transform.py @@ -234,6 +234,11 @@ def add_input_params(self, parser: ArgumentParser) -> None: type=int, help="Number of tokens overlapping between chunks for the fixed-sized chunker.", ) + parser.add_argument( + f"--{cli_prefix}dl_min_chunk_len", + default=None, + help="Deprecated. This option is no longer considered.", + ) def apply_input_params(self, args: Namespace) -> bool: """ @@ -244,5 +249,7 @@ def apply_input_params(self, args: Namespace) -> bool: captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) self.params = self.params | captured + if self.params.get("dl_min_chunk_len") is not None: + self.logger.warning("The `dl_min_chunk_len` option is deprecated and will be ignored. Please stop using it, it will not be accepted anymore in future versions.") self.logger.info(f"doc_chunk parameters are : {self.params}") return True