diff --git a/pkg/component/operator/text/v0/README.mdx b/pkg/component/operator/text/v0/README.mdx index e2cb9077..1c0de924 100644 --- a/pkg/component/operator/text/v0/README.mdx +++ b/pkg/component/operator/text/v0/README.mdx @@ -8,6 +8,7 @@ description: "Learn about how to set up a VDP Text component https://github.com/ The Text component is an operator component that allows users to extract and manipulate text from different sources. It can carry out the following tasks: - [Chunk Text](#chunk-text) +- [Clean Data](#clean-data) ## Release Stage @@ -130,3 +131,57 @@ This text splitter is specially designed for Markdown format. | Token Count | `token-count` | integer | Count of tokens in a chunk | + +### Clean Data + +Clean data by removing unwanted texts from the input array using the given patterns. + +
+ +| Input | ID | Type | Description | +| :--- | :--- | :--- | :--- | +| Task ID (required) | `task` | string | `TASK_CLEAN_DATA` | +| Texts (required) | `texts` | array[string] | Array of text to be cleaned. | +| [Setting](#clean-data-setting) (required) | `setting` | object | The rules to clean the text. | +
+ + + + +
+The setting Object + +

Setting

+ +`setting` must fulfill one of the following schemas: + +
Regex
+ +
+ +| Field | Field ID | Type | Note | +| :--- | :--- | :--- | :--- | +| Clean Method | `clean-method` | string | Must be `"Regex"` | +| Exclude Patterns | `exclude-patterns` | array | When the text is matched, it will be removed from the array of text. | +| Include Patterns | `include-patterns` | array | When the text is matched, it will be kept in the array of text. The exclude-patterns are applied first. | +
+ +
Substring
+ +
+ +| Field | Field ID | Type | Note | +| :--- | :--- | :--- | :--- | +| Case Sensitive | `case-sensitive` | boolean | A flag indicating whether the substring matching is case-sensitive. When it is true, the matching is case-sensitive. When it is false, the matching is case-insensitive. The default value is false. For example, when the matching is case-sensitive, cat would only match 'cat' but not 'Cat' or 'CAT'. When the matching is case-insensitive, on the other hand, cat would match 'cat', 'Cat', 'CAT', or any other variation of uppercase and lowercase letters. | +| Clean Method | `clean-method` | string | Must be `"Substring"` | +| Exclude Substring | `exclude-substrings` | array | When the text contains the substrings, it will be removed from the array of text. | +| Include Substring | `include-substrings` | array | When the text contains the substrings, it will be kept in the array of text. The exclude-substrings are applied first. | +
+
+ +
+ +| Output | ID | Type | Description | +| :--- | :--- | :--- | :--- | +| Cleaned Texts | `texts` | array[string] | Array of cleaned text. | +
diff --git a/pkg/component/operator/text/v0/config/definition.json b/pkg/component/operator/text/v0/config/definition.json index feb106d3..4af10cf4 100644 --- a/pkg/component/operator/text/v0/config/definition.json +++ b/pkg/component/operator/text/v0/config/definition.json @@ -1,6 +1,7 @@ { "availableTasks": [ - "TASK_CHUNK_TEXT" + "TASK_CHUNK_TEXT", + "TASK_CLEAN_DATA" ], "custom": false, "documentationUrl": "https://www.instill.tech/docs/component/operator/text", diff --git a/pkg/component/operator/text/v0/config/tasks.json b/pkg/component/operator/text/v0/config/tasks.json index d42c0fe3..190c2be2 100644 --- a/pkg/component/operator/text/v0/config/tasks.json +++ b/pkg/component/operator/text/v0/config/tasks.json @@ -279,17 +279,17 @@ "$ref": "#/$defs/model-name" }, "code-blocks": { - "description": "A flag indicating whether code blocks should be treated as a single unit", + "description": "A flag indicating whether code blocks should be treated as a single unit during chunking.", "instillAcceptFormats": [ "boolean" ], - "instillUIOrder": 3, + "instillUIOrder": 4, "instillUpstreamTypes": [ "value", "reference", "template" ], - "title": "Code Blocks", + "title": "Treat Code Blocks as Single Unit", "type": "boolean" } }, @@ -305,108 +305,140 @@ ], "title": "Markdown", "type": "object", - "description": "This text splitter is specially designed for Markdown format." + "description": "This is a more specialized splitter for markdown documents. It tries to respect the structure of markdown to create semantically meaningful chunks." 
+ }, + { + "properties": { + "chunk-method": { + "const": "Regex", + "type": "string", + "title": "Chunk Method", + "description": "Chunking based on regular expressions.", + "instillUIOrder": 0 + }, + "chunk-size": { + "$ref": "#/$defs/chunk-size" + }, + "chunk-overlap": { + "$ref": "#/$defs/chunk-overlap" + }, + "model-name": { + "$ref": "#/$defs/model-name" + }, + "pattern": { + "description": "Regular expression pattern used for splitting.", + "instillAcceptFormats": [ + "string" + ], + "instillUIOrder": 3, + "instillUpstreamTypes": [ + "value", + "reference", + "template" + ], + "title": "Regex Pattern", + "type": "string" + } + }, + "required": [ + "chunk-method", + "pattern" + ], + "instillEditOnNodeFields": [ + "chunk-method", + "chunk-size", + "chunk-overlap", + "model-name", + "pattern" + ], + "title": "Regex", + "type": "object", + "description": "This method allows for custom chunking based on regular expression patterns." + }, + { + "properties": { + "chunk-method": { + "const": "Substring", + "type": "string", + "title": "Chunk Method", + "description": "Chunking based on substring positions.", + "instillUIOrder": 0 + }, + "chunk-size": { + "$ref": "#/$defs/chunk-size" + }, + "chunk-overlap": { + "$ref": "#/$defs/chunk-overlap" + }, + "model-name": { + "$ref": "#/$defs/model-name" + }, + "start-index": { + "description": "The starting index for the substring.", + "instillAcceptFormats": [ + "integer" + ], + "instillUIOrder": 3, + "instillUpstreamTypes": [ + "value", + "reference", + "template" + ], + "title": "Start Index", + "type": "integer" + }, + "end-index": { + "description": "The ending index for the substring.", + "instillAcceptFormats": [ + "integer" + ], + "instillUIOrder": 4, + "instillUpstreamTypes": [ + "value", + "reference", + "template" + ], + "title": "End Index", + "type": "integer" + } + }, + "required": [ + "chunk-method", + "start-index", + "end-index" + ], + "instillEditOnNodeFields": [ + "chunk-method", + "chunk-size", + 
"chunk-overlap", + "model-name", + "start-index", + "end-index" + ], + "title": "Substring", + "type": "object", + "description": "This method allows chunking based on specified start and end indices." } ] } }, - "title": "Strategy", "required": [ - "setting" + "text", + "strategy" ], "type": "object" } }, - "required": [ - "text", - "strategy" - ], - "title": "Input", - "type": "object" - }, - "output": { - "description": "Output", - "instillUIOrder": 0, - "properties": { - "chunk-num": { - "description": "Total number of output text chunks", - "instillUIOrder": 2, - "instillFormat": "integer", - "title": "Number of Text Chunks", - "type": "integer" - }, - "text-chunks": { - "description": "Text chunks after splitting", - "instillUIOrder": 1, - "items": { - "title": "Text Chunk", - "description": "Text chunk after splitting", - "properties": { - "text": { - "title": "Text", - "description": "Text chunk after splitting", - "instillFormat": "string", - "instillUIMultiline": true, - "instillUIOrder": 0, - "type": "string" - }, - "start-position": { - "title": "Start Position", - "description": "The starting position of the chunk in the original text", - "instillFormat": "integer", - "instillUIOrder": 1, - "type": "integer" - }, - "end-position": { - "title": "End Position", - "description": "The ending position of the chunk in the original text", - "instillFormat": "integer", - "instillUIOrder": 2, - "type": "integer" - }, - "token-count": { - "title": "Token Count", - "description": "Count of tokens in a chunk", - "instillFormat": "integer", - "instillUIOrder": 3, - "type": "integer" - } - }, - "required": [ - "text", - "start-position", - "end-position", - "token-count" - ], - "instillUIMultiline": true, - "type": "object" - }, - "title": "Text Chunks", - "type": "array" - }, - "token-count": { - "description": "Total count of tokens in the original input text", - "instillUIOrder": 0, - "instillFormat": "integer", - "title": "Token Count", - "type": "integer" + 
"output": { + "description": "Output", + "instillUIOrder": 0, + "type": "array", + "items": { + "type": "string" }, - "chunks-token-count": { - "description": "Total count of tokens in the output text chunks", - "instillUIOrder": 3, - "instillFormat": "integer", - "title": "Token Count Chunks", - "type": "integer" - } - }, - "required": [ - "text-chunks", - "chunk-num", - "token-count", - "chunks-token-count" - ], - "title": "Output", - "type": "object" + "title": "Output" + } } } } +