diff --git a/cli/magic-create.ts b/cli/magic-create.ts index 435ea3642..09d1b9bcc 100644 --- a/cli/magic-create.ts +++ b/cli/magic-create.ts @@ -13,7 +13,8 @@ import { import { LIB_VERSION } from "./version.js"; import * as fs from "fs"; -const versionRegExp = /\d+.\d+.\d+/; +const iamRoleRegExp = RegExp(/arn:aws:iam::\d+:role\/[\w-_]+/); +const kendraIdRegExp = RegExp(/^\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$/); const embeddingModels = [ { @@ -146,7 +147,7 @@ async function processCreateOptions(options: any): Promise<void> { message: "Cross account role arn to invoke Bedrock - leave empty if Bedrock is in same account", validate: (v: string) => { - const valid = RegExp(/arn:aws:iam::\d+:role\/[\w-_]+/).test(v); + const valid = iamRoleRegExp.test(v); return v.length === 0 || valid; }, initial: options.bedrockRoleArn || "", @@ -174,7 +175,7 @@ async function processCreateOptions(options: any): Promise<void> { { message: "OpenSearch", name: "opensearch" }, { message: "Kendra (managed)", name: "kendra" }, ], - skip: function (): boolean { + skip(): boolean { // workaround for https://github.com/enquirer/enquirer/issues/298 (this as any).state._choices = (this as any).state.choices; return !(this as any).state.answers.enableRag; @@ -199,20 +200,32 @@ async function processCreateOptions(options: any): Promise<void> { const answers: any = await enquirer.prompt(questions); const kendraExternal = []; let newKendra = answers.enableRag && answers.kendra; - + const existingKendraIndices = Array.from(options.kendraExternal); while (newKendra === true) { + let existingIndex: any = existingKendraIndices.pop(); + console.log( + existingIndex?.region, + Object.values(SupportedRegion).indexOf(existingIndex?.region) + ); const kendraQ = [ { type: "input", name: "name", message: "Kendra source name", + validate(v: string) { + return RegExp(/^\w[\w-_]*\w$/).test(v); + }, + initial: existingIndex?.name, }, { type: "autocomplete", limit: 8, name: "region", choices: Object.values(SupportedRegion), - message: "Region of the Kendra index", + message: `Region of the Kendra index${ + existingIndex?.region ? " (" + existingIndex?.region + ")" : "" + }`, + initial: Object.values(SupportedRegion).indexOf(existingIndex?.region), }, { type: "input", @@ -220,28 +233,36 @@ async function processCreateOptions(options: any): Promise<void> { message: "Cross account role Arn to assume to call Kendra, leave empty if not needed", validate: (v: string) => { - const valid = RegExp(/arn:aws:iam::\d+:role\/[\w-_]+/).test(v); + const valid = iamRoleRegExp.test(v); return v.length === 0 || valid; }, - initial: "", + initial: existingIndex?.roleArn ?? "", }, { type: "input", name: "kendraId", message: "Kendra ID", validate(v: string) { - return RegExp(/\w{8}-\w{4}-\w{4}-\w{4}-\w{12}/).test(v); + return kendraIdRegExp.test(v); }, + initial: existingIndex?.kendraId, + }, + { + type: "confirm", + name: "enabled", + message: "Enable this index", + initial: existingIndex?.enabled ??
true, }, { type: "confirm", name: "newKendra", message: "Do you want to add another Kendra source", - default: false, + initial: false, }, ]; const kendraInstance: any = await enquirer.prompt(kendraQ); - const ext = (({ name, roleArn, kendraId, region }) => ({ + const ext = (({ enabled, name, roleArn, kendraId, region }) => ({ + enabled, name, roleArn, kendraId, @@ -249,7 +270,6 @@ async function processCreateOptions(options: any): Promise<void> { }))(kendraInstance); if (ext.roleArn === "") ext.roleArn = undefined; kendraExternal.push({ - enabled: true, ...ext, }); newKendra = kendraInstance.newKendra; @@ -261,6 +281,9 @@ async function processCreateOptions(options: any): Promise<void> { message: "Which is the default embedding model", choices: embeddingModels.map((m) => ({ name: m.name, value: m })), initial: options.defaultEmbedding || undefined, + skip(): boolean { + return !(this as any).state.answers.enableRag; + }, }, ]; const models: any = await enquirer.prompt(modelsPrompts); @@ -296,27 +319,33 @@ async function processCreateOptions(options: any): Promise<void> { }, }, embeddingsModels: [{}], - crossEncoderModels: [ - { - provider: "sagemaker", - name: "cross-encoder/ms-marco-MiniLM-L-12-v2", - default: true, - }, - ], + crossEncoderModels: [{}], }, }; - config.rag.engines.kendra.createIndex = - answers.ragsToEnable.includes("kendra"); - config.rag.engines.kendra.enabled = - config.rag.engines.kendra.createIndex || kendraExternal.length > 0; - config.rag.engines.kendra.external = [...kendraExternal]; + // If RAG has not been enabled, the default embedding is set to the first model + if (!answers.enableRag) { + models.defaultEmbedding = embeddingModels[0].name; + } + + config.rag.crossEncoderModels[0] = { + provider: "sagemaker", + name: "cross-encoder/ms-marco-MiniLM-L-12-v2", + default: true, + }; config.rag.embeddingsModels = embeddingModels; config.rag.embeddingsModels.forEach((m: any) => { if (m.name === models.defaultEmbedding) { m.default = true; } }); + + config.rag.engines.kendra.createIndex = + answers.ragsToEnable.includes("kendra"); + config.rag.engines.kendra.enabled = + config.rag.engines.kendra.createIndex || kendraExternal.length > 0; + config.rag.engines.kendra.external = [...kendraExternal]; + console.log("\n✨ This is the chosen configuration:\n"); console.log(JSON.stringify(config, undefined, 2)); ( diff --git a/lib/chatbot-api/functions/api-handler/routes/documents.py b/lib/chatbot-api/functions/api-handler/routes/documents.py index 06fe3f0a7..63b4ee3f5 100644 --- a/lib/chatbot-api/functions/api-handler/routes/documents.py +++ b/lib/chatbot-api/functions/api-handler/routes/documents.py @@ -67,8 +67,7 @@ def file_upload(workspace_id: str): if extension not in allowed_extensions: raise genai_core.types.CommonError("Invalid file extension") - result = genai_core.upload.generate_presigned_post( - workspace_id, request.fileName) + result = genai_core.upload.generate_presigned_post(workspace_id, request.fileName) return {"ok": True, "data": result} @@ -147,7 +146,7 @@ def add_document(workspace_id: str, document_type: str): crawler_properties={ "follow_links": request.followLinks, "limit": request.limit, - } + }, ) return { diff --git a/lib/chatbot-api/functions/api-handler/routes/workspaces.py b/lib/chatbot-api/functions/api-handler/routes/workspaces.py index 84a6c26b1..5606de543 100644 --- a/lib/chatbot-api/functions/api-handler/routes/workspaces.py +++ b/lib/chatbot-api/functions/api-handler/routes/workspaces.py @@ -77,6 +77,14 @@ def workspace(workspace_id: str): return {"ok": True,
"data": ret_value} +@router.delete("/workspaces/") +@tracer.capture_method +def workspace(workspace_id: str): + genai_core.workspaces.delete_workspace(workspace_id) + + return {"ok": True} + + @router.put("/workspaces") @tracer.capture_method def create_workspace(): diff --git a/lib/chatbot-api/rest-api.ts b/lib/chatbot-api/rest-api.ts index 23f587b8d..6e289e4cd 100644 --- a/lib/chatbot-api/rest-api.ts +++ b/lib/chatbot-api/rest-api.ts @@ -77,6 +77,8 @@ export class RestApi extends Construct { props.ragEngines?.documentsByCompountKeyIndexName ?? "", SAGEMAKER_RAG_MODELS_ENDPOINT: props.ragEngines?.sageMakerRagModelsEndpoint?.attrEndpointName ?? "", + DELETE_WORKSPACE_WORKFLOW_ARN: + props.ragEngines?.deleteWorkspaceWorkflow?.stateMachineArn ?? "", CREATE_AURORA_WORKSPACE_WORKFLOW_ARN: props.ragEngines?.auroraPgVector?.createAuroraWorkspaceWorkflow ?.stateMachineArn ?? "", @@ -206,6 +208,10 @@ export class RestApi extends Construct { props.ragEngines.websiteCrawlingWorkflow.grantStartExecution(apiHandler); } + if (props.ragEngines?.deleteWorkspaceWorkflow) { + props.ragEngines.deleteWorkspaceWorkflow.grantStartExecution(apiHandler); + } + if (props.ragEngines?.sageMakerRagModelsEndpoint) { apiHandler.addToRolePolicy( new iam.PolicyStatement({ diff --git a/lib/rag-engines/data-import/functions/website-crawling-workflow/website-parser/index.py b/lib/rag-engines/data-import/functions/website-crawling-workflow/website-parser/index.py index 45299dd58..0a6801124 100644 --- a/lib/rag-engines/data-import/functions/website-crawling-workflow/website-parser/index.py +++ b/lib/rag-engines/data-import/functions/website-crawling-workflow/website-parser/index.py @@ -19,7 +19,7 @@ def lambda_handler(event, context: LambdaContext): workspace_id = event["workspace_id"] document_id = event["document_id"] response = s3_client.get_object(Bucket=bucket_name, Key=object_key) - file_content = response['Body'].read().decode('utf-8') + file_content = response["Body"].read().decode("utf-8") data = json.loads(file_content) iteration = data["iteration"] @@ -31,8 +31,7 @@ def lambda_handler(event, context: LambdaContext): follow_links = data["follow_links"] limit = data["limit"] - logger.info( - f"Processing document {document_id} in workspace {workspace_id}") + logger.info(f"Processing document {document_id} in workspace {workspace_id}") logger.info(f"Workspace: {workspace}") logger.info(f"Document: {document}") logger.info(f"Limit: {limit}") @@ -55,7 +54,9 @@ def lambda_handler(event, context: LambdaContext): result["iteration"] = iteration result["crawler_job_id"] = crawler_job_id - iteration_object_key = f"{workspace_id}/{document_id}/crawler/{crawler_job_id}/{iteration}.json" + iteration_object_key = ( + f"{workspace_id}/{document_id}/crawler/{crawler_job_id}/{iteration}.json" + ) s3_client.put_object( Bucket=PROCESSING_BUCKET_NAME, Key=iteration_object_key, diff --git a/lib/rag-engines/index.ts b/lib/rag-engines/index.ts index a869d7c3f..247574157 100644 --- a/lib/rag-engines/index.ts +++ b/lib/rag-engines/index.ts @@ -7,6 +7,7 @@ import { DataImport } from "./data-import"; import { RagDynamoDBTables } from "./rag-dynamodb-tables"; import { OpenSearchVector } from "./opensearch-vector"; import { KendraRetrieval } from "./kendra-retrieval"; +import { Workspaces } from "./workspaces"; import * as sagemaker from "aws-cdk-lib/aws-sagemaker"; import * as s3 from "aws-cdk-lib/aws-s3"; import * as dynamodb from "aws-cdk-lib/aws-dynamodb"; @@ -30,6 +31,7 @@ export class RagEngines extends Construct { public readonly 
sageMakerRagModelsEndpoint: sagemaker.CfnEndpoint; public readonly fileImportWorkflow?: sfn.StateMachine; public readonly websiteCrawlingWorkflow?: sfn.StateMachine; + public readonly deleteWorkspaceWorkflow?: sfn.StateMachine; constructor(scope: Construct, id: string, props: RagEnginesProps) { super(scope, id); @@ -86,6 +88,16 @@ export class RagEngines extends Construct { kendraRetrieval: kendraRetrieval ?? undefined, }); + const workspaces = new Workspaces(this, "Workspaces", { + shared: props.shared, + config: props.config, + dataImport, + ragDynamoDBTables: tables, + auroraPgVector: auroraPgVector ?? undefined, + openSearchVector: openSearchVector ?? undefined, + kendraRetrieval: kendraRetrieval ?? undefined, + }); + this.auroraPgVector = auroraPgVector; this.openSearchVector = openSearchVector; this.kendraRetrieval = kendraRetrieval; @@ -100,5 +112,6 @@ export class RagEngines extends Construct { tables.documentsByCompountKeyIndexName; this.fileImportWorkflow = dataImport.fileImportWorkflow; this.websiteCrawlingWorkflow = dataImport.websiteCrawlingWorkflow; + this.deleteWorkspaceWorkflow = workspaces.deleteWorkspaceWorkflow; } } diff --git a/lib/rag-engines/workspaces/delete-workspace.ts b/lib/rag-engines/workspaces/delete-workspace.ts new file mode 100644 index 000000000..26ac89984 --- /dev/null +++ b/lib/rag-engines/workspaces/delete-workspace.ts @@ -0,0 +1,171 @@ +import * as path from "path"; +import * as cdk from "aws-cdk-lib"; +import { Construct } from "constructs"; +import { SystemConfig } from "../../shared/types"; +import { Shared } from "../../shared"; +import { OpenSearchVector } from "../opensearch-vector"; +import { KendraRetrieval } from "../kendra-retrieval"; +import { AuroraPgVector } from "../aurora-pgvector"; +import { DataImport } from "../data-import"; +import { RagDynamoDBTables } from "../rag-dynamodb-tables"; +import * as sfn from "aws-cdk-lib/aws-stepfunctions"; +import * as tasks from "aws-cdk-lib/aws-stepfunctions-tasks"; +import * as lambda from "aws-cdk-lib/aws-lambda"; +import * as logs from "aws-cdk-lib/aws-logs"; +import * as iam from "aws-cdk-lib/aws-iam"; + +export interface DeleteWorkspaceProps { + readonly config: SystemConfig; + readonly shared: Shared; + readonly dataImport: DataImport; + readonly ragDynamoDBTables: RagDynamoDBTables; + readonly auroraPgVector?: AuroraPgVector; + readonly openSearchVector?: OpenSearchVector; + readonly kendraRetrieval?: KendraRetrieval; +} + +export class DeleteWorkspace extends Construct { + public readonly stateMachine?: sfn.StateMachine; + + constructor(scope: Construct, id: string, props: DeleteWorkspaceProps) { + super(scope, id); + + const deleteFunction = new lambda.Function( + this, + "DeleteWorkspaceFunction", + { + vpc: props.shared.vpc, + code: lambda.Code.fromAsset( + path.join(__dirname, "./functions/delete-workspace-workflow/delete") + ), + runtime: props.shared.pythonRuntime, + architecture: props.shared.lambdaArchitecture, + handler: "index.lambda_handler", + layers: [ + props.shared.powerToolsLayer, + props.shared.commonLayer.layer, + props.shared.pythonSDKLayer, + ], + timeout: cdk.Duration.minutes(15), + logRetention: logs.RetentionDays.ONE_WEEK, + environment: { + ...props.shared.defaultEnvironmentVariables, + AURORA_DB_SECRET_ID: props.auroraPgVector?.database.secret + ?.secretArn as string, + UPLOAD_BUCKET_NAME: props.dataImport.uploadBucket.bucketName, + PROCESSING_BUCKET_NAME: props.dataImport.processingBucket.bucketName, + WORKSPACES_TABLE_NAME: + 
props.ragDynamoDBTables.workspacesTable.tableName, + WORKSPACES_BY_OBJECT_TYPE_INDEX_NAME: + props.ragDynamoDBTables.workspacesByObjectTypeIndexName, + DOCUMENTS_TABLE_NAME: + props.ragDynamoDBTables?.documentsTable.tableName ?? "", + DOCUMENTS_BY_COMPOUND_KEY_INDEX_NAME: + props.ragDynamoDBTables?.documentsByCompountKeyIndexName ?? "", + DEFAULT_KENDRA_S3_DATA_SOURCE_BUCKET_NAME: + props.kendraRetrieval?.kendraS3DataSourceBucket?.bucketName ?? "", + OPEN_SEARCH_COLLECTION_ENDPOINT: + props.openSearchVector?.openSearchCollectionEndpoint ?? "", + }, + } + ); + + if (props.auroraPgVector) { + props.auroraPgVector.database.secret?.grantRead(deleteFunction); + props.auroraPgVector.database.connections.allowDefaultPortFrom( + deleteFunction + ); + } + + if (props.openSearchVector) { + deleteFunction.addToRolePolicy( + new iam.PolicyStatement({ + actions: [ + "aoss:APIAccessAll", + "aoss:DescribeIndex", + "aoss:DeleteIndex", + ], + resources: [props.openSearchVector.openSearchCollection.attrArn], + }) + ); + + props.openSearchVector.addToAccessPolicy( + "delete-workspace", + [deleteFunction.role?.roleArn], + [ + "aoss:DeleteIndex", + "aoss:DescribeIndex", + "aoss:ReadDocument", + "aoss:WriteDocument", + ] + ); + } + + props.dataImport.uploadBucket.grantReadWrite(deleteFunction); + props.dataImport.processingBucket.grantReadWrite(deleteFunction); + props.kendraRetrieval?.kendraS3DataSourceBucket?.grantReadWrite( + deleteFunction + ); + props.ragDynamoDBTables.workspacesTable.grantReadWriteData(deleteFunction); + props.ragDynamoDBTables.documentsTable.grantReadWriteData(deleteFunction); + + const handleError = new tasks.DynamoUpdateItem(this, "HandleError", { + table: props.ragDynamoDBTables.workspacesTable, + key: { + workspace_id: tasks.DynamoAttributeValue.fromString( + sfn.JsonPath.stringAt("$.workspace_id") + ), + object_type: tasks.DynamoAttributeValue.fromString("workspace"), + }, + updateExpression: "set #status = :error", + expressionAttributeNames: { + "#status": "status", + }, + expressionAttributeValues: { + ":error": tasks.DynamoAttributeValue.fromString("error"), + }, + }).next( + new sfn.Fail(this, "Fail", { + cause: "Workspace deletion failed", + }) + ); + + const setDeleting = new tasks.DynamoUpdateItem(this, "SetDeleting", { + table: props.ragDynamoDBTables.workspacesTable, + key: { + workspace_id: tasks.DynamoAttributeValue.fromString( + sfn.JsonPath.stringAt("$.workspace_id") + ), + object_type: tasks.DynamoAttributeValue.fromString("workspace"), + }, + updateExpression: "set #status=:statusValue", + expressionAttributeNames: { + "#status": "status", + }, + expressionAttributeValues: { + ":statusValue": tasks.DynamoAttributeValue.fromString("deleting"), + }, + resultPath: sfn.JsonPath.DISCARD, + }); + + const deleteTask = new tasks.LambdaInvoke(this, "Delete", { + lambdaFunction: deleteFunction, + resultPath: "$.deleteResult", + }).addCatch(handleError, { + errors: ["States.ALL"], + resultPath: "$.deleteResult", + }); + + const workflow = setDeleting + .next(deleteTask) + .next(new sfn.Succeed(this, "Success")); + + const stateMachine = new sfn.StateMachine(this, "DeleteWorkspace", { + definitionBody: sfn.DefinitionBody.fromChainable(workflow), + timeout: cdk.Duration.minutes(5), + comment: "Delete Workspace Workflow", + }); + + this.stateMachine = stateMachine; + } +} diff --git a/lib/rag-engines/workspaces/functions/delete-workspace-workflow/delete/index.py b/lib/rag-engines/workspaces/functions/delete-workspace-workflow/delete/index.py new file mode 100644 index 
000000000..a0a2abb18 --- /dev/null +++ b/lib/rag-engines/workspaces/functions/delete-workspace-workflow/delete/index.py @@ -0,0 +1,26 @@ +import genai_core.types +import genai_core.workspaces +import genai_core.aurora.delete +import genai_core.opensearch.delete +import genai_core.kendra.delete +from aws_lambda_powertools import Logger +from aws_lambda_powertools.utilities.typing import LambdaContext + +logger = Logger() + + +@logger.inject_lambda_context(log_event=True) +def lambda_handler(event, context: LambdaContext): + workspace_id = event["workspace_id"] + workspace = genai_core.workspaces.get_workspace(workspace_id) + if workspace is None: + raise genai_core.types.CommonError("Workspace not found") + + if workspace["engine"] == "aurora": + genai_core.aurora.delete.delete_aurora_workspace(workspace) + elif workspace["engine"] == "opensearch": + genai_core.opensearch.delete.delete_open_search_workspace(workspace) + elif workspace["engine"] == "kendra": + genai_core.kendra.delete.delete_kendra_workspace(workspace) + else: + raise genai_core.types.CommonError("Workspace engine not supported") diff --git a/lib/rag-engines/workspaces/index.ts b/lib/rag-engines/workspaces/index.ts new file mode 100644 index 000000000..601d48613 --- /dev/null +++ b/lib/rag-engines/workspaces/index.ts @@ -0,0 +1,40 @@ +import { Construct } from "constructs"; +import { SystemConfig } from "../../shared/types"; +import { Shared } from "../../shared"; +import { DeleteWorkspace } from "./delete-workspace"; +import { AuroraPgVector } from "../aurora-pgvector"; +import { OpenSearchVector } from "../opensearch-vector"; +import { KendraRetrieval } from "../kendra-retrieval"; +import { DataImport } from "../data-import"; +import { RagDynamoDBTables } from "../rag-dynamodb-tables"; +import * as sfn from "aws-cdk-lib/aws-stepfunctions"; + +export interface WorkspacesProps { + readonly config: SystemConfig; + readonly shared: Shared; + readonly dataImport: DataImport; + readonly ragDynamoDBTables: RagDynamoDBTables; + readonly auroraPgVector?: AuroraPgVector; + readonly openSearchVector?: OpenSearchVector; + readonly kendraRetrieval?: KendraRetrieval; +} + +export class Workspaces extends Construct { + public readonly deleteWorkspaceWorkflow?: sfn.StateMachine; + + constructor(scope: Construct, id: string, props: WorkspacesProps) { + super(scope, id); + + const workflow = new DeleteWorkspace(this, "DeleteWorkspace", { + config: props.config, + shared: props.shared, + dataImport: props.dataImport, + ragDynamoDBTables: props.ragDynamoDBTables, + auroraPgVector: props.auroraPgVector, + openSearchVector: props.openSearchVector, + kendraRetrieval: props.kendraRetrieval, + }); + + this.deleteWorkspaceWorkflow = workflow.stateMachine; + } +} diff --git a/lib/shared/layers/python-sdk/python/genai_core/aurora/delete.py b/lib/shared/layers/python-sdk/python/genai_core/aurora/delete.py new file mode 100644 index 000000000..ff2e59c95 --- /dev/null +++ b/lib/shared/layers/python-sdk/python/genai_core/aurora/delete.py @@ -0,0 +1,71 @@ +import os +import boto3 +import genai_core.utils.delete_files_with_prefix +from psycopg2 import sql +from genai_core.aurora.connection import AuroraConnection + +PROCESSING_BUCKET_NAME = os.environ["PROCESSING_BUCKET_NAME"] +UPLOAD_BUCKET_NAME = os.environ["UPLOAD_BUCKET_NAME"] +WORKSPACES_TABLE_NAME = os.environ["WORKSPACES_TABLE_NAME"] +DOCUMENTS_TABLE_NAME = os.environ.get("DOCUMENTS_TABLE_NAME") + +WORKSPACE_OBJECT_TYPE = "workspace" + +dynamodb = boto3.resource("dynamodb") + + +def
delete_aurora_workspace(workspace: dict): + workspace_id = workspace["workspace_id"] + genai_core.utils.delete_files_with_prefix.delete_files_with_prefix( + UPLOAD_BUCKET_NAME, workspace_id + ) + genai_core.utils.delete_files_with_prefix.delete_files_with_prefix( + PROCESSING_BUCKET_NAME, workspace_id + ) + + table_name = sql.Identifier(workspace_id.replace("-", "")) + with AuroraConnection(autocommit=False) as cursor: + cursor.execute( + sql.SQL("DROP TABLE IF EXISTS {table};").format(table=table_name) + ) + + workspaces_table = dynamodb.Table(WORKSPACES_TABLE_NAME) + documents_table = dynamodb.Table(DOCUMENTS_TABLE_NAME) + + items_to_delete = [] + last_evaluated_key = None + while True: + query_args = { + "KeyConditionExpression": boto3.dynamodb.conditions.Key("workspace_id").eq( + workspace_id + ) + } + + if last_evaluated_key: + query_args["ExclusiveStartKey"] = last_evaluated_key + + response = documents_table.query(**query_args) + items_to_delete.extend(response["Items"]) + + last_evaluated_key = response.get("LastEvaluatedKey") + if not last_evaluated_key: + break + + # Batch delete in groups of 25 + for i in range(0, len(items_to_delete), 25): + with documents_table.batch_writer() as batch: + for item in items_to_delete[i : i + 25]: + batch.delete_item( + Key={ + "workspace_id": item["workspace_id"], + "document_id": item["document_id"], + } + ) + + print(f"Deleted {len(items_to_delete)} items.") + + response = workspaces_table.delete_item( + Key={"workspace_id": workspace_id, "object_type": WORKSPACE_OBJECT_TYPE}, + ) + + print(f"Delete Item succeeded: {response}") diff --git a/lib/shared/layers/python-sdk/python/genai_core/chunks.py b/lib/shared/layers/python-sdk/python/genai_core/chunks.py index 09d154dc0..ebadd1b70 100644 --- a/lib/shared/layers/python-sdk/python/genai_core/chunks.py +++ b/lib/shared/layers/python-sdk/python/genai_core/chunks.py @@ -44,8 +44,7 @@ def add_chunks( ) chunk_ids = [uuid.uuid4() for _ in chunks] - store_chunks_on_s3(workspace_id, document_id, - document_sub_id, chunk_ids, chunks) + store_chunks_on_s3(workspace_id, document_id, document_sub_id, chunk_ids, chunks) if engine == "aurora": result = genai_core.aurora.chunks.add_chunks_aurora( diff --git a/lib/shared/layers/python-sdk/python/genai_core/documents.py b/lib/shared/layers/python-sdk/python/genai_core/documents.py index c7142b743..27f057ee9 100644 --- a/lib/shared/layers/python-sdk/python/genai_core/documents.py +++ b/lib/shared/layers/python-sdk/python/genai_core/documents.py @@ -170,8 +170,7 @@ def get_document_content(workspace_id: str, document_id: str): if genai_core.utils.files.file_exists( PROCESSING_BUCKET_NAME, content_complement_key ): - response = s3.Object(PROCESSING_BUCKET_NAME, - content_complement_key).get() + response = s3.Object(PROCESSING_BUCKET_NAME, content_complement_key).get() content_complement = response["Body"].read().decode("utf-8") return {"content": content, "content_complement": content_complement} @@ -313,7 +312,11 @@ def create_document( ) else: _process_document( - workspace, document, content=content, content_complement=content_complement, **kwargs + workspace, + document, + content=content, + content_complement=content_complement, + **kwargs, ) return { @@ -352,8 +355,7 @@ def _process_document_kendra( metadata["Title"] = title s3_client.copy_object( - CopySource={"Bucket": PROCESSING_BUCKET_NAME, - "Key": processing_object_key}, + CopySource={"Bucket": PROCESSING_BUCKET_NAME, "Key": processing_object_key}, Bucket=DEFAULT_KENDRA_S3_DATA_SOURCE_BUCKET_NAME, 
Key=kendra_object_key, ) @@ -424,38 +426,39 @@ def _process_document( follow_links = False try: - urls_to_crawl = genai_core.websites.extract_urls_from_sitemap( - path) + urls_to_crawl = genai_core.websites.extract_urls_from_sitemap(path) if len(urls_to_crawl) == 0: set_status(workspace_id, document_id, "error") - raise genai_core.types.CommonError( - "No urls found in sitemap") + raise genai_core.types.CommonError("No urls found in sitemap") except Exception as e: print(e) set_status(workspace_id, document_id, "error") - raise genai_core.types.CommonError( - "Error extracting urls from sitemap") + raise genai_core.types.CommonError("Error extracting urls from sitemap") iteration = 1 crawler_job_id = str(uuid.uuid4()) - iteration_object_key = f"{workspace_id}/{document_id}/crawler/{crawler_job_id}/{iteration}.json" - priority_queue = [{"url": url, "priority": 1} - for url in set(urls_to_crawl)] + iteration_object_key = ( + f"{workspace_id}/{document_id}/crawler/{crawler_job_id}/{iteration}.json" + ) + priority_queue = [{"url": url, "priority": 1} for url in set(urls_to_crawl)] s3_client.put_object( - Body=json.dumps({ - "iteration": iteration, - "crawler_job_id": crawler_job_id, - "workspace_id": workspace_id, - "document_id": document_id, - "workspace": workspace, - "document": document, - "priority_queue": priority_queue, - "processed_urls": [], - "follow_links": follow_links, - "limit": limit, - "done": False, - }, cls=genai_core.utils.json.CustomEncoder), + Body=json.dumps( + { + "iteration": iteration, + "crawler_job_id": crawler_job_id, + "workspace_id": workspace_id, + "document_id": document_id, + "workspace": workspace, + "document": document, + "priority_queue": priority_queue, + "processed_urls": [], + "follow_links": follow_links, + "limit": limit, + "done": False, + }, + cls=genai_core.utils.json.CustomEncoder, + ), Bucket=PROCESSING_BUCKET_NAME, Key=iteration_object_key, ContentType="application/json", @@ -463,13 +466,16 @@ def _process_document( response = sfn_client.start_execution( stateMachineArn=WEBSITE_CRAWLING_WORKFLOW_ARN, - input=json.dumps({ - "workspace_id": workspace_id, - "document_id": document_id, - "bucket_name": PROCESSING_BUCKET_NAME, - "object_key": iteration_object_key, - "done": False, - }, cls=genai_core.utils.json.CustomEncoder), + input=json.dumps( + { + "workspace_id": workspace_id, + "document_id": document_id, + "bucket_name": PROCESSING_BUCKET_NAME, + "object_key": iteration_object_key, + "done": False, + }, + cls=genai_core.utils.json.CustomEncoder, + ), ) print(response) diff --git a/lib/shared/layers/python-sdk/python/genai_core/kendra/delete.py b/lib/shared/layers/python-sdk/python/genai_core/kendra/delete.py new file mode 100644 index 000000000..f8d6f8a30 --- /dev/null +++ b/lib/shared/layers/python-sdk/python/genai_core/kendra/delete.py @@ -0,0 +1,72 @@ +import os +import boto3 +import genai_core.utils.delete_files_with_prefix + +PROCESSING_BUCKET_NAME = os.environ["PROCESSING_BUCKET_NAME"] +UPLOAD_BUCKET_NAME = os.environ["UPLOAD_BUCKET_NAME"] +WORKSPACES_TABLE_NAME = os.environ["WORKSPACES_TABLE_NAME"] +DOCUMENTS_TABLE_NAME = os.environ.get("DOCUMENTS_TABLE_NAME") +DEFAULT_KENDRA_S3_DATA_SOURCE_BUCKET_NAME = os.environ.get( + "DEFAULT_KENDRA_S3_DATA_SOURCE_BUCKET_NAME" +) + +WORKSPACE_OBJECT_TYPE = "workspace" + +dynamodb = boto3.resource("dynamodb") + + +def delete_kendra_workspace(workspace: dict): + workspace_id = workspace["workspace_id"] + genai_core.utils.delete_files_with_prefix.delete_files_with_prefix( + UPLOAD_BUCKET_NAME, 
workspace_id + ) + genai_core.utils.delete_files_with_prefix.delete_files_with_prefix( + PROCESSING_BUCKET_NAME, workspace_id + ) + genai_core.utils.delete_files_with_prefix.delete_files_with_prefix( + DEFAULT_KENDRA_S3_DATA_SOURCE_BUCKET_NAME, f"documents/{workspace_id}" + ) + genai_core.utils.delete_files_with_prefix.delete_files_with_prefix( + DEFAULT_KENDRA_S3_DATA_SOURCE_BUCKET_NAME, f"metadata/documents/{workspace_id}" + ) + + workspaces_table = dynamodb.Table(WORKSPACES_TABLE_NAME) + documents_table = dynamodb.Table(DOCUMENTS_TABLE_NAME) + + items_to_delete = [] + last_evaluated_key = None + while True: + query_args = { + "KeyConditionExpression": boto3.dynamodb.conditions.Key("workspace_id").eq( + workspace_id + ) + } + + if last_evaluated_key: + query_args["ExclusiveStartKey"] = last_evaluated_key + + response = documents_table.query(**query_args) + items_to_delete.extend(response["Items"]) + + last_evaluated_key = response.get("LastEvaluatedKey") + if not last_evaluated_key: + break + + # Batch delete in groups of 25 + for i in range(0, len(items_to_delete), 25): + with documents_table.batch_writer() as batch: + for item in items_to_delete[i : i + 25]: + batch.delete_item( + Key={ + "workspace_id": item["workspace_id"], + "document_id": item["document_id"], + } + ) + + print(f"Deleted {len(items_to_delete)} items.") + + response = workspaces_table.delete_item( + Key={"workspace_id": workspace_id, "object_type": WORKSPACE_OBJECT_TYPE}, + ) + + print(f"Delete Item succeeded: {response}") diff --git a/lib/shared/layers/python-sdk/python/genai_core/opensearch/delete.py b/lib/shared/layers/python-sdk/python/genai_core/opensearch/delete.py new file mode 100644 index 000000000..8b0377d7b --- /dev/null +++ b/lib/shared/layers/python-sdk/python/genai_core/opensearch/delete.py @@ -0,0 +1,72 @@ +import os +import boto3 +from .client import get_open_search_client +import genai_core.utils.delete_files_with_prefix + + +PROCESSING_BUCKET_NAME = os.environ["PROCESSING_BUCKET_NAME"] +UPLOAD_BUCKET_NAME = os.environ["UPLOAD_BUCKET_NAME"] +WORKSPACES_TABLE_NAME = os.environ["WORKSPACES_TABLE_NAME"] +DOCUMENTS_TABLE_NAME = os.environ.get("DOCUMENTS_TABLE_NAME") + +WORKSPACE_OBJECT_TYPE = "workspace" + +dynamodb = boto3.resource("dynamodb") + + +def delete_open_search_workspace(workspace: dict): + workspace_id = workspace["workspace_id"] + index_name = workspace_id.replace("-", "") + + genai_core.utils.delete_files_with_prefix.delete_files_with_prefix( + UPLOAD_BUCKET_NAME, workspace_id + ) + genai_core.utils.delete_files_with_prefix.delete_files_with_prefix( + PROCESSING_BUCKET_NAME, workspace_id + ) + + client = get_open_search_client() + if client.indices.exists(index_name): + client.indices.delete(index=index_name) + print(f"Index {index_name} deleted.") + + workspaces_table = dynamodb.Table(WORKSPACES_TABLE_NAME) + documents_table = dynamodb.Table(DOCUMENTS_TABLE_NAME) + + items_to_delete = [] + last_evaluated_key = None + while True: + query_args = { + "KeyConditionExpression": boto3.dynamodb.conditions.Key("workspace_id").eq( + workspace_id + ) + } + + if last_evaluated_key: + query_args["ExclusiveStartKey"] = last_evaluated_key + + response = documents_table.query(**query_args) + items_to_delete.extend(response["Items"]) + + last_evaluated_key = response.get("LastEvaluatedKey") + if not last_evaluated_key: + break + + # Batch delete in groups of 25 + for i in range(0, len(items_to_delete), 25): + with documents_table.batch_writer() as batch: + for item in items_to_delete[i : i + 25]: + 
batch.delete_item( + Key={ + "workspace_id": item["workspace_id"], + "document_id": item["document_id"], + } + ) + + print(f"Deleted {len(items_to_delete)} items.") + + response = workspaces_table.delete_item( + Key={"workspace_id": workspace_id, "object_type": WORKSPACE_OBJECT_TYPE}, + ) + + print(f"Delete Item succeeded: {response}") diff --git a/lib/shared/layers/python-sdk/python/genai_core/utils/delete_files_with_prefix.py b/lib/shared/layers/python-sdk/python/genai_core/utils/delete_files_with_prefix.py new file mode 100644 index 000000000..5ca60f7af --- /dev/null +++ b/lib/shared/layers/python-sdk/python/genai_core/utils/delete_files_with_prefix.py @@ -0,0 +1,33 @@ +import boto3 + + +def delete_files_with_prefix(bucket_name, prefix): + s3_client = boto3.client("s3") + continuation_token = None + + while True: + # If we have a continuation token from the previous response, use it + if continuation_token: + objects_to_delete = s3_client.list_objects_v2( + Bucket=bucket_name, Prefix=prefix, ContinuationToken=continuation_token + ) + else: + objects_to_delete = s3_client.list_objects_v2( + Bucket=bucket_name, Prefix=prefix + ) + + # Prepare the list of objects to delete + if "Contents" in objects_to_delete: + delete_list = [{"Key": obj["Key"]} for obj in objects_to_delete["Contents"]] + + # Delete the objects in a batch + s3_client.delete_objects( + Bucket=bucket_name, Delete={"Objects": delete_list} + ) + + # If there's no NextContinuationToken in the response, we've fetched all objects + if "NextContinuationToken" in objects_to_delete: + continuation_token = objects_to_delete["NextContinuationToken"] + else: + print("Finished deleting all objects with the specified prefix.") + break diff --git a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py index ed40b7949..7829ee4e5 100644 --- a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py +++ b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py @@ -32,8 +32,7 @@ def crawl_urls( if len(priority_queue) == 0: break - priority_queue = sorted( - priority_queue, key=lambda val: val["priority"]) + priority_queue = sorted(priority_queue, key=lambda val: val["priority"]) current = priority_queue.pop(0) current_url = current["url"] current_priority = current["priority"] diff --git a/lib/shared/layers/python-sdk/python/genai_core/workspaces.py b/lib/shared/layers/python-sdk/python/genai_core/workspaces.py index 8d7921977..d7490950e 100644 --- a/lib/shared/layers/python-sdk/python/genai_core/workspaces.py +++ b/lib/shared/layers/python-sdk/python/genai_core/workspaces.py @@ -21,6 +21,7 @@ CREATE_KENDRA_WORKSPACE_WORKFLOW_ARN = os.environ.get( "CREATE_KENDRA_WORKSPACE_WORKFLOW_ARN" ) +DELETE_WORKSPACE_WORKFLOW_ARN = os.environ.get("DELETE_WORKSPACE_WORKFLOW_ARN") WORKSPACE_OBJECT_TYPE = "workspace" @@ -105,7 +106,8 @@ def create_workspace_aurora( timestamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ") embeddings_model = genai_core.embeddings.get_embeddings_model( - embeddings_model_provider, embeddings_model_name) + embeddings_model_provider, embeddings_model_name + ) if not embeddings_model: raise genai_core.types.CommonError("Invalid embeddings model") # Verify that the embeddings model @@ -173,7 +175,8 @@ def create_workspace_open_search( timestamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ") embeddings_model = genai_core.embeddings.get_embeddings_model( - embeddings_model_provider, embeddings_model_name) +
embeddings_model_provider, embeddings_model_name + ) if not embeddings_model: raise genai_core.types.CommonError("Invalid embeddings model") # Verify that the embeddings model @@ -263,3 +266,28 @@ def create_workspace_kendra(workspace_name: str, kendra_index: dict): return { "id": workspace_id, } + + +def delete_workspace(workspace_id: str): + response = table.get_item( + Key={"workspace_id": workspace_id, "object_type": WORKSPACE_OBJECT_TYPE} + ) + + item = response.get("Item") + + if not item: + raise genai_core.types.CommonError("Workspace not found") + + if item["status"] != "ready" and item["status"] != "error": + raise genai_core.types.CommonError("Workspace not ready for deletion") + + response = sfn_client.start_execution( + stateMachineArn=DELETE_WORKSPACE_WORKFLOW_ARN, + input=json.dumps( + { + "workspace_id": workspace_id, + } + ), + ) + + print(response) diff --git a/lib/user-interface/react-app/src/common/api-client/workspaces-client.ts b/lib/user-interface/react-app/src/common/api-client/workspaces-client.ts index c94340e55..faa713611 100644 --- a/lib/user-interface/react-app/src/common/api-client/workspaces-client.ts +++ b/lib/user-interface/react-app/src/common/api-client/workspaces-client.ts @@ -30,6 +30,22 @@ export class WorkspacesClient extends ApiClientBase { } } + async deleteWorkspace( + workspaceId: string + ): Promise<ApiResult<void>> { + try { + const headers = await this.getHeaders(); + const result = await fetch(this.getApiUrl(`/workspaces/${workspaceId}`), { + headers, + method: "DELETE", + }); + + return result.json(); + } catch (error) { + return this.error(error); + } + } + async createAuroraWorkspace(params: { name: string; embeddingsModelProvider: string; diff --git a/lib/user-interface/react-app/src/common/constants.ts b/lib/user-interface/react-app/src/common/constants.ts index 3a534fcac..13ddd220f 100644 --- a/lib/user-interface/react-app/src/common/constants.ts +++ b/lib/user-interface/react-app/src/common/constants.ts @@ -46,6 +46,7 @@ export abstract class Labels { created: "success", processing: "in-progress", processed: "success", + deleting: "in-progress", error: "error", }; @@ -56,6 +57,7 @@ export abstract class Labels { created: "Created", processing: "Processing", processed: "Processed", + deleting: "Deleting", error: "Error", }; diff --git a/lib/user-interface/react-app/src/components/rag/workspace-delete-modal.tsx b/lib/user-interface/react-app/src/components/rag/workspace-delete-modal.tsx new file mode 100644 index 000000000..ed2cbc328 --- /dev/null +++ b/lib/user-interface/react-app/src/components/rag/workspace-delete-modal.tsx @@ -0,0 +1,59 @@ +import { + Modal, + Box, + SpaceBetween, + Button, + Alert, +} from "@cloudscape-design/components"; +import { WorkspaceItem } from "../../common/types"; + +export interface WorkspaceDeleteModalProps { + visible: boolean; + workspace?: WorkspaceItem; + onDelete: () => void; + onDiscard: () => void; +} + +export default function WorkspaceDeleteModal(props: WorkspaceDeleteModalProps) { + return ( + <Modal + onDismiss={props.onDiscard} + visible={props.visible} + header="Delete workspace" + footer={ + <Box float="right"> + <SpaceBetween direction="horizontal" size="xs"> + <Button variant="link" onClick={props.onDiscard}>Cancel</Button> + <Button variant="primary" onClick={props.onDelete}>Delete</Button> + </SpaceBetween> + </Box> + } + > + {props.workspace && ( + <SpaceBetween size="m"> + <Box variant="span"> + Permanently delete workspace{" "} + <Box variant="span" fontWeight="bold"> + {props.workspace.name} + </Box> + ? You can't undo this action. + </Box> + <Box variant="span"> + Workspace Id: {props.workspace.id} + </Box> + <Alert type="warning"> + Proceeding with this action will delete the workspace with all its + content.
+ </Alert> + </SpaceBetween> + )} + </Modal> + ); +} diff --git a/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx b/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx index c5e9c2180..ad8ae871b 100644 --- a/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx +++ b/lib/user-interface/react-app/src/pages/rag/add-data/crawl-website.tsx @@ -70,7 +70,7 @@ export default function CrawlWebsite(props: CrawlWebsiteProps) { } } - if (form.limit < 1 || form.limit > 500) { + if (form.limit < 1 || form.limit > 1000) { errors.limit = "Page limit should be between 1 and 1000"; } diff --git a/lib/user-interface/react-app/src/pages/rag/workspaces/workspaces-page-header.tsx b/lib/user-interface/react-app/src/pages/rag/workspaces/workspaces-page-header.tsx index 9965fca7f..2423da244 100644 --- a/lib/user-interface/react-app/src/pages/rag/workspaces/workspaces-page-header.tsx +++ b/lib/user-interface/react-app/src/pages/rag/workspaces/workspaces-page-header.tsx @@ -5,8 +5,12 @@ import { SpaceBetween, } from "@cloudscape-design/components"; import RouterButton from "../../../components/wrappers/router-button"; -import { WorkspaceItem } from "../../../common/types"; +import { ResultValue, WorkspaceItem } from "../../../common/types"; import { useNavigate } from "react-router-dom"; +import { useContext, useState } from "react"; +import WorkspaceDeleteModal from "../../../components/rag/workspace-delete-modal"; +import { ApiClient } from "../../../common/api-client/api-client"; +import { AppContext } from "../../../common/app-context"; interface WorkspacesPageHeaderProps extends HeaderProps { title?: string; @@ -20,7 +24,13 @@ export function WorkspacesPageHeader({ ...props }: WorkspacesPageHeaderProps) { const navigate = useNavigate(); + const appContext = useContext(AppContext); + const [showDeleteModal, setShowDeleteModal] = useState(false); const isOnlyOneSelected = props.selectedWorkspaces.length === 1; + const canDeleteWorkspace = + props.selectedWorkspaces.length === 1 && + (props.selectedWorkspaces[0].status === "ready" || + props.selectedWorkspaces[0].status === "error"); const onRefreshClick = async () => { await props.getWorkspaces(); @@ -32,31 +42,67 @@ export function WorkspacesPageHeader({ navigate(`/rag/workspaces/${props.selectedWorkspaces[0].id}`); }; + const onDeleteClick = () => { + setShowDeleteModal(true); + }; + + const onDeleteWorkspace = async () => { + if (!appContext) return; + if (!isOnlyOneSelected) return; + + setShowDeleteModal(false); + const apiClient = new ApiClient(appContext); + const result = await apiClient.workspaces.deleteWorkspace( + props.selectedWorkspaces[0].id + ); + + if (ResultValue.ok(result)) { + setTimeout(async () => { + await props.getWorkspaces(); + }, 2500); + } + }; + return (
- <Header - actions={ - <SpaceBetween direction="horizontal" size="xs"> - <Button iconName="refresh" onClick={onRefreshClick} /> - <RouterButton - disabled={!isOnlyOneSelected} - onClick={onViewDetailsClick} - > - View - </RouterButton> - <RouterButton variant="primary" href="/rag/workspaces/create"> - Create Workspace - </RouterButton> - </SpaceBetween> - } - {...props} - > - {props.title} - </Header> + <> + <WorkspaceDeleteModal + visible={showDeleteModal} + onDiscard={() => setShowDeleteModal(false)} + onDelete={onDeleteWorkspace} + workspace={props.selectedWorkspaces[0]} + /> + <Header + actions={ + <SpaceBetween direction="horizontal" size="xs"> + <Button iconName="refresh" onClick={onRefreshClick} /> + <RouterButton + disabled={!isOnlyOneSelected} + onClick={onViewDetailsClick} + > + View + </RouterButton> + <Button disabled={!canDeleteWorkspace} onClick={onDeleteClick}> + Delete + </Button> + <RouterButton variant="primary" href="/rag/workspaces/create"> + Create Workspace + </RouterButton> + </SpaceBetween> + } + {...props} + > + {props.title} + </Header> + </> ); }
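
For reviewers, a minimal usage sketch of the new delete path end to end — illustrative only, not part of the diff. It assumes the `ApiClient` wiring shown in `workspaces-page-header.tsx` and a pre-existing `getWorkspaces()` method on `WorkspacesClient` that this diff does not touch:

```ts
// Sketch: drive the new DELETE /workspaces/<workspace_id> endpoint.
// deleteWorkspace() only starts the DeleteWorkspace state machine; the
// workspace row flips to "deleting" (rendered as in-progress via the new
// Labels entries) and is removed from DynamoDB once the Delete lambda
// finishes, so the caller must refresh the list to observe completion.
import { ApiClient } from "./common/api-client/api-client";
import { ResultValue } from "./common/types";

async function deleteWorkspaceAndRefresh(
  apiClient: ApiClient,
  workspaceId: string
): Promise<void> {
  // The API rejects the request unless the workspace status is
  // "ready" or "error" (see genai_core.workspaces.delete_workspace).
  const result = await apiClient.workspaces.deleteWorkspace(workspaceId);
  if (!ResultValue.ok(result)) {
    throw new Error("Workspace is not in a deletable state");
  }

  // Mirror the UI: wait a moment, then re-query the workspace list.
  setTimeout(async () => {
    await apiClient.workspaces.getWorkspaces();
  }, 2500);
}
```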