From e34fe7f50c3f5999c84137611b30d5a6b555ea40 Mon Sep 17 00:00:00 2001
From: Vaidas Jablonskis <jablonskis@gmail.com>
Date: Tue, 28 May 2024 15:13:47 +0300
Subject: [PATCH] executor/ai: implement cluster scan prompting (#35)

---
 cmd/executor/ai/main.go               |  2 +-
 internal/source/ai-brain/assistant.go | 53 ++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/cmd/executor/ai/main.go b/cmd/executor/ai/main.go
index a235c83..8f7ede0 100644
--- a/cmd/executor/ai/main.go
+++ b/cmd/executor/ai/main.go
@@ -77,7 +77,7 @@ func (e *AIFace) Execute(_ context.Context, in executor.ExecuteInput) (executor.
 	aiBrainWebhookURL := fmt.Sprintf("%s/%s", in.Context.IncomingWebhook.BaseSourceURL, cfg.AIBrainSourceName)
 
 	body, err := json.Marshal(aibrain.Payload{
-		Prompt:    strings.TrimPrefix(in.Command, pluginName),
+		Prompt:    strings.TrimSpace(strings.TrimPrefix(in.Command, pluginName)),
 		MessageID: in.Context.Message.ParentActivityID,
 	})
 	if err != nil {
diff --git a/internal/source/ai-brain/assistant.go b/internal/source/ai-brain/assistant.go
index 56f3719..560d2f6 100644
--- a/internal/source/ai-brain/assistant.go
+++ b/internal/source/ai-brain/assistant.go
@@ -28,12 +28,47 @@ import (
 )
 
 const (
-	cacheTTL                = 8 * time.Hour
-	openAIPollInterval      = 2 * time.Second
-	maxToolExecutionRetries = 3
-	quotaExceededErrCode    = "quota_exceeded"
-	tracerName              = "source.aibrain"
-	serviceName             = "botkube-plugins-source-ai-brain"
+	cacheTTL                  = 8 * time.Hour
+	openAIPollInterval        = 2 * time.Second
+	maxToolExecutionRetries   = 3
+	quotaExceededErrCode      = "quota_exceeded"
+	tracerName                = "source.aibrain"
+	serviceName               = "botkube-plugins-source-ai-brain"
+	clusterScanSubcommandName = "scan"
+
+	clusterScanPrompt = `
+Scan the Kubernetes cluster for critical issues that could significantly impact the cluster's health, stability, or security.
+Focus on problems that may not be immediately apparent through events or standard monitoring.
+
+Specific Checks:
+
+Pod Health:
+Identify pods in a crash-loop backoff state with a high restart count.
+Identify pods that have been OOMKilled (Out of Memory Killed) multiple times.
+Look for pods stuck in a pending state for an extended period.
+Resource Utilization:
+Identify nodes or pods with critically high CPU or memory usage (e.g., above 90% of limits).
+Check for critical resource starvation issues affecting multiple pods or namespaces.
+Configuration:
+Look for pods running with very insecure capabilities (e.g., ALL, NET_RAW, SYS_ADMIN).
+Identify pods using deprecated or insecure container images.
+Check for misconfigured network policies that could expose sensitive services.
+Networking:
+Identify pods or services experiencing significant network latency or packet loss.
+Check for network partitions or connectivity issues between critical components.
+
+Provide a concise overview of the scan results, including the total number of
+critical issues found. If there were no issues found for a specific check, do
+not include that section in the report. List the Kubernetes objects directly
+affected by the issue. Make sure that your checks are relevant to the current
+state of the cluster, do not include resources that no longer exist.
+
+Additional Guidance for the LLM Agent:
+
+Prioritize issues that pose the most immediate threat to the cluster's stability, performance, or security.
+Filter out informational or low-severity issues that are unlikely to cause major problems.
+Be as specific as possible in the descriptions. Do not exceed 2000 characters in your response.
+`
 )
 
 var temperature float32 = 0.1
@@ -111,6 +146,10 @@ func (i *assistant) handle(ctx context.Context, in source.ExternalRequestInput)
 		return api.NewPlaintextMessage("Please clarify your question.", false), nil
 	}
 
+	if strings.ToLower(p.Prompt) == clusterScanSubcommandName {
+		p.Prompt = clusterScanPrompt
+	}
+
 	go func() {
 		if err := i.handleThread(ctx, &p); err != nil {
 			i.out <- source.Event{Message: i.handleThreadError(p.MessageID, err)}
@@ -141,6 +180,8 @@ func (i *assistant) handleThread(ctx context.Context, p *Payload) (err error) {
 	ctx, span := i.tracer.Start(ctx, "aibrain.assistant.handleThread")
 	defer span.End()
 
+	span.SetAttributes(attribute.String("payload.prompt", p.Prompt))
+
 	var thread openai.Thread
 
 	defer func() {