robusta-dev · aantn · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024
diff --git a/README.md b/README.md
@@ -237,9 +237,30 @@ In particular, note that [vLLM does not yet support function calling](https://gi
 
 </details>
 
+### Enabling Integrations
+
+<details>
+<summary>Confluence</summary>
+HolmesGPT can read runbooks from Confluence. To give it access, set the following environment variables:
+
+* CONFLUENCE_BASE_URL - e.g. https://robusta-dev-test.atlassian.net
+* CONFLUENCE_USER - e.g. user@company.com
+* CONFLUENCE_API_KEY - [refer to Atlassian docs on generating API keys](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/)
+</details>
+
+<details>
+<summary>
+Jira, GitHub, OpsGenie, PagerDuty, and AlertManager
+</summary>
+
+HolmesGPT can pull tickets/alerts from each of these sources and investigate them.
+
+Refer to `holmes investigate jira --help` (and the equivalent `--help` for each of the other sources) for details, or view the <a href="#examples">examples</a>.
+</details>
+
 ## Other Use Cases
 
-HolmesGPT is usually used for incident response, but it can function as a general-purpose DevOps assistant too. Here are some examples:
+HolmesGPT was designed for incident response, but it is a general DevOps assistant too. Here are some examples:
 
 <details>
 <summary>Ask Questions About Your Cloud</summary>

diff --git a/holmes/core/issue.py b/holmes/core/issue.py
@@ -20,7 +20,7 @@ class Issue(BaseModel):
     # Name of the issue - not necessarily unique  
     name: str                                      
 
-    # Source of the issue - e.g. Jira
+    # Source of the issue - e.g. jira
     source_type: str
 
     # Identifier for the instance of the source - e.g. Jira project key                                

diff --git a/holmes/core/tools.py b/holmes/core/tools.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import re
 import shlex
 import subprocess
@@ -115,15 +116,18 @@ def __execute_subprocess(self, cmd) -> str:
             return f"Command `{cmd}` failed with return code {e.returncode}\nstdout:\n{e.stdout}\nstderr:\n{e.stderr}"
 
 
-class ToolsetPrerequisite(BaseModel):
+class ToolsetCommandPrerequisite(BaseModel):
     command: str                 # must complete successfully (error code 0) for prereq to be satisfied
     expected_output: str = None  # optional
 
+class ToolsetEnvironmentPrerequisite(BaseModel):
+    env: List[str] = []          # optional
+
 class Toolset(BaseModel):
     model_config = ConfigDict(extra='forbid')
 
     name: str
-    prerequisites: List[ToolsetPrerequisite] = []
+    prerequisites: List[Union[ToolsetCommandPrerequisite, ToolsetEnvironmentPrerequisite]] = []
     tools: List[YAMLTool]
 
     _path: PrivateAttr = None
@@ -148,17 +152,24 @@ def get_disabled_reason(self):
 
     def check_prerequisites(self):
         for prereq in self.prerequisites:
-            try:
-                result = subprocess.run(prereq.command, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-                if prereq.expected_output and prereq.expected_output not in result.stdout:
+            if isinstance(prereq, ToolsetCommandPrerequisite):
+                try:
+                    result = subprocess.run(prereq.command, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                    if prereq.expected_output and prereq.expected_output not in result.stdout:
+                        self._enabled = False
+                        self._disabled_reason = f"prereq check gave wrong output"
+                        return
+                except subprocess.CalledProcessError as e:
                     self._enabled = False
-                    self._disabled_reason = f"prereq check gave wrong output"
-                    return 
-            except subprocess.CalledProcessError as e:
-                self._enabled = False
-                self._disabled_reason = f"prereq check failed w/ errorcode {e.returncode}"
-                logging.debug(f"Toolset {self.name} : Failed to run prereq command {prereq}", exc_info=True)
-                return
+                    self._disabled_reason = f"prereq check failed with errorcode {e.returncode}"
+                    logging.debug(f"Toolset {self.name} : Failed to run prereq command {prereq}", exc_info=True)
+                    return
+            elif isinstance(prereq, ToolsetEnvironmentPrerequisite):
+                for env_var in prereq.env:
+                    if env_var not in os.environ:
+                        self._enabled = False
+                        self._disabled_reason = f"prereq check failed because environment variable {env_var} was not set"
+                        return
         self._enabled = True
 
 class YAMLToolExecutor:

diff --git a/holmes/plugins/prompts/generic_ask.jinja2 b/holmes/plugins/prompts/generic_ask.jinja2
@@ -24,7 +24,7 @@ Examples:
 
 User: Why did the webserver-example app crash?
 (Call tool kubectl_find_resource kind=pod keyword=webserver`)
-(Call tool kubectl_logs_previous namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)
+(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)
 
 AI: `webserver-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
 Relevant logs:

diff --git a/holmes/plugins/prompts/generic_investigation.jinja2 b/holmes/plugins/prompts/generic_investigation.jinja2
@@ -15,8 +15,13 @@ Example investigation for a NodeUnavailableAlert:
 *Details:* Node `name-of-node` has 2.3% disk space remaining, causing the node to be unavailable for scheduling pods.
 
 If there are other resources that are impacted (other than the direct resource mentioned in the alert) list them as well under Resource.
-Whenever there are precise numbers in the data available, quote them.
-(E.g. don't say an app is repeatedly crashing, rather say the app has crashed X times so far. But only quote relevant numbers or metrics.)
+Whenever there are precise numbers in the data available, quote them. For example:
+* Don't say an app is repeatedly crashing, rather say the app has crashed X times so far
+* Don't just say x/y nodes don't match a pod's affinity selector, rather say x/y nodes don't match the selector ABC
+* And so on
+But only quote relevant numbers or metrics that are available. Do not guess.
+
+If a runbook url is present as well as a tool that can fetch it, you MUST fetch the runbook before beginning your investigation.
 
 When it can provide extra information, first run as many tools as you need to gather more information, then respond. 
 If possible, do so repeatedly on different IT resources.
@@ -25,7 +30,7 @@ You must use tools to investigate whenever possible.
 When investigating Kubernetes problems, run as many kubectl commands as you need to gather more information, then respond.
 If possible, do so repeatedly on different Kubernetes objects.
 For example, for deployments first run kubectl on the deployment then a replicaset inside it, then a pod inside that.
-When investigating a pod that crashed, fetch pods logs with --previous so you see logs from before the crash.
+Do not use kubectl_logs to fetch logs for a pod that crashed; use the kubectl_previous_logs tool instead.
 
 If you don't know, just say that the analysis was inconclusive.
 If there are multiple possible causes list them in a numbered list.
@@ -54,8 +59,20 @@ Remove every unnecessary word.
 *Surround the title of the root cause like this*. 
 Do not use markdown other than what is described above.
 
+Examples of tool usage:
+
+User: Why did the webserver-example app crash?
+(Call tool kubectl_find_resource kind=pod keyword=webserver)
+(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call and we use previous whenever investigating a crash)
+
+*Email validation error during HTTP request for /api/create_user*
+*Resource:* `webserver-example-1299492-d9g9d` in namespace `demos`
+*Details:* Validation error led to unhandled Java exception causing a crash: `2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body`
+
+End of Examples
+
 {% if runbooks %}
-Here are runbooks for this specific investigation. Please follow them if relevant.
+Here are runbooks for this specific investigation. Please follow them if relevant. THIS IS NOT IN PLACE OF RUNNING TOOLS!
 {% for r in runbooks %}
 * {{ r }}
 {% endfor %}

diff --git a/holmes/plugins/runbooks/jira.yaml b/holmes/plugins/runbooks/jira.yaml
@@ -0,0 +1,12 @@
+# runbooks for jira alerts
+# the AI will follow the instructions inside these runbooks to investigate alerts!
+# please feel free to open PRs adding your own runbooks
+runbooks:
+  - match:
+      source: "jira"
+    instructions: >
+      Investigate and try to solve whatever is written in the title and description of the ticket.
+      Ignore issues related to jira itself, like plugin or licensing problems.
+      Never give an answer like "XYZ is experiencing an issue, as indicated by the Jira issue. Further investigation is needed to determine the exact cause."
+      You are the agent that is supposed to investigate so do so!
+      If you have references to a service or a component, start by searching for related infrastructure or resources using tools that take keywords
diff --git a/holmes/plugins/sources/pagerduty/__init__.py b/holmes/plugins/sources/pagerduty/__init__.py
@@ -50,7 +50,7 @@ def convert_to_issue(self, source_issue):
         return Issue(
             id=source_issue["id"],
             name=source_issue["summary"],
-            source_type="PagerDuty",
+            source_type="pagerduty",
             source_instance_id=self.api_url,
             url=f"{source_issue['html_url']}",
             raw=source_issue,

diff --git a/holmes/plugins/toolsets/__init__.py b/holmes/plugins/toolsets/__init__.py
@@ -6,7 +6,7 @@
 
 from pydantic import BaseModel
 
-from holmes.core.tools import Toolset, ToolsetPrerequisite
+from holmes.core.tools import Toolset
 from holmes.utils.pydantic_utils import load_model_from_file
 
 THIS_DIR = os.path.abspath(os.path.dirname(__file__))

diff --git a/holmes/plugins/toolsets/external-knowledge.yaml b/holmes/plugins/toolsets/external-knowledge.yaml
@@ -0,0 +1,38 @@
+toolsets:
+- name: "confluence"
+  prerequisites:
+  - command: "curl --version"
+  - env:
+    - CONFLUENCE_USER
+    - CONFLUENCE_API_KEY
+    - CONFLUENCE_BASE_URL
+
+  tools:
+  - name: "fetch_confluence_url"
+    description: "Fetch a page in confluence.  Use this to fetch confluence runbooks if they are present before starting your investigation."
+    command: "curl -u ${CONFLUENCE_USER}:${CONFLUENCE_API_KEY} -X GET -H 'Content-Type: application/json' ${CONFLUENCE_BASE_URL}/wiki/rest/api/content/{{ confluence_page_id }}?expand=body.storage"
+
+
+- name: "internet"
+  prerequisites:
+  - command: "w3m -version"
+  tools:
+  - name: "fetch_webpage"
+    description: "Fetch a webpage with w3m. Use this to fetch runbooks if they are present before starting your investigation (if no other tool like confluence is more appropriate)"
+    command: "w3m -dump {{ url }}"
+
+
+- name: "slab"
+  prerequisites:
+  - command: "curl --version"
+  - env:
+    - SLAB_API_KEY
+  tools:
+  - name: "fetch_slab_document"
+    description: "Fetch a document from slab. Use this to fetch runbooks if they are present before starting your investigation."
+    command: |
+      curl -X POST \
+        -H "Authorization: ${SLAB_API_KEY}" \
+        -H "Content-Type: application/json" \
+        -d '{"query":"query { post(id: \"{{ post_id }}\") { id title content } }"}' \
+        https://api.slab.com/v1/graphql