diff --git a/.github/workflows/autoblocks-replays.yml b/.github/workflows/autoblocks-replays.yml
index 6372bb8..2a66578 100644
--- a/.github/workflows/autoblocks-replays.yml
+++ b/.github/workflows/autoblocks-replays.yml
@@ -38,11 +38,6 @@ jobs:
poetry config virtualenvs.in-project true
poetry install
- - name: Install Autoblocks SDK
- run: |
- source ${{ github.workspace }}/.venv/bin/activate
- pip install git+https://nicolewhite:${{ secrets.NICOLES_GITHUB_TOKEN_DO_NOT_USE }}@github.com/autoblocksai/python-sdk.git@v0
-
- name: Start the app
run: poetry run start &
env:
@@ -69,13 +64,10 @@ jobs:
message: request.payload
mappers:
query: properties.payload.query
- __autoblocks_replay_trace_id: traceId
# Filter out properties that are expected to be different on each
# run to prevent the replay diffs from containing unnecessary noise
property-filter-config: |
- request.payload:
- - payload.__autoblocks_replay_trace_id
ai.intermediate.response:
- response.id
- response.created
diff --git a/.gitignore b/.gitignore
index 24c03fb..f667087 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
__pycache__/
.ruff_cache/
+.idea/
+
*.pyc
*.DS_Store
diff --git a/README.md b/README.md
index 4efb3ca..5a549e0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,66 @@
# Autoblocks Replays
+This repository demonstrates how to integrate LLM chain replays into your code review process. It contains:
+
+* a [`simpleaichat`](https://github.com/minimaxir/simpleaichat) application that uses the [Autoblocks Python SDK](https://github.com/autoblocksai/python-sdk) to send events to the Autoblocks API
+* a GitHub Actions workflow that **replays** real, past events from end users on every push to a feature branch
+
+With our GitHub integration enabled, your teammates are not only reviewing your code, but also the impact that code will have on your LLM chains, and therefore your end users.
+
+
+
+Unlike other solutions for testing LLMs, Autoblocks Replays are end-to-end. They test your LLM chains from the moment a user sends an input to your application to the moment your application sends a response to the user. This means you can not only review changes to the final response to the user, but also any intermediate steps that might have
+changed along the way. This is especially useful for complicated chains that involve multiple services and multiple steps, e.g. if you're using a vector database, tool selection, etc. If you are only ever looking at the final response, it is hard to know which of the intermediate steps in your chain is causing the change.
+
+## Examples
+
+### Updating `simpleaichat`'s `character` input to `"Michael Scott"`
+
+* [Pull request](https://github.com/autoblocksai/demo-replays/pull/6)
+* [Replay results](https://github.com/autoblocksai/demo-replays/pull/6#issuecomment-1652606398)
+
+
+
+This small change leads to a large change in the final response to the user:
+
+
+
+It also doesn't sound like Michael Scott from The Office. Digging into the intermediate steps, we can see `simpleaichat` updated the prompt with character instructions, but with the wrong Michael Scott:
+
+
+
+### Increasing the `temperature` parameter
+
+* [Pull request](https://github.com/autoblocksai/demo-replays/pull/7)
+* [Replay results](https://github.com/autoblocksai/demo-replays/pull/7#issuecomment-1652649904)
+
+
+
+This change has pretty innocuous results on the final response to the user. The model
+changes a few words here and there, but the messaging is very similar.
+
+Query about San Francisco:
+
+
+
+Query about highest points:
+
+
+
+### Changing the description of the tools
+
+* [Pull request](https://github.com/autoblocksai/demo-replays/pull/2)
+* [Replay results](https://github.com/autoblocksai/demo-replays/pull/2#issuecomment-1652129031)
+
+Autoblocks helps you better understand how your code changes affect the intermediate steps in your chain, especially if you're using a wrapper like `simpleaichat` or `LangChain`, both of which are higher level wrappers around calls to LLMs. For example, perhaps a teammate has not fully read the `simpleaichat` documentation and doesn't realize that the
+docstrings of the functions passed to the `tools` array are actually used in the prompts!
+
+
+
+Autoblocks would easily surface this change during the code review process:
+
+
+
## Replaying Locally
Start the application with replays enabled:
@@ -21,9 +82,7 @@ poetry run replay --view-id clkeamsei0001l908cmjjtqrf --num-traces 3
```
```
-################################################################################
Your replay id is 2023-07-23_09-36-36
-################################################################################
Replaying event {'id': 'geepag24zence2kbe0ppagt9', 'traceId': '7cb3ec98-b320-4e62-9a51-b15d0218ae4c', 'timestamp': '2023-07-22T18:32:51.862Z', 'message': 'request.payload', 'properties': {'payload': {'query': 'What are all of the airports in London?'}, 'source': 'DEMO_REPLAYS'}}
```
@@ -56,10 +115,6 @@ diff \
Use the [`autoblocksai/actions/replay`](https://github.com/autoblocksai/actions/tree/main/replay) action to replay events in a GitHub Actions workflow. This is similar to replaying events locally but allows you to automate replays in your CI workflow and view results in the GitHub UI.
-The action will leave a comment on your commit with a summary of the replay results:
+The action will leave a comment on your pull request with a summary of the replay results:
-
-
-You can view diffs of individual events or entire traces:
-
-
+
diff --git a/demo_replays/app.py b/demo_replays/app.py
index 859322c..fbc426d 100644
--- a/demo_replays/app.py
+++ b/demo_replays/app.py
@@ -5,7 +5,7 @@
from flask import request
from demo_replays import bot
-from demo_replays.settings import AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME
+from demo_replays.settings import AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME
from demo_replays.settings import env
app = Flask(__name__)
@@ -26,7 +26,7 @@ def main():
# In production we generate a new trace id for each request,
# but in a replay scenario we use the trace id passed in from the replay
- trace_id = payload.get(AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME) or str(uuid.uuid4())
+ trace_id = request.headers.get(AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME) or str(uuid.uuid4())
autoblocks = AutoblocksTracer(
env.AUTOBLOCKS_INGESTION_KEY, trace_id=trace_id, properties=dict(source="DEMO_REPLAYS")
diff --git a/demo_replays/replay.py b/demo_replays/replay.py
index 5dd14ea..0e1f60a 100644
--- a/demo_replays/replay.py
+++ b/demo_replays/replay.py
@@ -4,7 +4,7 @@
from autoblocks.replays import replay_events_from_view
from autoblocks.replays import start_replay
-from demo_replays.settings import AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME
+from demo_replays.settings import AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME
from demo_replays.settings import env
@@ -21,7 +21,11 @@ def static():
("eiffel", "Eiffel Tower"),
]:
print(f"Testing static event {trace_id} - {query}")
- requests.post("http://localhost:5000", json={AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME: trace_id, "query": query})
+ requests.post(
+ "http://localhost:5000",
+ json={"query": query},
+ headers={AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME: trace_id},
+ )
def dynamic():
@@ -51,8 +55,9 @@ def dynamic():
# The original payload
payload = event.properties["payload"]
- # Modify the payload to pass in the replay trace id
- payload[AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME] = event.trace_id
-
# Replay the request
- requests.post("http://localhost:5000", json=payload)
+ requests.post(
+ "http://localhost:5000",
+ json=payload,
+ headers={AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME: event.trace_id},
+ )
diff --git a/demo_replays/settings.py b/demo_replays/settings.py
index 0421aa8..71347ed 100644
--- a/demo_replays/settings.py
+++ b/demo_replays/settings.py
@@ -1,9 +1,8 @@
from pydantic_settings import BaseSettings
-# A hidden param that is used to override the trace id that would usually
-# be randomly generated for each request with the trace id of the event
-# that is being replayed
-AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME = "__autoblocks_replay_trace_id"
+# When a request is from a replay, this header contains the trace ID of
+# the event being replayed.
+AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME = "x-autoblocks-replay-trace-id"
# Environment variables
diff --git a/poetry.lock b/poetry.lock
index db686af..afcce4e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -31,6 +31,20 @@ doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-
test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
trio = ["trio (<0.22)"]
+[[package]]
+name = "autoblocksai"
+version = "0.0.1"
+description = ""
+optional = false
+python-versions = ">=3.8,<4.0"
+files = [
+ {file = "autoblocksai-0.0.1-py3-none-any.whl", hash = "sha256:353f12235dab4b7400b40f9e0f47fd448f6f7b59f23bd94df4a2b8dd04dd850b"},
+ {file = "autoblocksai-0.0.1.tar.gz", hash = "sha256:a33b3592dd5204a7969d87ea6348df85ef887dfe780c832d4a07faf282eb216c"},
+]
+
+[package.dependencies]
+httpx = ">=0.24.0,<0.25.0"
+
[[package]]
name = "blinker"
version = "1.6.2"
@@ -955,4 +969,4 @@ watchdog = ["watchdog (>=2.3)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
-content-hash = "fb85e6b4b9d3fe883206d9c505610732cecd62d0bc48e84dfe9b7ef7ef9ba779"
+content-hash = "3f8570fdf16d701a88304bfb9aa1b6bc733c981cf61d72dd99ff65c6ae1129ec"
diff --git a/pyproject.toml b/pyproject.toml
index c493fa8..6f8fd8d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ requests = "^2.31.0"
flask = "^2.3.2"
pydantic-settings = "^2.0.2"
simpleaichat = "^0.2.2"
-# autoblocksai = { git = "https://github.com/autoblocksai/python-sdk.git", branch = "v0" }
+autoblocksai = "^0.0.1"
[tool.poetry.group.dev.dependencies]
pre-commit = "^3.3.3"