diff --git a/.github/workflows/autoblocks-replays.yml b/.github/workflows/autoblocks-replays.yml index 6372bb8..2a66578 100644 --- a/.github/workflows/autoblocks-replays.yml +++ b/.github/workflows/autoblocks-replays.yml @@ -38,11 +38,6 @@ jobs: poetry config virtualenvs.in-project true poetry install - - name: Install Autoblocks SDK - run: | - source ${{ github.workspace }}/.venv/bin/activate - pip install git+https://nicolewhite:${{ secrets.NICOLES_GITHUB_TOKEN_DO_NOT_USE }}@github.com/autoblocksai/python-sdk.git@v0 - - name: Start the app run: poetry run start & env: @@ -69,13 +64,10 @@ jobs: message: request.payload mappers: query: properties.payload.query - __autoblocks_replay_trace_id: traceId # Filter out properties that are expected to be different on each # run to prevent the replay diffs from containing unnecessary noise property-filter-config: | - request.payload: - - payload.__autoblocks_replay_trace_id ai.intermediate.response: - response.id - response.created diff --git a/.gitignore b/.gitignore index 24c03fb..f667087 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ __pycache__/ .ruff_cache/ +.idea/ + *.pyc *.DS_Store diff --git a/README.md b/README.md index 4efb3ca..5a549e0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,66 @@ # Autoblocks Replays +This repository demonstrates how to integrate LLM chain replays into your code review process. It contains: + +* a [`simpleaichat`](https://github.com/minimaxir/simpleaichat) application that uses the [Autoblocks Python SDK](https://github.com/autoblocksai/python-sdk) to send events to the Autoblocks API +* a GitHub Actions workflow that **replays** real, past events from end users on every push to a feature branch + +With our GitHub integration enabled, your teammates are not only reviewing your code, but also the impact that code will have on your LLM chains, and therefore your end users. 
+ +Screenshot 2023-07-23 at 11 02 18 AM + +Unlike other solutions for testing LLMs, Autoblocks Replays are end-to-end. They test your LLM chains from the moment a user sends an input to your application to the moment your application sends a response to the user. This means you can not only review changes to the final response to the user, but also any intermediate steps that might have +changed along the way. This is especially useful for complicated chains that involve multiple services and multiple steps, e.g. if you're using a vector database, tool selection, etc. If you are only ever looking at the final response, it is hard to know which of the intermediate steps in your chain is causing the change. + +## Examples + +### Updating `simpleaichat`'s `character` input to `"Michael Scott"` + +* [Pull request](https://github.com/autoblocksai/demo-replays/pull/6) +* [Replay results](https://github.com/autoblocksai/demo-replays/pull/6#issuecomment-1652606398) + +Screenshot 2023-07-26 at 6 24 27 PM + +This small change leads to a large change in the final response to the user: + +Screenshot 2023-07-26 at 6 44 16 PM + +It also doesn't sound like Michael Scott from The Office. Digging into the intermediate steps, we can see `simpleaichat` updated the prompt with character instructions, but with the wrong Michael Scott: + +Screenshot 2023-07-26 at 6 46 32 PM + +### Increasing the `temperature` parameter + +* [Pull request](https://github.com/autoblocksai/demo-replays/pull/7) +* [Replay results](https://github.com/autoblocksai/demo-replays/pull/7#issuecomment-1652649904) + +Screenshot 2023-07-26 at 6 57 27 PM + +This change has pretty innocuous results on the final response to the user. The model +changes a few words here and there, but the messaging is very similar. 
+ +Query about San Francisco: + +Screenshot 2023-07-26 at 6 59 17 PM + +Query about highest points: + +Screenshot 2023-07-26 at 7 00 13 PM + +### Changing the description of the tools + +* [Pull request](https://github.com/autoblocksai/demo-replays/pull/2) +* [Replay results](https://github.com/autoblocksai/demo-replays/pull/2#issuecomment-1652129031) + +Autoblocks helps you better understand how your code changes affect the intermediate steps in your chain, especially if you're using a wrapper like `simpleaichat` or `LangChain`, both of which are higher level wrappers around calls to LLMs. For example, perhaps a teammate has not fully read the `simpleaichat` documentation and doesn't realize that the +doc strings of the functions passed to the `tools` array are actually used in the prompts! + +Screenshot 2023-07-26 at 7 12 01 PM + +Autoblocks would easily surface this change during the code review process: + +Screenshot 2023-07-26 at 7 11 28 PM + ## Replaying Locally Start the application with replays enabled: @@ -21,9 +82,7 @@ poetry run replay --view-id clkeamsei0001l908cmjjtqrf --num-traces 3 ``` ``` -################################################################################ Your replay id is 2023-07-23_09-36-36 -################################################################################ Replaying event {'id': 'geepag24zence2kbe0ppagt9', 'traceId': '7cb3ec98-b320-4e62-9a51-b15d0218ae4c', 'timestamp': '2023-07-22T18:32:51.862Z', 'message': 'request.payload', 'properties': {'payload': {'query': 'What are all of the airports in London?'}, 'source': 'DEMO_REPLAYS'}} ``` @@ -56,10 +115,6 @@ diff \ Use the [`autoblocksai/actions/replay`](https://github.com/autoblocksai/actions/tree/main/replay) action to replay events in a GitHub Actions workflow. This is similar to replaying events locally but allows you to automate replays in your CI workflow and view results in the GitHub UI. 
-The action will leave a comment on your commit with a summary of the replay results: +The action will leave a comment on your pull request with a summary of the replay results: -Screenshot 2023-07-23 at 11 49 25 AM - -You can view diffs of individual events or entire traces: - -Screenshot 2023-07-23 at 11 02 18 AM +Screenshot 2023-07-26 at 6 50 31 PM diff --git a/demo_replays/app.py b/demo_replays/app.py index 859322c..fbc426d 100644 --- a/demo_replays/app.py +++ b/demo_replays/app.py @@ -5,7 +5,7 @@ from flask import request from demo_replays import bot -from demo_replays.settings import AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME +from demo_replays.settings import AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME from demo_replays.settings import env app = Flask(__name__) @@ -26,7 +26,7 @@ def main(): # In production we generate a new trace id for each request, # but in a replay scenario we use the trace id passed in from the replay - trace_id = payload.get(AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME) or str(uuid.uuid4()) + trace_id = request.headers.get(AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME) or str(uuid.uuid4()) autoblocks = AutoblocksTracer( env.AUTOBLOCKS_INGESTION_KEY, trace_id=trace_id, properties=dict(source="DEMO_REPLAYS") diff --git a/demo_replays/replay.py b/demo_replays/replay.py index 5dd14ea..0e1f60a 100644 --- a/demo_replays/replay.py +++ b/demo_replays/replay.py @@ -4,7 +4,7 @@ from autoblocks.replays import replay_events_from_view from autoblocks.replays import start_replay -from demo_replays.settings import AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME +from demo_replays.settings import AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME from demo_replays.settings import env @@ -21,7 +21,11 @@ def static(): ("eiffel", "Eiffel Tower"), ]: print(f"Testing static event {trace_id} - {query}") - requests.post("http://localhost:5000", json={AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME: trace_id, "query": query}) + requests.post( + "http://localhost:5000", + json={"query": query}, + 
headers={AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME: trace_id}, + ) def dynamic(): @@ -51,8 +55,9 @@ def dynamic(): # The original payload payload = event.properties["payload"] - # Modify the payload to pass in the replay trace id - payload[AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME] = event.trace_id - # Replay the request - requests.post("http://localhost:5000", json=payload) + requests.post( + "http://localhost:5000", + json=payload, + headers={AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME: event.trace_id}, + ) diff --git a/demo_replays/settings.py b/demo_replays/settings.py index 0421aa8..71347ed 100644 --- a/demo_replays/settings.py +++ b/demo_replays/settings.py @@ -1,9 +1,8 @@ from pydantic_settings import BaseSettings -# A hidden param that is used to override the trace id that would usually -# be randomly generated for each request with the trace id of the event -# that is being replayed -AUTOBLOCKS_REPLAYS_TRACE_ID_PARAM_NAME = "__autoblocks_replay_trace_id" +# When a request is from a replay, this header contains the trace ID of +# the event being replayed. 
+AUTOBLOCKS_REPLAY_TRACE_ID_HEADER_NAME = "x-autoblocks-replay-trace-id" # Environment variables diff --git a/poetry.lock b/poetry.lock index db686af..afcce4e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -31,6 +31,20 @@ doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd- test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] trio = ["trio (<0.22)"] +[[package]] +name = "autoblocksai" +version = "0.0.1" +description = "" +optional = false +python-versions = ">=3.8,<4.0" +files = [ + {file = "autoblocksai-0.0.1-py3-none-any.whl", hash = "sha256:353f12235dab4b7400b40f9e0f47fd448f6f7b59f23bd94df4a2b8dd04dd850b"}, + {file = "autoblocksai-0.0.1.tar.gz", hash = "sha256:a33b3592dd5204a7969d87ea6348df85ef887dfe780c832d4a07faf282eb216c"}, +] + +[package.dependencies] +httpx = ">=0.24.0,<0.25.0" + [[package]] name = "blinker" version = "1.6.2" @@ -955,4 +969,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "fb85e6b4b9d3fe883206d9c505610732cecd62d0bc48e84dfe9b7ef7ef9ba779" +content-hash = "3f8570fdf16d701a88304bfb9aa1b6bc733c981cf61d72dd99ff65c6ae1129ec" diff --git a/pyproject.toml b/pyproject.toml index c493fa8..6f8fd8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ requests = "^2.31.0" flask = "^2.3.2" pydantic-settings = "^2.0.2" simpleaichat = "^0.2.2" -# autoblocksai = { git = "https://github.com/autoblocksai/python-sdk.git", branch = "v0" } +autoblocksai = "^0.0.1" [tool.poetry.group.dev.dependencies] pre-commit = "^3.3.3"