Skip to content
This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

Commit

Permalink
file_logs, whitelist added
Browse files Browse the repository at this point in the history
  • Loading branch information
SilenNaihin committed Sep 20, 2023
1 parent b7ca3ed commit 851c7f6
Show file tree
Hide file tree
Showing 32 changed files with 8,331 additions and 140 deletions.
348 changes: 314 additions & 34 deletions paper/all_data_jsons.json

Large diffs are not rendered by default.

74 changes: 37 additions & 37 deletions paper/analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 24,
"metadata": {},
"outputs": [
{
Expand All @@ -226,64 +226,64 @@
"Checking folder results/full runs/final_optimization_1...\n",
"Checking folder results/full runs/final_optimization_2...\n",
"For reasoning_included = 1:\n",
" Mean Accuracy: 78.3%\n",
" Mean Precision: 69.6%\n",
" Mean Recall: 27.900000000000002%\n",
" Mean Accuracy: 78.325%\n",
" Mean Precision: 69.648%\n",
" Mean Recall: 27.908%\n",
" Mean Counters: {'TP': 92.93333333333334, 'TN': 952.7, 'FP': 49.3, 'FN': 240.06666666666666}\n",
"For expert_prompt = 1:\n",
" Mean Accuracy: 76.8%\n",
" Mean Precision: 61.0%\n",
" Mean Recall: 20.9%\n",
" Mean Accuracy: 76.791%\n",
" Mean Precision: 61.045%\n",
" Mean Recall: 20.906%\n",
" Mean Counters: {'TP': 69.61538461538461, 'TN': 955.5384615384615, 'FP': 46.46153846153846, 'FN': 263.38461538461536}\n",
"For agent_explanation = 0:\n",
" Mean Accuracy: 76.7%\n",
" Mean Precision: 59.0%\n",
" Mean Recall: 19.6%\n",
" Mean Accuracy: 76.704%\n",
" Mean Precision: 59.022%\n",
" Mean Recall: 19.551%\n",
" Mean Counters: {'TP': 65.10344827586206, 'TN': 958.8965517241379, 'FP': 43.10344827586207, 'FN': 267.8965517241379}\n",
"For prompt_included = 0:\n",
" Mean Accuracy: 76.6%\n",
" Mean Precision: 61.0%\n",
" Mean Recall: 20.4%\n",
" Mean Accuracy: 76.561%\n",
" Mean Precision: 61.028%\n",
" Mean Recall: 20.42%\n",
" Mean Counters: {'TP': 68, 'TN': 954.0869565217391, 'FP': 47.91304347826087, 'FN': 265}\n",
"For task_context = 0:\n",
" Mean Accuracy: 76.5%\n",
" Mean Precision: 61.6%\n",
" Mean Recall: 18.5%\n",
" Mean Accuracy: 76.526%\n",
" Mean Precision: 61.62%\n",
" Mean Recall: 18.504%\n",
" Mean Counters: {'TP': 61.61904761904762, 'TN': 960, 'FP': 42, 'FN': 271.3809523809524}\n",
"For few_shot_examples = 0:\n",
" Mean Accuracy: 76.4%\n",
" Mean Precision: 57.49999999999999%\n",
" Mean Recall: 19.5%\n",
" Mean Accuracy: 76.401%\n",
" Mean Precision: 57.489%\n",
" Mean Recall: 19.495%\n",
" Mean Counters: {'TP': 64.92, 'TN': 955.04, 'FP': 46.96, 'FN': 268.08}\n",
"For few_shot_examples = 1:\n",
" Mean Accuracy: 76.1%\n",
" Mean Precision: 59.099999999999994%\n",
" Mean Recall: 21.099999999999998%\n",
" Mean Accuracy: 76.131%\n",
" Mean Precision: 59.087%\n",
" Mean Recall: 21.126%\n",
" Mean Counters: {'TP': 70.35, 'TN': 946, 'FP': 56, 'FN': 262.65}\n",
"For task_context = 1:\n",
" Mean Accuracy: 76.1%\n",
" Mean Precision: 55.2%\n",
" Mean Recall: 21.7%\n",
" Mean Accuracy: 76.067%\n",
" Mean Precision: 55.206%\n",
" Mean Recall: 21.722%\n",
" Mean Counters: {'TP': 72.33333333333333, 'TN': 943.1666666666666, 'FP': 58.833333333333336, 'FN': 260.6666666666667}\n",
"For prompt_included = 1:\n",
" Mean Accuracy: 76.0%\n",
" Mean Precision: 55.2%\n",
" Mean Recall: 20.0%\n",
" Mean Accuracy: 75.989%\n",
" Mean Precision: 55.242%\n",
" Mean Recall: 20.011%\n",
" Mean Counters: {'TP': 66.63636363636364, 'TN': 947.8181818181819, 'FP': 54.18181818181818, 'FN': 266.3636363636364}\n",
"For expert_prompt = 0:\n",
" Mean Accuracy: 75.6%\n",
" Mean Precision: 54.300000000000004%\n",
" Mean Recall: 19.3%\n",
" Mean Accuracy: 75.584%\n",
" Mean Precision: 54.305%\n",
" Mean Recall: 19.282%\n",
" Mean Counters: {'TP': 64.21052631578948, 'TN': 944.8421052631579, 'FP': 57.1578947368421, 'FN': 268.7894736842105}\n",
"For agent_explanation = 1:\n",
" Mean Accuracy: 75.5%\n",
" Mean Precision: 56.699999999999996%\n",
" Mean Recall: 21.4%\n",
" Mean Accuracy: 75.515%\n",
" Mean Precision: 56.708%\n",
" Mean Recall: 21.434%\n",
" Mean Counters: {'TP': 71.375, 'TN': 936.75, 'FP': 65.25, 'FN': 261.625}\n",
"For reasoning_included = 0:\n",
" Mean Accuracy: 72.2%\n",
" Mean Precision: 35.3%\n",
" Mean Recall: 4.8%\n",
" Mean Accuracy: 72.195%\n",
" Mean Precision: 35.302%\n",
" Mean Recall: 4.845%\n",
" Mean Counters: {'TP': 16.133333333333333, 'TN': 947.6666666666666, 'FP': 54.333333333333336, 'FN': 316.8666666666667}\n"
]
}
Expand Down
143 changes: 74 additions & 69 deletions paper/data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 1,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -48,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -76,7 +76,7 @@
" 'TestAgentProtocol_ListAgentTasksIds'], dtype=object)"
]
},
"execution_count": 54,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -87,7 +87,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -105,7 +105,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -378,7 +378,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -404,87 +404,88 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"from collections import OrderedDict\n",
"\n",
"challenge = \"TestRememberMultipleIds\"\n",
"# challenge = \"TestRememberMultipleIds\"\n",
"\n",
"# Loop through unique agents\n",
"for agent in agent_array:\n",
" \n",
" master_response_dict = OrderedDict()\n",
" master_response_nested_dict = OrderedDict()\n",
" master_actions_dict = OrderedDict()\n",
" master_request_dict = OrderedDict()\n",
" master_general_actions_dict = OrderedDict()\n",
" \n",
" # Replace with your actual DataFrame\n",
" selected_df = df.loc[(df['agent'] == agent) & (df['challenge'] == challenge)]\n",
"\n",
" # Group by 'benchmark_start_time'\n",
" grouped_df = selected_df.groupby('benchmark_start_time')\n",
"\n",
" for timestamp, group in grouped_df:\n",
" response_dict = OrderedDict()\n",
" response_nested_dict = OrderedDict()\n",
" actions_dict = OrderedDict()\n",
" request_dict = OrderedDict()\n",
" general_actions_dict = OrderedDict()\n",
"for challenge in challenges_array:\n",
" for agent in ['auto-gpt']: # agent_array:\n",
" \n",
" total_rows = len(group)\n",
" master_response_dict = OrderedDict()\n",
" master_response_nested_dict = OrderedDict()\n",
" master_actions_dict = OrderedDict()\n",
" master_request_dict = OrderedDict()\n",
" master_general_actions_dict = OrderedDict()\n",
" \n",
" for i, (_, row) in enumerate(group.iterrows()):\n",
" response = json.loads(row['response'])\n",
" request = row['request']\n",
" response_nested = nested_json(row['response'])\n",
" # Replace with your actual DataFrame\n",
" selected_df = df.loc[(df['agent'] == agent) & (df['challenge'] == challenge)]\n",
"\n",
" # Group by 'benchmark_start_time'\n",
" grouped_df = selected_df.groupby('benchmark_start_time')\n",
"\n",
" for timestamp, group in grouped_df:\n",
" response_dict = OrderedDict()\n",
" response_nested_dict = OrderedDict()\n",
" actions_dict = OrderedDict()\n",
" request_dict = OrderedDict()\n",
" general_actions_dict = OrderedDict()\n",
" \n",
" response_dict[str(total_rows-i)] = response\n",
" request_dict[str(total_rows-i)] = request\n",
" response_nested_dict[str(total_rows-i)] = response_nested\n",
" total_rows = len(group)\n",
" \n",
" if is_action_agent(response, agent, challenge, response):\n",
" actions_dict[str(total_rows-i)] = response\n",
" for i, (_, row) in enumerate(group.iterrows()):\n",
" response = json.loads(row['response'])\n",
" request = row['request']\n",
" response_nested = nested_json(row['response'])\n",
" \n",
" response_dict[str(total_rows-i)] = response\n",
" request_dict[str(total_rows-i)] = request\n",
" response_nested_dict[str(total_rows-i)] = response_nested\n",
" \n",
" if is_action_agent(response, agent, challenge, response):\n",
" actions_dict[str(total_rows-i)] = response\n",
"\n",
" if is_action_general(response):\n",
" general_actions_dict[str(total_rows-i)] = response\n",
" if is_action_general(response):\n",
" general_actions_dict[str(total_rows-i)] = response\n",
" \n",
" response_dict = OrderedDict(reversed(list(response_dict.items())))\n",
" response_nested_dict = OrderedDict(reversed(list(response_nested_dict.items())))\n",
" actions_dict = OrderedDict(reversed(list(actions_dict.items())))\n",
" request_dict = OrderedDict(reversed(list(request_dict.items())))\n",
" general_actions_dict = OrderedDict(reversed(list(general_actions_dict.items())))\n",
" \n",
" master_response_dict[str(timestamp)] = response_dict\n",
" master_response_nested_dict[str(timestamp)] = response_nested_dict\n",
" master_actions_dict[str(timestamp)] = actions_dict\n",
" master_request_dict[str(timestamp)] = request_dict\n",
" master_general_actions_dict[str(timestamp)] = general_actions_dict\n",
" \n",
" response_dict = OrderedDict(reversed(list(response_dict.items())))\n",
" response_nested_dict = OrderedDict(reversed(list(response_nested_dict.items())))\n",
" actions_dict = OrderedDict(reversed(list(actions_dict.items())))\n",
" request_dict = OrderedDict(reversed(list(request_dict.items())))\n",
" general_actions_dict = OrderedDict(reversed(list(general_actions_dict.items())))\n",
" os.makedirs(f'specific_logs/{challenge}', exist_ok=True)\n",
" os.makedirs(f'specific_logs/{challenge}/{agent}', exist_ok=True)\n",
" \n",
" master_response_dict[str(timestamp)] = response_dict\n",
" master_response_nested_dict[str(timestamp)] = response_nested_dict\n",
" master_actions_dict[str(timestamp)] = actions_dict\n",
" master_request_dict[str(timestamp)] = request_dict\n",
" master_general_actions_dict[str(timestamp)] = general_actions_dict\n",
" \n",
" os.makedirs(f'specific_logs/{challenge}', exist_ok=True)\n",
" os.makedirs(f'specific_logs/{challenge}/{agent}', exist_ok=True)\n",
" \n",
" with open(f'specific_logs/{challenge}/{agent}/response.json', 'w') as f:\n",
" json.dump(master_response_dict, f, indent=4)\n",
" with open(f'specific_logs/{challenge}/{agent}/response.json', 'w') as f:\n",
" json.dump(master_response_dict, f, indent=4)\n",
"\n",
" with open(f'specific_logs/{challenge}/{agent}/regex_specific.json', 'w') as f:\n",
" json.dump(master_actions_dict, f, indent=4)\n",
" # with open(f'specific_logs/{challenge}/{agent}/regex_specific.json', 'w') as f:\n",
" # json.dump(master_actions_dict, f, indent=4)\n",
"\n",
" with open(f'specific_logs/{challenge}/{agent}/response_nested.json', 'w') as f:\n",
" json.dump(master_response_nested_dict, f, indent=4)\n",
" \n",
" with open(f'specific_logs/{challenge}/{agent}/request.json', 'w') as f:\n",
" json.dump(master_request_dict, f, indent=4)\n",
" # with open(f'specific_logs/{challenge}/{agent}/response_nested.json', 'w') as f:\n",
" # json.dump(master_response_nested_dict, f, indent=4)\n",
" \n",
" # with open(f'specific_logs/{challenge}/{agent}/request.json', 'w') as f:\n",
" # json.dump(master_request_dict, f, indent=4)\n",
"\n",
" with open(f'specific_logs/{challenge}/{agent}/regex_simple.json', 'w') as f:\n",
" json.dump(master_general_actions_dict, f, indent=4)\n",
" \n",
" with open(f'specific_logs/{challenge}/{agent}/response_malicious.json', 'w') as f:\n",
" json.dump(master_response_nested_dict, f, indent=4)\n",
" # with open(f'specific_logs/{challenge}/{agent}/regex_simple.json', 'w') as f:\n",
" # json.dump(master_general_actions_dict, f, indent=4)\n",
" \n",
" # with open(f'specific_logs/{challenge}/{agent}/response_malicious.json', 'w') as f:\n",
" # json.dump(master_response_nested_dict, f, indent=4)\n",
"\n"
]
},
Expand Down Expand Up @@ -708,23 +709,27 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"beebot filtered logs\n",
"Total logs: 6067\n",
"Simple regex filter logs: 3875\n",
"Specific regex filter logs: 878\n",
"mini-agi filtered logs\n",
"Total logs: 5558\n",
"Simple regex filter logs: 4037\n",
"Specific regex filter logs: 0\n",
"polygpt filtered logs\n",
"Total logs: 3866\n",
"Simple regex filter logs: 1255\n",
"Specific regex filter logs: 0\n",
"turbo filtered logs\n",
"Total logs: 649\n",
"Simple regex filter logs: 400\n",
"Specific regex filter logs: 59\n"
]
Expand Down
Empty file added paper/prompt_iteration.ipynb
Empty file.
Loading

0 comments on commit 851c7f6

Please sign in to comment.