[BFCL] Chore: Refactor File Path Handling and Automate apply_function_credential_config.py #675

Merged on Oct 9, 2024 (5 commits)
7 changes: 1 addition & 6 deletions berkeley-function-call-leaderboard/README.md
@@ -71,12 +71,7 @@ To run the executable test categories, there are 4 API keys to include:
3. OMDB API: http://www.omdbapi.com/apikey.aspx
4. Geocode API: https://geocode.maps.co/

The `apply_function_credential_config.py` will automatically search for dataset files in the default `./data/` directory and replace the placeholder values with the actual API keys.
After you have filled in the necessary values in the `.env` file, you can run the following command to apply the real API keys to the dataset files.

```bash
python apply_function_credential_config.py
```
The evaluation script will automatically search for dataset files in the default `./data/` directory and replace the placeholder values with the actual API keys you provided in the `.env` file.

## Evaluating different models on the BFCL

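As context for the README change above: the automated replacement expects four API keys to be available through the `.env` file. The variable names below are taken from `apply_function_credential_config.py` (shown next); the check itself is only an illustrative sketch, not part of this PR.

```python
import os

# The four keys the credential replacement looks for; their values come from .env.
ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY")

missing = [var for var in ENV_VARS if not os.getenv(var)]
if missing:
    raise RuntimeError(f"Missing API keys in .env / environment: {missing}")
```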
apply_function_credential_config.py
@@ -1,24 +1,12 @@
import glob
import json
import argparse
import os
from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError
from dotenv import load_dotenv

parser = argparse.ArgumentParser(description="Replace placeholders in the function credential config file.")
parser.add_argument("--input-path", help="Path to the function credential config file. Can be a file or a directory.")
parser.add_argument("--output-path", help="Path to the output file.")
args = parser.parse_args()
from bfcl.constant import PROMPT_PATH
from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError

# Load the actual API keys
load_dotenv()
ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY")
PLACEHOLDERS = {}
for var in ENV_VARS:
if os.getenv(var) == "":
raise NoAPIKeyError()

PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var)


def replace_placeholders(data):
@@ -57,21 +45,20 @@ def process_file(input_file_path, output_file_path):
# Handle the case where a line is not a valid JSON object
print("Invalid JSON line skipped.")
continue

# Write the modified data to the output file
with open(output_file_path, "w") as f:
for i, modified_line in enumerate(modified_data):
f.write(modified_line)
if i < len(modified_data) - 1: # Check against the length of modified_data
f.write("\n")
print(f"All placeholders have been replaced for {input_file_path} 🦍.")


f.write("\n")


def process_dir(input_dir, output_dir):
# This function does not support nested directories
# To support nested directories, refer to this commit:
# https://github.com/ShishirPatil/gorilla/pull/508/commits/8b1e35590e5bce3bd52a7c6405775b1ce4a64945
print(f"Input directory: {input_dir}")

# Get a list of all entries in the folder
entries = os.scandir(input_dir)

@@ -80,24 +67,23 @@ def process_dir(input_dir, output_dir):
file_name = os.path.basename(input_file_path)
output_file_path = os.path.join(output_dir, file_name)
process_file(input_file_path, output_file_path)


if __name__ == "__main__":
# Verify all values are provided
for key, value in PLACEHOLDERS.items():
if value == "":

def apply_function_credential_config(input_path=None, output_path=None):
# Load the actual API keys, and verify that they are present
for var in ENV_VARS:
if var not in os.environ or not os.getenv(var):
raise NoAPIKeyError()
print("All API keys are present.")

input_path = args.input_path
PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var)

if input_path is None:
input_path = "./data/"

output_path = args.output_path
input_path = PROMPT_PATH

if output_path is None:
output_path = input_path

if os.path.isdir(input_path):
process_dir(input_path, output_path)
else:
process_file(input_path, output_path)
process_file(input_path, output_path)
print("All placeholders API keys have been replaced. 🦍")
22 changes: 22 additions & 0 deletions berkeley-function-call-leaderboard/bfcl/constant.py
@@ -1,5 +1,15 @@
from pathlib import Path

# NOTE: These paths are relative to the `bfcl` directory where this script is located.
RESULT_PATH = "../result/"
PROMPT_PATH = "../data/"
POSSIBLE_ANSWER_PATH = "../data/possible_answer/"
SCORE_PATH = "../score/"
DOTENV_PATH = "../.env"

VERSION_PREFIX = "BFCL_v3"

# These are in the PROMPT_PATH
TEST_FILE_MAPPING = {
"exec_simple": f"{VERSION_PREFIX}_exec_simple.json",
"exec_parallel": f"{VERSION_PREFIX}_exec_parallel.json",
@@ -164,3 +174,15 @@
"live_relevance",
],
}


# Construct the full path to use by other scripts
script_dir = Path(__file__).parent
RESULT_PATH = (script_dir / RESULT_PATH).resolve()
PROMPT_PATH = (script_dir / PROMPT_PATH).resolve()
POSSIBLE_ANSWER_PATH = (script_dir / POSSIBLE_ANSWER_PATH).resolve()
SCORE_PATH = (script_dir / SCORE_PATH).resolve()
DOTENV_PATH = (script_dir / DOTENV_PATH).resolve()

RESULT_PATH.mkdir(parents=True, exist_ok=True)
SCORE_PATH.mkdir(parents=True, exist_ok=True)
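Since `bfcl/constant.py` now resolves these constants to absolute `Path` objects at import time (and eagerly creates the result and score directories), downstream modules can join paths directly. A small illustrative sketch using names defined above:

```python
from bfcl.constant import PROMPT_PATH, TEST_FILE_MAPPING

# Locate the prompt file for the exec_simple test category.
prompt_file = PROMPT_PATH / TEST_FILE_MAPPING["exec_simple"]
print(prompt_file)  # absolute path, independent of the current working directory
```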
12 changes: 12 additions & 0 deletions berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py
@@ -1,12 +1,18 @@
from pathlib import Path

REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2

# These two files are for the API status sanity check
REST_API_GROUND_TRUTH_FILE_PATH = (
"./executable_eval/data/api_status_check_ground_truth_REST.json"
)
EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = (
"./executable_eval/data/api_status_check_ground_truth_executable.json"
)

# This is the ground truth file for the `rest` test category
REST_EVAL_GROUND_TRUTH_PATH = "./executable_eval/data/rest-eval-response_v5.jsonl"

COLUMNS_NON_LIVE = [
"Rank",
"Model",
@@ -99,3 +105,9 @@

RED_FONT = "\033[91m"
RESET = "\033[0m"

# Construct the full path for other modules to use
script_dir = Path(__file__).parent
REST_API_GROUND_TRUTH_FILE_PATH = (script_dir / REST_API_GROUND_TRUTH_FILE_PATH).resolve()
EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = (script_dir / EXECTUABLE_API_GROUND_TRUTH_FILE_PATH).resolve()
REST_EVAL_GROUND_TRUTH_PATH = (script_dir / REST_EVAL_GROUND_TRUTH_PATH).resolve()
55 changes: 26 additions & 29 deletions berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py
@@ -1,6 +1,15 @@
import argparse

from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING, VERSION_PREFIX
from bfcl.constant import (
DOTENV_PATH,
POSSIBLE_ANSWER_PATH,
PROMPT_PATH,
RESULT_PATH,
SCORE_PATH,
TEST_COLLECTION_MAPPING,
TEST_FILE_MAPPING,
VERSION_PREFIX,
)
from bfcl.eval_checker.ast_eval.ast_checker import ast_checker
from bfcl.eval_checker.eval_runner_helper import *
from bfcl.eval_checker.executable_eval.custom_exception import BadAPIStatusError
@@ -16,7 +25,9 @@
from dotenv import load_dotenv
from tqdm import tqdm

# NOTE: This file should be run in the `eval_checker` directory
# A dictionary to store the evaluation scores.
# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count
LEADERBOARD_TABLE = {}


def multi_turn_runner(
@@ -138,7 +149,7 @@ def multi_turn_runner(
},
)
output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json"
output_file_dir = os.path.join(OUTPUT_PATH, model_name)
output_file_dir = SCORE_PATH / model_name
write_list_of_dicts_to_file(output_file_name, result, output_file_dir)

return accuracy, len(model_result)
@@ -241,7 +252,7 @@ def executable_file_runner(handler, model_result, prompt, model_name, test_categ
},
)
output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json"
output_file_dir = os.path.join(OUTPUT_PATH, model_name)
output_file_dir = SCORE_PATH / model_name
write_list_of_dicts_to_file(output_file_name, result, output_file_dir)

return accuracy, len(model_result)
@@ -313,7 +324,7 @@ def relevance_file_runner(handler, model_result, prompt, model_name, test_catego
},
)
output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json"
output_file_dir = os.path.join(OUTPUT_PATH, model_name)
output_file_dir = SCORE_PATH / model_name
write_list_of_dicts_to_file(output_file_name, result, output_file_dir)

return accuracy, len(model_result)
@@ -407,7 +418,7 @@ def ast_file_runner(
},
)
output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json"
output_file_dir = os.path.join(OUTPUT_PATH, model_name)
output_file_dir = SCORE_PATH / model_name
write_list_of_dicts_to_file(output_file_name, result, output_file_dir)

return accuracy, len(model_result)
@@ -429,28 +440,24 @@ def runner(model_names, test_categories, api_sanity_check):
EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = []

# Get a list of all entries in the folder
entries = os.scandir(INPUT_PATH)
entries = RESULT_PATH.iterdir()

# Filter out the subdirectories
subdirs = [entry.path for entry in entries if entry.is_dir()]
subdirs = [entry for entry in entries if entry.is_dir()]

# Traverse each subdirectory
for subdir in tqdm(subdirs, desc="Number of models evaluated"):

model_name = subdir.split(INPUT_PATH)[1]
model_name = subdir.relative_to(RESULT_PATH).name
if model_names is not None and model_name not in model_names:
continue

model_name_escaped = model_name.replace("_", "/")

# Pattern to match JSON files in this subdirectory
json_files_pattern = os.path.join(subdir, "*.json")

print(f"🦍 Model: {model_name}")

# Find and process all JSON files in the subdirectory
for model_result_json in glob.glob(json_files_pattern):

for model_result_json in subdir.glob("*.json"):
test_category = extract_test_category(model_result_json)
if test_categories is not None and test_category not in test_categories:
continue
@@ -571,9 +578,9 @@ def runner(model_names, test_categories, api_sanity_check):

# This function reads all the score files from local folder and updates the leaderboard table.
# This is helpful when you only want to run the evaluation for a subset of models and test categories.
update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH)
update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, SCORE_PATH)
# Write the leaderboard table to a file
generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH, model_names, test_categories)
generate_leaderboard_csv(LEADERBOARD_TABLE, SCORE_PATH, model_names, test_categories)

# Clean up the executable expected output files
# They should be re-generated the next time the evaluation is run
@@ -584,23 +591,13 @@
)

print(
f"🏁 Evaluation completed. See {os.path.abspath(OUTPUT_PATH + 'data_overall.csv')} for evaluation results on BFCL V2."
f"🏁 Evaluation completed. See {SCORE_PATH / 'data_overall.csv'} for evaluation results on BFCL V2."
)
print(
f"See {os.path.abspath(OUTPUT_PATH + 'data_live.csv')} and {os.path.abspath(OUTPUT_PATH + 'data_non_live.csv')} for evaluation results on BFCL V2 Live and Non-Live categories respectively."
f"See {SCORE_PATH / 'data_live.csv'} and {SCORE_PATH / 'data_non_live.csv'} for evaluation results on BFCL V2 Live and Non-Live categories respectively."
)


INPUT_PATH = "../../result/"
PROMPT_PATH = "../../data/"
POSSIBLE_ANSWER_PATH = "../../data/possible_answer/"
OUTPUT_PATH = "../../score/"

# A dictionary to store the results
# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count
LEADERBOARD_TABLE = {}


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process two lists of strings.")

@@ -643,5 +640,5 @@ def runner(model_names, test_categories, api_sanity_check):
# We patch it here to avoid confusing the user.
model_names.append(model_name.replace("/", "_"))

load_dotenv(dotenv_path="../../.env", verbose=True, override=True) # Load the .env file
load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file
runner(model_names, test_categories, api_sanity_check)
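For reference, the module-level comment added to `eval_runner.py` describes the layout of `LEADERBOARD_TABLE`. An illustrative sketch of that shape: the model name and numbers are invented, the category names come from the constants above, and the inner keys follow the comment's wording.

```python
# Illustrative only; not taken from the PR.
LEADERBOARD_TABLE = {
    "example_model": {
        "exec_simple": {"accuracy": 0.95, "total_count": 100},
        "live_relevance": {"accuracy": 0.80, "total_count": 50},
    },
}
```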