From 47333f5cd077a7b11b029b1853f5d74504d39330 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Sun, 6 Oct 2024 22:31:11 -0700 Subject: [PATCH 1/4] simplify logic for apply function credential config --- berkeley-function-call-leaderboard/README.md | 7 +-- .../_apply_function_credential_config.py} | 44 +++++++------------ 2 files changed, 17 insertions(+), 34 deletions(-) rename berkeley-function-call-leaderboard/{apply_function_credential_config.py => bfcl/_apply_function_credential_config.py} (79%) diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md index 1e5733c94..79257ca82 100644 --- a/berkeley-function-call-leaderboard/README.md +++ b/berkeley-function-call-leaderboard/README.md @@ -71,12 +71,7 @@ To run the executable test categories, there are 4 API keys to include: 3. OMDB API: http://www.omdbapi.com/apikey.aspx 4. Geocode API: https://geocode.maps.co/ -The `apply_function_credential_config.py` will automatically search for dataset files in the default `./data/` directory and replace the placeholder values with the actual API keys. -After you have filled in the necessary values in the `.env` file, you can run the following command to apply the real API keys to the dataset files. - -```bash -python apply_function_credential_config.py -``` +The evaluation script will automatically search for dataset files in the default `./data/` directory and replace the placeholder values with the actual API keys you provided in the `.env` file. ## Evaluating different models on the BFCL diff --git a/berkeley-function-call-leaderboard/apply_function_credential_config.py b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py similarity index 79% rename from berkeley-function-call-leaderboard/apply_function_credential_config.py rename to berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py index 6c0e8fe70..bfe220267 100644 --- a/berkeley-function-call-leaderboard/apply_function_credential_config.py +++ b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py @@ -1,17 +1,11 @@ import glob import json -import argparse import os -from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError -from dotenv import load_dotenv -parser = argparse.ArgumentParser(description="Replace placeholders in the function credential config file.") -parser.add_argument("--input-path", help="Path to the function credential config file. 
Can be a file or a directory.") -parser.add_argument("--output-path", help="Path to the output file.") -args = parser.parse_args() +from bfcl.constant import PROMPT_PATH +from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError -# Load the actual API keys -load_dotenv() +# Load the actual API keys, and verify that they are present ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY") PLACEHOLDERS = {} for var in ENV_VARS: @@ -19,6 +13,7 @@ raise NoAPIKeyError() PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var) +print("All API keys are present.") def replace_placeholders(data): @@ -57,16 +52,16 @@ def process_file(input_file_path, output_file_path): # Handle the case where a line is not a valid JSON object print("Invalid JSON line skipped.") continue - + # Write the modified data to the output file with open(output_file_path, "w") as f: for i, modified_line in enumerate(modified_data): f.write(modified_line) if i < len(modified_data) - 1: # Check against the length of modified_data - f.write("\n") + f.write("\n") print(f"All placeholders have been replaced for {input_file_path} 🦍.") - - + + def process_dir(input_dir, output_dir): # This function does not support nested directories # To support nested directories, refer to this commit: @@ -80,24 +75,17 @@ def process_dir(input_dir, output_dir): file_name = os.path.basename(input_file_path) output_file_path = os.path.join(output_dir, file_name) process_file(input_file_path, output_file_path) - - -if __name__ == "__main__": - # Verify all values are provided - for key, value in PLACEHOLDERS.items(): - if value == "": - raise NoAPIKeyError() - print("All API keys are present.") - - input_path = args.input_path + + +def apply_function_credential_config(input_path=None, output_path=None): + if input_path is None: - input_path = "./data/" - - output_path = args.output_path + input_path = PROMPT_PATH + if output_path is None: output_path = input_path - + if os.path.isdir(input_path): process_dir(input_path, output_path) else: - process_file(input_path, output_path) \ No newline at end of file + process_file(input_path, output_path) From 733cdcd18b461baf73621aa5ead7630ec49328fe Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Sun, 6 Oct 2024 22:31:48 -0700 Subject: [PATCH 2/4] fix all hard-coded relative file path --- .../bfcl/constant.py | 19 ++++++ .../bfcl/eval_checker/constant.py | 12 ++++ .../bfcl/eval_checker/eval_runner.py | 59 +++++++++++-------- .../bfcl/eval_checker/eval_runner_helper.py | 17 ++---- .../executable_eval/executable_checker.py | 19 +++--- .../bfcl/model_handler/base_handler.py | 10 ++-- .../openfunctions_evaluation.py | 35 ++++++----- .../pyproject.toml | 1 + 8 files changed, 108 insertions(+), 64 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/constant.py b/berkeley-function-call-leaderboard/bfcl/constant.py index 3e1022c60..f5ebe52f1 100644 --- a/berkeley-function-call-leaderboard/bfcl/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/constant.py @@ -1,5 +1,15 @@ +from pathlib import Path + +# NOTE: These paths are relative to the `bfcl` directory where this script is located. 
+RESULT_PATH = "../result/" +PROMPT_PATH = "../data/" +POSSIBLE_ANSWER_PATH = "../data/possible_answer/" +SCORE_PATH = "../score/" +DOTENV_PATH = "../.env" + VERSION_PREFIX = "BFCL_v3" +# These are in the PROMPT_PATH TEST_FILE_MAPPING = { "exec_simple": f"{VERSION_PREFIX}_exec_simple.json", "exec_parallel": f"{VERSION_PREFIX}_exec_parallel.json", @@ -164,3 +174,12 @@ "live_relevance", ], } + + +# Construct the full path to use by other scripts +script_dir = Path(__file__).parent +RESULT_PATH = script_dir / RESULT_PATH +PROMPT_PATH = script_dir / PROMPT_PATH +POSSIBLE_ANSWER_PATH = script_dir / POSSIBLE_ANSWER_PATH +SCORE_PATH = script_dir / SCORE_PATH +DOTENV_PATH = script_dir / DOTENV_PATH diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py index ac3c2ef55..c9e18b392 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py @@ -1,5 +1,8 @@ +from pathlib import Path + REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 +# These two files are for the API status sanity check REST_API_GROUND_TRUTH_FILE_PATH = ( "./executable_eval/data/api_status_check_ground_truth_REST.json" ) @@ -7,6 +10,9 @@ "./executable_eval/data/api_status_check_ground_truth_executable.json" ) +# This is the ground truth file for the `rest` test category +REST_EVAL_GROUND_TRUTH_PATH = "./executable_eval/data/rest-eval-response_v5.jsonl" + COLUMNS_NON_LIVE = [ "Rank", "Model", @@ -99,3 +105,9 @@ RED_FONT = "\033[91m" RESET = "\033[0m" + +# Construct the full path for other modules to use +script_dir = Path(__file__).parent +REST_API_GROUND_TRUTH_FILE_PATH = script_dir / REST_API_GROUND_TRUTH_FILE_PATH +EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = script_dir / EXECTUABLE_API_GROUND_TRUTH_FILE_PATH +REST_EVAL_GROUND_TRUTH_PATH = script_dir / REST_EVAL_GROUND_TRUTH_PATH diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py index 1427f13bb..24ce8fc91 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -1,6 +1,16 @@ import argparse -from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING, VERSION_PREFIX +from bfcl._apply_function_credential_config import apply_function_credential_config +from bfcl.constant import ( + DOTENV_PATH, + POSSIBLE_ANSWER_PATH, + PROMPT_PATH, + RESULT_PATH, + SCORE_PATH, + TEST_COLLECTION_MAPPING, + TEST_FILE_MAPPING, + VERSION_PREFIX, +) from bfcl.eval_checker.ast_eval.ast_checker import ast_checker from bfcl.eval_checker.eval_runner_helper import * from bfcl.eval_checker.executable_eval.custom_exception import BadAPIStatusError @@ -16,7 +26,9 @@ from dotenv import load_dotenv from tqdm import tqdm -# NOTE: This file should be run in the `eval_checker` directory +# A dictionary to store the evaluation scores. 
+# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count +LEADERBOARD_TABLE = {} def multi_turn_runner( @@ -124,7 +136,7 @@ def multi_turn_runner( }, ) output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) + output_file_dir = SCORE_PATH / model_name write_list_of_dicts_to_file(output_file_name, result, output_file_dir) return accuracy, len(model_result) @@ -227,7 +239,7 @@ def executable_file_runner(handler, model_result, prompt, model_name, test_categ }, ) output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) + output_file_dir = SCORE_PATH / model_name write_list_of_dicts_to_file(output_file_name, result, output_file_dir) return accuracy, len(model_result) @@ -299,7 +311,7 @@ def relevance_file_runner(handler, model_result, prompt, model_name, test_catego }, ) output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) + output_file_dir = SCORE_PATH / model_name write_list_of_dicts_to_file(output_file_name, result, output_file_dir) return accuracy, len(model_result) @@ -393,12 +405,18 @@ def ast_file_runner( }, ) output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) + output_file_dir = SCORE_PATH / model_name write_list_of_dicts_to_file(output_file_name, result, output_file_dir) return accuracy, len(model_result) +if not RESULT_PATH.exists(): + RESULT_PATH.mkdir(parents=True, exist_ok=True) +if not SCORE_PATH.exists(): + SCORE_PATH.mkdir(parents=True, exist_ok=True) + + #### Main runner function #### def runner(model_names, test_categories, api_sanity_check): @@ -408,6 +426,7 @@ def runner(model_names, test_categories, api_sanity_check): API_TESTED = False API_STATUS_ERROR_REST = None API_STATUS_ERROR_EXECUTABLE = None + HAS_REPLACED_API_CREDENTIALS = False # Before running the executable evaluation, we need to get the expected output from the ground truth. # So we need a list of all the test categories that we have ran the ground truth evaluation on. @@ -415,7 +434,7 @@ def runner(model_names, test_categories, api_sanity_check): EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] # Get a list of all entries in the folder - entries = os.scandir(INPUT_PATH) + entries = os.scandir(RESULT_PATH) # Filter out the subdirectories subdirs = [entry.path for entry in entries if entry.is_dir()] @@ -423,7 +442,7 @@ def runner(model_names, test_categories, api_sanity_check): # Traverse each subdirectory for subdir in subdirs: - model_name = subdir.split(INPUT_PATH)[1] + model_name = subdir.split(RESULT_PATH)[1] if model_names is not None and model_name not in model_names: continue @@ -495,6 +514,10 @@ def runner(model_names, test_categories, api_sanity_check): API_TESTED = True + if not HAS_REPLACED_API_CREDENTIALS: + apply_function_credential_config(input_path=PROMPT_PATH) + HAS_REPLACED_API_CREDENTIALS = True + if ( test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN and not is_rest(test_category) @@ -557,9 +580,9 @@ def runner(model_names, test_categories, api_sanity_check): # This function reads all the score files from local folder and updates the leaderboard table. # This is helpful when you only want to run the evaluation for a subset of models and test categories. 
- update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) + update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, SCORE_PATH) # Write the leaderboard table to a file - generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH, model_names, test_categories) + generate_leaderboard_csv(LEADERBOARD_TABLE, SCORE_PATH, model_names, test_categories) # Clean up the executable expected output files # They should be re-generated the next time the evaluation is run @@ -570,23 +593,13 @@ def runner(model_names, test_categories, api_sanity_check): ) print( - f"🏁 Evaluation completed. See {os.path.abspath(OUTPUT_PATH + 'data_overall.csv')} for evaluation results on BFCL V2." + f"🏁 Evaluation completed. See {SCORE_PATH / 'data_overall.csv'} for evaluation results on BFCL V2." ) print( - f"See {os.path.abspath(OUTPUT_PATH + 'data_live.csv')} and {os.path.abspath(OUTPUT_PATH + 'data_non_live.csv')} for evaluation results on BFCL V2 Live and Non-Live categories respectively." + f"See {SCORE_PATH / 'data_live.csv'} and {SCORE_PATH / 'data_non_live.csv'} for evaluation results on BFCL V2 Live and Non-Live categories respectively." ) -INPUT_PATH = "../../result/" -PROMPT_PATH = "../../data/" -POSSIBLE_ANSWER_PATH = "../../data/possible_answer/" -OUTPUT_PATH = "../../score/" - -# A dictionary to store the results -# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count -LEADERBOARD_TABLE = {} - - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Process two lists of strings.") @@ -629,5 +642,5 @@ def runner(model_names, test_categories, api_sanity_check): # We patch it here to avoid confusing the user. model_names.append(model_name.replace("/", "_")) - load_dotenv(dotenv_path="../../.env", verbose=True, override=True) # Load the .env file + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file runner(model_names, test_categories, api_sanity_check) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index e6988a2ce..7ac4c7538 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -3,13 +3,13 @@ import os import re import statistics -import subprocess import numpy as np +from bfcl._apply_function_credential_config import apply_function_credential_config +from bfcl.constant import VERSION_PREFIX from bfcl.eval_checker.constant import * from bfcl.eval_checker.executable_eval.custom_exception import BadAPIStatusError from bfcl.eval_checker.model_metadata import * -from bfcl.constant import VERSION_PREFIX from bfcl.model_handler.handler_map import handler_map from tqdm import tqdm @@ -148,12 +148,7 @@ def api_status_sanity_check_rest(): ground_truth_dummy = load_file(REST_API_GROUND_TRUTH_FILE_PATH) # Use the ground truth data to make sure the API is working correctly - command = f"cd ../.. 
; python apply_function_credential_config.py --input-path ./bfcl/eval_checker/{REST_API_GROUND_TRUTH_FILE_PATH};" - try: - subprocess.run(command, shell=True, capture_output=True, text=True, check=True) - except subprocess.CalledProcessError as e: - write_list_of_dicts_to_file(REST_API_GROUND_TRUTH_FILE_PATH, ground_truth_dummy) - raise RuntimeError(e.stderr) from e + apply_function_credential_config(input_path=REST_API_GROUND_TRUTH_FILE_PATH) ground_truth_replaced = load_file(REST_API_GROUND_TRUTH_FILE_PATH) write_list_of_dicts_to_file(REST_API_GROUND_TRUTH_FILE_PATH, ground_truth_dummy) @@ -620,7 +615,7 @@ def generate_leaderboard_csv( data_non_live.insert(0, COLUMNS_NON_LIVE) - filepath = os.path.join(output_path, "data_non_live.csv") + filepath = output_path / "data_non_live.csv" with open(filepath, "w") as f: for i, row in enumerate(data_non_live): if i < len(data_non_live) - 1: @@ -637,7 +632,7 @@ def generate_leaderboard_csv( data_live.insert(0, COLUMNS_LIVE) - filepath = os.path.join(output_path, "data_live.csv") + filepath = output_path / "data_live.csv" with open(filepath, "w") as f: for i, row in enumerate(data_live): if i < len(data_live) - 1: @@ -660,7 +655,7 @@ def generate_leaderboard_csv( data_combined.insert(0, COLUMNS_OVERALL) - filepath = os.path.join(output_path, "data_overall.csv") + filepath = output_path / "data_overall.csv" with open(filepath, "w") as f: for i, row in enumerate(data_combined): if i < len(data_combined) - 1: diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py index 922d041d4..c178dcefa 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py @@ -1,16 +1,15 @@ -from bfcl.eval_checker.constant import REAL_TIME_MATCH_ALLOWED_DIFFERENCE -from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError - -import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. -import time import json +import time - -#### Constants #### -EVAL_GROUND_TRUTH_PATH = ( - "./executable_eval/data/rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution +import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. 
+from bfcl.eval_checker.constant import ( + REAL_TIME_MATCH_ALLOWED_DIFFERENCE, + REST_EVAL_GROUND_TRUTH_PATH, ) -with open(EVAL_GROUND_TRUTH_PATH, "r") as f: +from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError + +# Load the ground truth data for the `rest` test category +with open(REST_EVAL_GROUND_TRUTH_PATH, "r") as f: EVAL_GROUND_TRUTH = f.readlines() diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py index 421e2e1cb..1e1106709 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py @@ -1,17 +1,16 @@ import json -import os import time +from bfcl.constant import RESULT_PATH, VERSION_PREFIX from bfcl.eval_checker.multi_turn_eval.multi_turn_utils import ( execute_multi_turn_func_call, is_empty_execute_response, ) from bfcl.model_handler.constant import ( - MAXIMUM_ROUND_LIMIT, DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC, DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING, + MAXIMUM_ROUND_LIMIT, ) -from bfcl.constant import VERSION_PREFIX from bfcl.model_handler.model_style import ModelStyle @@ -480,7 +479,8 @@ def decode_execute(self, result): def write(self, result): model_name_dir = self.model_name.replace("/", "_") - os.makedirs(f"./result/{model_name_dir}", exist_ok=True) + model_result_dir = RESULT_PATH / model_name_dir + model_name_dir.mkdir(parents=True, exist_ok=True) if type(result) is dict: result = [result] @@ -488,7 +488,7 @@ def write(self, result): for entry in result: test_category = entry["id"].rsplit("_", 1)[0] file_to_write = f"{VERSION_PREFIX}_{test_category}_result.json" - file_to_write = f"./result/{model_name_dir}/{file_to_write}" + file_to_write = model_result_dir / file_to_write with open(file_to_write, "a+") as f: try: f.write(json.dumps(entry) + "\n") diff --git a/berkeley-function-call-leaderboard/openfunctions_evaluation.py b/berkeley-function-call-leaderboard/openfunctions_evaluation.py index a7b929aa4..f830ce995 100644 --- a/berkeley-function-call-leaderboard/openfunctions_evaluation.py +++ b/berkeley-function-call-leaderboard/openfunctions_evaluation.py @@ -5,7 +5,15 @@ import time from concurrent.futures import ThreadPoolExecutor -from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING +from bfcl._apply_function_credential_config import apply_function_credential_config +from bfcl.constant import ( + DOTENV_PATH, + PROMPT_PATH, + RESULT_PATH, + TEST_COLLECTION_MAPPING, + TEST_FILE_MAPPING, +) +from bfcl.eval_checker.eval_runner_helper import is_executable from bfcl.model_handler.handler_map import handler_map from bfcl.model_handler.model_style import ModelStyle from dotenv import load_dotenv @@ -82,26 +90,19 @@ def parse_test_category_argument(test_category_args): def collect_test_cases(test_filename_total, model_name): model_name_dir = model_name.replace("/", "_") + model_result_dir = RESULT_PATH / model_name_dir + test_cases_total = [] for file_to_open in test_filename_total: test_cases = [] - with open("./data/" + file_to_open) as f: + with open(PROMPT_PATH / file_to_open) as f: for line in f: test_cases.append(json.loads(line)) existing_result = [] - if os.path.exists( - "./result/" - + model_name_dir - + "/" - + file_to_open.replace(".json", "_result.json") - ): - with open( - "./result/" - + model_name_dir - + "/" - + file_to_open.replace(".json", "_result.json") - ) as f: + result_file_path = 
model_result_dir / file_to_open.replace(".json", "_result.json") + if result_file_path.exists(): + with open(result_file_path) as f: for line in f: existing_result.append(json.loads(line)) @@ -194,7 +195,7 @@ def generate_results(args, model_name, test_cases_total): if __name__ == "__main__": - load_dotenv(dotenv_path="./.env", verbose=True, override=True) # Load the .env file + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file args = get_args() @@ -207,6 +208,10 @@ def generate_results(args, model_name, test_cases_total): print(f"Generating results for {args.model} on test category: {test_name_total}.") + # Apply function credential config if any of the test categories are executable + if any([is_executable(category) for category in test_name_total]): + apply_function_credential_config(input_path=PROMPT_PATH) + for model_name in args.model: if ( os.getenv("USE_COHERE_OPTIMIZATION") == "True" diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index b51cea2fa..1a6dbdc87 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "anthropic==0.31.1", "cohere==5.5.8", "google-cloud-aiplatform==1.65.0", + "pathlib", ] [tool.setuptools.packages.find] From 82859eb8e2fdca17c3f249791d4b888f991bee03 Mon Sep 17 00:00:00 2001 From: Huanzhi Mao Date: Mon, 7 Oct 2024 17:52:29 -0700 Subject: [PATCH 3/4] clean up --- .../bfcl/_apply_function_credential_config.py | 16 ++++++------- .../bfcl/constant.py | 13 ++++++---- .../bfcl/eval_checker/constant.py | 6 ++--- .../bfcl/eval_checker/eval_runner.py | 24 ++++--------------- .../bfcl/eval_checker/eval_runner_helper.py | 23 +++++++++--------- .../bfcl/model_handler/base_handler.py | 2 +- 6 files changed, 34 insertions(+), 50 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py index bfe220267..1a5059937 100644 --- a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py +++ b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py @@ -5,15 +5,8 @@ from bfcl.constant import PROMPT_PATH from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError -# Load the actual API keys, and verify that they are present ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY") PLACEHOLDERS = {} -for var in ENV_VARS: - if os.getenv(var) == "": - raise NoAPIKeyError() - - PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var) -print("All API keys are present.") def replace_placeholders(data): @@ -59,14 +52,13 @@ def process_file(input_file_path, output_file_path): f.write(modified_line) if i < len(modified_data) - 1: # Check against the length of modified_data f.write("\n") - print(f"All placeholders have been replaced for {input_file_path} 🦍.") def process_dir(input_dir, output_dir): # This function does not support nested directories # To support nested directories, refer to this commit: # https://github.com/ShishirPatil/gorilla/pull/508/commits/8b1e35590e5bce3bd52a7c6405775b1ce4a64945 - print(f"Input directory: {input_dir}") + # Get a list of all entries in the folder entries = os.scandir(input_dir) @@ -78,6 +70,11 @@ def process_dir(input_dir, output_dir): def apply_function_credential_config(input_path=None, output_path=None): + # Load the actual API keys, 
and verify that they are present + for var in ENV_VARS: + if os.getenv(var) == "": + raise NoAPIKeyError() + PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var) if input_path is None: input_path = PROMPT_PATH @@ -89,3 +86,4 @@ def apply_function_credential_config(input_path=None, output_path=None): process_dir(input_path, output_path) else: process_file(input_path, output_path) + print("All placeholders API keys have been replaced. 🦍") diff --git a/berkeley-function-call-leaderboard/bfcl/constant.py b/berkeley-function-call-leaderboard/bfcl/constant.py index f5ebe52f1..dbeaab0ec 100644 --- a/berkeley-function-call-leaderboard/bfcl/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/constant.py @@ -178,8 +178,11 @@ # Construct the full path to use by other scripts script_dir = Path(__file__).parent -RESULT_PATH = script_dir / RESULT_PATH -PROMPT_PATH = script_dir / PROMPT_PATH -POSSIBLE_ANSWER_PATH = script_dir / POSSIBLE_ANSWER_PATH -SCORE_PATH = script_dir / SCORE_PATH -DOTENV_PATH = script_dir / DOTENV_PATH +RESULT_PATH = (script_dir / RESULT_PATH).resolve() +PROMPT_PATH = (script_dir / PROMPT_PATH).resolve() +POSSIBLE_ANSWER_PATH = (script_dir / POSSIBLE_ANSWER_PATH).resolve() +SCORE_PATH = (script_dir / SCORE_PATH).resolve() +DOTENV_PATH = (script_dir / DOTENV_PATH).resolve() + +RESULT_PATH.mkdir(parents=True, exist_ok=True) +SCORE_PATH.mkdir(parents=True, exist_ok=True) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py index c9e18b392..f5d6524fd 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py @@ -108,6 +108,6 @@ # Construct the full path for other modules to use script_dir = Path(__file__).parent -REST_API_GROUND_TRUTH_FILE_PATH = script_dir / REST_API_GROUND_TRUTH_FILE_PATH -EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = script_dir / EXECTUABLE_API_GROUND_TRUTH_FILE_PATH -REST_EVAL_GROUND_TRUTH_PATH = script_dir / REST_EVAL_GROUND_TRUTH_PATH +REST_API_GROUND_TRUTH_FILE_PATH = (script_dir / REST_API_GROUND_TRUTH_FILE_PATH).resolve() +EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = (script_dir / EXECTUABLE_API_GROUND_TRUTH_FILE_PATH).resolve() +REST_EVAL_GROUND_TRUTH_PATH = (script_dir / REST_EVAL_GROUND_TRUTH_PATH).resolve() diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py index 24ce8fc91..8001fdfa6 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -1,6 +1,5 @@ import argparse -from bfcl._apply_function_credential_config import apply_function_credential_config from bfcl.constant import ( DOTENV_PATH, POSSIBLE_ANSWER_PATH, @@ -411,12 +410,6 @@ def ast_file_runner( return accuracy, len(model_result) -if not RESULT_PATH.exists(): - RESULT_PATH.mkdir(parents=True, exist_ok=True) -if not SCORE_PATH.exists(): - SCORE_PATH.mkdir(parents=True, exist_ok=True) - - #### Main runner function #### def runner(model_names, test_categories, api_sanity_check): @@ -426,7 +419,6 @@ def runner(model_names, test_categories, api_sanity_check): API_TESTED = False API_STATUS_ERROR_REST = None API_STATUS_ERROR_EXECUTABLE = None - HAS_REPLACED_API_CREDENTIALS = False # Before running the executable evaluation, we need to get the expected output from the ground truth. 
# So we need a list of all the test categories that we have ran the ground truth evaluation on. @@ -434,28 +426,24 @@ def runner(model_names, test_categories, api_sanity_check): EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] # Get a list of all entries in the folder - entries = os.scandir(RESULT_PATH) + entries = RESULT_PATH.iterdir() # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] + subdirs = [entry for entry in entries if entry.is_dir()] # Traverse each subdirectory for subdir in subdirs: - model_name = subdir.split(RESULT_PATH)[1] + model_name = subdir.relative_to(RESULT_PATH).name if model_names is not None and model_name not in model_names: continue model_name_escaped = model_name.replace("_", "/") - # Pattern to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - print(f"🦍 Model: {model_name}") # Find and process all JSON files in the subdirectory - for model_result_json in glob.glob(json_files_pattern): - + for model_result_json in subdir.glob("*.json"): test_category = extract_test_category(model_result_json) if test_categories is not None and test_category not in test_categories: continue @@ -514,10 +502,6 @@ def runner(model_names, test_categories, api_sanity_check): API_TESTED = True - if not HAS_REPLACED_API_CREDENTIALS: - apply_function_credential_config(input_path=PROMPT_PATH) - HAS_REPLACED_API_CREDENTIALS = True - if ( test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN and not is_rest(test_category) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index 7ac4c7538..77cde32f0 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -1,8 +1,9 @@ -import glob import json import os import re import statistics +from pathlib import Path +from typing import Union import numpy as np from bfcl._apply_function_credential_config import apply_function_credential_config @@ -14,7 +15,8 @@ from tqdm import tqdm -def extract_test_category(input_string): +def extract_test_category(input_string: Union[str, Path]) -> str: + input_string = str(input_string) pattern = fr".*{VERSION_PREFIX}_(\w+?)(?:_score|_result)?\.json" match = re.search(pattern, input_string) @@ -25,9 +27,8 @@ def extract_test_category(input_string): raise ValueError(f"Could not extract the test category from the input string: {input_string}") -def find_file_with_suffix(folder_path, suffix): - json_files_pattern = os.path.join(folder_path, "*.json") - for json_file in glob.glob(json_files_pattern): +def find_file_with_suffix(folder_path: Path, suffix: str) -> Path: + for json_file in folder_path.glob("*.json"): if extract_test_category(json_file) == suffix: return json_file raise FileNotFoundError(f"No JSON file found with suffix: {suffix}") @@ -789,20 +790,18 @@ def check_all_category_present(category_status, eval_models=None, eval_categorie return found_issues -def update_leaderboard_table_with_score_file(leaderboard_table, score_path): +def update_leaderboard_table_with_score_file(leaderboard_table, score_path: Path) -> None: - entries = os.scandir(score_path) + entries = score_path.iterdir() # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] + subdirs = [entry for entry in entries if entry.is_dir()] # Traverse each subdirectory for subdir in subdirs: - # Pattern 
to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - model_name = subdir.split(score_path)[1] + model_name = subdir.relative_to(score_path).name # Find and process all JSON files in the subdirectory - for model_score_json in glob.glob(json_files_pattern): + for model_score_json in subdir.glob("*.json"): metadata = load_file(model_score_json)[0] accuracy, total_count = metadata["accuracy"], metadata["total_count"] test_category = extract_test_category(model_score_json) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py index 1e1106709..7a19c26c0 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py @@ -480,7 +480,7 @@ def decode_execute(self, result): def write(self, result): model_name_dir = self.model_name.replace("/", "_") model_result_dir = RESULT_PATH / model_name_dir - model_name_dir.mkdir(parents=True, exist_ok=True) + model_result_dir.mkdir(parents=True, exist_ok=True) if type(result) is dict: result = [result] From 71769b116431f530f4226e6c05b7e91d485497b4 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Mon, 7 Oct 2024 21:43:27 -0700 Subject: [PATCH 4/4] better error message --- .../bfcl/_apply_function_credential_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py index 1a5059937..0cc282144 100644 --- a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py +++ b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py @@ -72,7 +72,7 @@ def process_dir(input_dir, output_dir): def apply_function_credential_config(input_path=None, output_path=None): # Load the actual API keys, and verify that they are present for var in ENV_VARS: - if os.getenv(var) == "": + if var not in os.environ or not os.getenv(var): raise NoAPIKeyError() PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var)
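
Taken together, this series turns the old standalone `apply_function_credential_config.py` CLI into an importable helper and replaces the hard-coded relative paths with `pathlib`-based constants. As a rough illustration only — a sketch assembled from the signatures shown in the diffs above, not code copied from the repository — the credential substitution can now be driven programmatically like this:

```python
# Sketch of the post-refactor flow; assumes the bfcl package layout introduced above.
from dotenv import load_dotenv

from bfcl._apply_function_credential_config import apply_function_credential_config
from bfcl.constant import DOTENV_PATH

# Load GEOCODE_API_KEY, RAPID_API_KEY, OMDB_API_KEY and EXCHANGERATE_API_KEY from the
# .env file; the helper raises NoAPIKeyError if any of them is missing or empty.
load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True)

# With no arguments, input_path defaults to PROMPT_PATH (the data/ directory) and
# output_path defaults to input_path, so "YOUR-GEOCODE-API-KEY"-style placeholders in
# every dataset file are replaced in place.
apply_function_credential_config()
```

This mirrors the call that `openfunctions_evaluation.py` now makes whenever an executable test category is selected, which is why the README no longer asks users to run the credential script by hand.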