[BFCL] Chore: Refactor File Path Handling and Automate apply_function_credential_config.py #675

Merged on Oct 9, 2024 (5 commits)
7 changes: 1 addition & 6 deletions berkeley-function-call-leaderboard/README.md
@@ -71,12 +71,7 @@ To run the executable test categories, there are 4 API keys to include:
3. OMDB API: http://www.omdbapi.com/apikey.aspx
4. Geocode API: https://geocode.maps.co/

The `apply_function_credential_config.py` will automatically search for dataset files in the default `./data/` directory and replace the placeholder values with the actual API keys.
After you have filled in the necessary values in the `.env` file, you can run the following command to apply the real API keys to the dataset files.

```bash
python apply_function_credential_config.py
```
The evaluation script will automatically search for dataset files in the default `./data/` directory and replace the placeholder values with the actual API keys you provided in the `.env` file.

## Evaluating different models on the BFCL

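As context for the README change above: the automated replacement expects four API keys to be available through the `.env` file. The variable names below are taken from `apply_function_credential_config.py` (shown next); the check itself is only an illustrative sketch, not part of this PR.

```python
import os

# The four keys the credential replacement looks for; their values come from .env.
ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY")

missing = [var for var in ENV_VARS if not os.getenv(var)]
if missing:
    raise RuntimeError(f"Missing API keys in .env / environment: {missing}")
```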
apply_function_credential_config.py
@@ -1,24 +1,12 @@
import glob
import json
import argparse
import os
from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError
from dotenv import load_dotenv

parser = argparse.ArgumentParser(description="Replace placeholders in the function credential config file.")
parser.add_argument("--input-path", help="Path to the function credential config file. Can be a file or a directory.")
parser.add_argument("--output-path", help="Path to the output file.")
args = parser.parse_args()
from bfcl.constant import PROMPT_PATH
from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError

# Load the actual API keys
load_dotenv()
ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY")
PLACEHOLDERS = {}
for var in ENV_VARS:
if os.getenv(var) == "":
raise NoAPIKeyError()

PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var)


def replace_placeholders(data):
@@ -57,21 +45,20 @@ def process_file(input_file_path, output_file_path):
# Handle the case where a line is not a valid JSON object
print("Invalid JSON line skipped.")
continue

# Write the modified data to the output file
with open(output_file_path, "w") as f:
for i, modified_line in enumerate(modified_data):
f.write(modified_line)
if i < len(modified_data) - 1: # Check against the length of modified_data
f.write("\n")
print(f"All placeholders have been replaced for {input_file_path} 🦍.")


f.write("\n")


def process_dir(input_dir, output_dir):
# This function does not support nested directories
# To support nested directories, refer to this commit:
# https://github.com/ShishirPatil/gorilla/pull/508/commits/8b1e35590e5bce3bd52a7c6405775b1ce4a64945
print(f"Input directory: {input_dir}")

# Get a list of all entries in the folder
entries = os.scandir(input_dir)

@@ -80,24 +67,23 @@ def process_dir(input_dir, output_dir):
file_name = os.path.basename(input_file_path)
output_file_path = os.path.join(output_dir, file_name)
process_file(input_file_path, output_file_path)


if __name__ == "__main__":
# Verify all values are provided
for key, value in PLACEHOLDERS.items():
if value == "":

def apply_function_credential_config(input_path=None, output_path=None):
# Load the actual API keys, and verify that they are present
for var in ENV_VARS:
if var not in os.environ or not os.getenv(var):
raise NoAPIKeyError()
print("All API keys are present.")

input_path = args.input_path
PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var)

if input_path is None:
input_path = "./data/"

output_path = args.output_path
input_path = PROMPT_PATH

if output_path is None:
output_path = input_path

if os.path.isdir(input_path):
process_dir(input_path, output_path)
else:
process_file(input_path, output_path)
process_file(input_path, output_path)
print("All placeholders API keys have been replaced. 🦍")
22 changes: 22 additions & 0 deletions berkeley-function-call-leaderboard/bfcl/constant.py
@@ -1,5 +1,15 @@
from pathlib import Path

# NOTE: These paths are relative to the `bfcl` directory where this script is located.
RESULT_PATH = "../result/"
PROMPT_PATH = "../data/"
POSSIBLE_ANSWER_PATH = "../data/possible_answer/"
SCORE_PATH = "../score/"
DOTENV_PATH = "../.env"

VERSION_PREFIX = "BFCL_v3"

# These are in the PROMPT_PATH
TEST_FILE_MAPPING = {
"exec_simple": f"{VERSION_PREFIX}_exec_simple.json",
"exec_parallel": f"{VERSION_PREFIX}_exec_parallel.json",
@@ -164,3 +174,15 @@
"live_relevance",
],
}


# Construct the full path to use by other scripts
script_dir = Path(__file__).parent
RESULT_PATH = (script_dir / RESULT_PATH).resolve()
PROMPT_PATH = (script_dir / PROMPT_PATH).resolve()
POSSIBLE_ANSWER_PATH = (script_dir / POSSIBLE_ANSWER_PATH).resolve()
SCORE_PATH = (script_dir / SCORE_PATH).resolve()
DOTENV_PATH = (script_dir / DOTENV_PATH).resolve()

RESULT_PATH.mkdir(parents=True, exist_ok=True)
SCORE_PATH.mkdir(parents=True, exist_ok=True)
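Since `bfcl/constant.py` now resolves these constants to absolute `Path` objects at import time (and eagerly creates the result and score directories), downstream modules can join paths directly. A small illustrative sketch using names defined above:

```python
from bfcl.constant import PROMPT_PATH, TEST_FILE_MAPPING

# Locate the prompt file for the exec_simple test category.
prompt_file = PROMPT_PATH / TEST_FILE_MAPPING["exec_simple"]
print(prompt_file)  # absolute path, independent of the current working directory
```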
12 changes: 12 additions & 0 deletions berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py
@@ -1,12 +1,18 @@
from pathlib import Path

REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2

# These two files are for the API status sanity check
REST_API_GROUND_TRUTH_FILE_PATH = (
"./executable_eval/data/api_status_check_ground_truth_REST.json"
)
EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = (
"./executable_eval/data/api_status_check_ground_truth_executable.json"
)

# This is the ground truth file for the `rest` test category
REST_EVAL_GROUND_TRUTH_PATH = "./executable_eval/data/rest-eval-response_v5.jsonl"

COLUMNS_NON_LIVE = [
"Rank",
"Model",
@@ -99,3 +105,9 @@

RED_FONT = "\033[91m"
RESET = "\033[0m"

# Construct the full path for other modules to use
script_dir = Path(__file__).parent
REST_API_GROUND_TRUTH_FILE_PATH = (script_dir / REST_API_GROUND_TRUTH_FILE_PATH).resolve()
EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = (script_dir / EXECTUABLE_API_GROUND_TRUTH_FILE_PATH).resolve()
REST_EVAL_GROUND_TRUTH_PATH = (script_dir / REST_EVAL_GROUND_TRUTH_PATH).resolve()
55 changes: 26 additions & 29 deletions berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py
@@ -1,6 +1,15 @@
import argparse

from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING, VERSION_PREFIX
from bfcl.constant import (
DOTENV_PATH,
POSSIBLE_ANSWER_PATH,
PROMPT_PATH,
RESULT_PATH,
SCORE_PATH,
TEST_COLLECTION_MAPPING,
TEST_FILE_MAPPING,
VERSION_PREFIX,
)
from bfcl.eval_checker.ast_eval.ast_checker import ast_checker
from bfcl.eval_checker.eval_runner_helper import *
from bfcl.eval_checker.executable_eval.custom_exception import BadAPIStatusError
@@ -16,7 +25,9 @@
from dotenv import load_dotenv
from tqdm import tqdm

# NOTE: This file should be run in the `eval_checker` directory
# A dictionary to store the evaluation scores.
# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count
LEADERBOARD_TABLE = {}


def multi_turn_runner(
@@ -138,7 +149,7 @@ def multi_turn_runner(
},
)
output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json"
output_file_dir = os.path.join(OUTPUT_PATH, model_name)
output_file_dir = SCORE_PATH / model_name
write_list_of_dicts_to_file(output_file_name, result, output_file_dir)

return accuracy, len(model_result)
@@ -241,7 +252,7 @@ def executable_file_runner(handler, model_result, prompt, model_name, test_categ
},
)
output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json"
output_file_dir = os.path.join(OUTPUT_PATH, model_name)
output_file_dir = SCORE_PATH / model_name
write_list_of_dicts_to_file(output_file_name, result, output_file_dir)

return accuracy, len(model_result)
@@ -313,7 +324,7 @@ def relevance_file_runner(handler, model_result, prompt, model_name, test_catego
},
)
output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json"
output_file_dir = os.path.join(OUTPUT_PATH, model_name)
output_file_dir = SCORE_PATH / model_name
write_list_of_dicts_to_file(output_file_name, result, output_file_dir)

return accuracy, len(model_result)
@@ -407,7 +418,7 @@ def ast_file_runner(
},
)
output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json"
output_file_dir = os.path.join(OUTPUT_PATH, model_name)
output_file_dir = SCORE_PATH / model_name
write_list_of_dicts_to_file(output_file_name, result, output_file_dir)

return accuracy, len(model_result)
@@ -429,28 +440,24 @@ def runner(model_names, test_categories, api_sanity_check):
EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = []

# Get a list of all entries in the folder
entries = os.scandir(INPUT_PATH)
entries = RESULT_PATH.iterdir()

# Filter out the subdirectories
subdirs = [entry.path for entry in entries if entry.is_dir()]
subdirs = [entry for entry in entries if entry.is_dir()]

# Traverse each subdirectory
for subdir in tqdm(subdirs, desc="Number of models evaluated"):

model_name = subdir.split(INPUT_PATH)[1]
model_name = subdir.relative_to(RESULT_PATH).name
if model_names is not None and model_name not in model_names:
continue

model_name_escaped = model_name.replace("_", "/")

# Pattern to match JSON files in this subdirectory
json_files_pattern = os.path.join(subdir, "*.json")

print(f"🦍 Model: {model_name}")

# Find and process all JSON files in the subdirectory
for model_result_json in glob.glob(json_files_pattern):

for model_result_json in subdir.glob("*.json"):
test_category = extract_test_category(model_result_json)
if test_categories is not None and test_category not in test_categories:
continue
@@ -571,9 +578,9 @@ def runner(model_names, test_categories, api_sanity_check):

# This function reads all the score files from local folder and updates the leaderboard table.
# This is helpful when you only want to run the evaluation for a subset of models and test categories.
update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH)
update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, SCORE_PATH)
# Write the leaderboard table to a file
generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH, model_names, test_categories)
generate_leaderboard_csv(LEADERBOARD_TABLE, SCORE_PATH, model_names, test_categories)

# Clean up the executable expected output files
# They should be re-generated the next time the evaluation is run
@@ -584,23 +591,13 @@
)

print(
f"🏁 Evaluation completed. See {os.path.abspath(OUTPUT_PATH + 'data_overall.csv')} for evaluation results on BFCL V2."
f"🏁 Evaluation completed. See {SCORE_PATH / 'data_overall.csv'} for evaluation results on BFCL V2."
)
print(
f"See {os.path.abspath(OUTPUT_PATH + 'data_live.csv')} and {os.path.abspath(OUTPUT_PATH + 'data_non_live.csv')} for evaluation results on BFCL V2 Live and Non-Live categories respectively."
f"See {SCORE_PATH / 'data_live.csv'} and {SCORE_PATH / 'data_non_live.csv'} for evaluation results on BFCL V2 Live and Non-Live categories respectively."
)


INPUT_PATH = "../../result/"
PROMPT_PATH = "../../data/"
POSSIBLE_ANSWER_PATH = "../../data/possible_answer/"
OUTPUT_PATH = "../../score/"

# A dictionary to store the results
# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count
LEADERBOARD_TABLE = {}


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process two lists of strings.")

@@ -643,5 +640,5 @@ def runner(model_names, test_categories, api_sanity_check):
# We patch it here to avoid confusing the user.
model_names.append(model_name.replace("/", "_"))

load_dotenv(dotenv_path="../../.env", verbose=True, override=True) # Load the .env file
load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file
runner(model_names, test_categories, api_sanity_check)
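For reference, the module-level comment added to `eval_runner.py` describes the layout of `LEADERBOARD_TABLE`. An illustrative sketch of that shape: the model name and numbers are invented, the category names come from the constants above, and the inner keys follow the comment's wording.

```python
# Illustrative only; not taken from the PR.
LEADERBOARD_TABLE = {
    "example_model": {
        "exec_simple": {"accuracy": 0.95, "total_count": 100},
        "live_relevance": {"accuracy": 0.80, "total_count": 50},
    },
}
```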