From 47333f5cd077a7b11b029b1853f5d74504d39330 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Sun, 6 Oct 2024 22:31:11 -0700 Subject: [PATCH 1/4] simplify logic for apply function credential config --- berkeley-function-call-leaderboard/README.md | 7 +-- .../_apply_function_credential_config.py} | 44 +++++++------------ 2 files changed, 17 insertions(+), 34 deletions(-) rename berkeley-function-call-leaderboard/{apply_function_credential_config.py => bfcl/_apply_function_credential_config.py} (79%) diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md index 1e5733c94..79257ca82 100644 --- a/berkeley-function-call-leaderboard/README.md +++ b/berkeley-function-call-leaderboard/README.md @@ -71,12 +71,7 @@ To run the executable test categories, there are 4 API keys to include: 3. OMDB API: http://www.omdbapi.com/apikey.aspx 4. Geocode API: https://geocode.maps.co/ -The `apply_function_credential_config.py` will automatically search for dataset files in the default `./data/` directory and replace the placeholder values with the actual API keys. -After you have filled in the necessary values in the `.env` file, you can run the following command to apply the real API keys to the dataset files. - -```bash -python apply_function_credential_config.py -``` +The evaluation script will automatically search for dataset files in the default `./data/` directory and replace the placeholder values with the actual API keys you provided in the `.env` file. ## Evaluating different models on the BFCL diff --git a/berkeley-function-call-leaderboard/apply_function_credential_config.py b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py similarity index 79% rename from berkeley-function-call-leaderboard/apply_function_credential_config.py rename to berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py index 6c0e8fe70..bfe220267 100644 --- a/berkeley-function-call-leaderboard/apply_function_credential_config.py +++ b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py @@ -1,17 +1,11 @@ import glob import json -import argparse import os -from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError -from dotenv import load_dotenv -parser = argparse.ArgumentParser(description="Replace placeholders in the function credential config file.") -parser.add_argument("--input-path", help="Path to the function credential config file. 
Can be a file or a directory.") -parser.add_argument("--output-path", help="Path to the output file.") -args = parser.parse_args() +from bfcl.constant import PROMPT_PATH +from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError -# Load the actual API keys -load_dotenv() +# Load the actual API keys, and verify that they are present ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY") PLACEHOLDERS = {} for var in ENV_VARS: @@ -19,6 +13,7 @@ raise NoAPIKeyError() PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var) +print("All API keys are present.") def replace_placeholders(data): @@ -57,16 +52,16 @@ def process_file(input_file_path, output_file_path): # Handle the case where a line is not a valid JSON object print("Invalid JSON line skipped.") continue - + # Write the modified data to the output file with open(output_file_path, "w") as f: for i, modified_line in enumerate(modified_data): f.write(modified_line) if i < len(modified_data) - 1: # Check against the length of modified_data - f.write("\n") + f.write("\n") print(f"All placeholders have been replaced for {input_file_path} 🦍.") - - + + def process_dir(input_dir, output_dir): # This function does not support nested directories # To support nested directories, refer to this commit: @@ -80,24 +75,17 @@ def process_dir(input_dir, output_dir): file_name = os.path.basename(input_file_path) output_file_path = os.path.join(output_dir, file_name) process_file(input_file_path, output_file_path) - - -if __name__ == "__main__": - # Verify all values are provided - for key, value in PLACEHOLDERS.items(): - if value == "": - raise NoAPIKeyError() - print("All API keys are present.") - - input_path = args.input_path + + +def apply_function_credential_config(input_path=None, output_path=None): + if input_path is None: - input_path = "./data/" - - output_path = args.output_path + input_path = PROMPT_PATH + if output_path is None: output_path = input_path - + if os.path.isdir(input_path): process_dir(input_path, output_path) else: - process_file(input_path, output_path) \ No newline at end of file + process_file(input_path, output_path) From 733cdcd18b461baf73621aa5ead7630ec49328fe Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Sun, 6 Oct 2024 22:31:48 -0700 Subject: [PATCH 2/4] fix all hard-coded relative file path --- .../bfcl/constant.py | 19 ++++++ .../bfcl/eval_checker/constant.py | 12 ++++ .../bfcl/eval_checker/eval_runner.py | 59 +++++++++++-------- .../bfcl/eval_checker/eval_runner_helper.py | 17 ++---- .../executable_eval/executable_checker.py | 19 +++--- .../bfcl/model_handler/base_handler.py | 10 ++-- .../openfunctions_evaluation.py | 35 ++++++----- .../pyproject.toml | 1 + 8 files changed, 108 insertions(+), 64 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/constant.py b/berkeley-function-call-leaderboard/bfcl/constant.py index 3e1022c60..f5ebe52f1 100644 --- a/berkeley-function-call-leaderboard/bfcl/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/constant.py @@ -1,5 +1,15 @@ +from pathlib import Path + +# NOTE: These paths are relative to the `bfcl` directory where this script is located. 
+RESULT_PATH = "../result/" +PROMPT_PATH = "../data/" +POSSIBLE_ANSWER_PATH = "../data/possible_answer/" +SCORE_PATH = "../score/" +DOTENV_PATH = "../.env" + VERSION_PREFIX = "BFCL_v3" +# These are in the PROMPT_PATH TEST_FILE_MAPPING = { "exec_simple": f"{VERSION_PREFIX}_exec_simple.json", "exec_parallel": f"{VERSION_PREFIX}_exec_parallel.json", @@ -164,3 +174,12 @@ "live_relevance", ], } + + +# Construct the full path to use by other scripts +script_dir = Path(__file__).parent +RESULT_PATH = script_dir / RESULT_PATH +PROMPT_PATH = script_dir / PROMPT_PATH +POSSIBLE_ANSWER_PATH = script_dir / POSSIBLE_ANSWER_PATH +SCORE_PATH = script_dir / SCORE_PATH +DOTENV_PATH = script_dir / DOTENV_PATH diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py index ac3c2ef55..c9e18b392 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py @@ -1,5 +1,8 @@ +from pathlib import Path + REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 +# These two files are for the API status sanity check REST_API_GROUND_TRUTH_FILE_PATH = ( "./executable_eval/data/api_status_check_ground_truth_REST.json" ) @@ -7,6 +10,9 @@ "./executable_eval/data/api_status_check_ground_truth_executable.json" ) +# This is the ground truth file for the `rest` test category +REST_EVAL_GROUND_TRUTH_PATH = "./executable_eval/data/rest-eval-response_v5.jsonl" + COLUMNS_NON_LIVE = [ "Rank", "Model", @@ -99,3 +105,9 @@ RED_FONT = "\033[91m" RESET = "\033[0m" + +# Construct the full path for other modules to use +script_dir = Path(__file__).parent +REST_API_GROUND_TRUTH_FILE_PATH = script_dir / REST_API_GROUND_TRUTH_FILE_PATH +EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = script_dir / EXECTUABLE_API_GROUND_TRUTH_FILE_PATH +REST_EVAL_GROUND_TRUTH_PATH = script_dir / REST_EVAL_GROUND_TRUTH_PATH diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py index 1427f13bb..24ce8fc91 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -1,6 +1,16 @@ import argparse -from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING, VERSION_PREFIX +from bfcl._apply_function_credential_config import apply_function_credential_config +from bfcl.constant import ( + DOTENV_PATH, + POSSIBLE_ANSWER_PATH, + PROMPT_PATH, + RESULT_PATH, + SCORE_PATH, + TEST_COLLECTION_MAPPING, + TEST_FILE_MAPPING, + VERSION_PREFIX, +) from bfcl.eval_checker.ast_eval.ast_checker import ast_checker from bfcl.eval_checker.eval_runner_helper import * from bfcl.eval_checker.executable_eval.custom_exception import BadAPIStatusError @@ -16,7 +26,9 @@ from dotenv import load_dotenv from tqdm import tqdm -# NOTE: This file should be run in the `eval_checker` directory +# A dictionary to store the evaluation scores. 
+# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count +LEADERBOARD_TABLE = {} def multi_turn_runner( @@ -124,7 +136,7 @@ def multi_turn_runner( }, ) output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) + output_file_dir = SCORE_PATH / model_name write_list_of_dicts_to_file(output_file_name, result, output_file_dir) return accuracy, len(model_result) @@ -227,7 +239,7 @@ def executable_file_runner(handler, model_result, prompt, model_name, test_categ }, ) output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) + output_file_dir = SCORE_PATH / model_name write_list_of_dicts_to_file(output_file_name, result, output_file_dir) return accuracy, len(model_result) @@ -299,7 +311,7 @@ def relevance_file_runner(handler, model_result, prompt, model_name, test_catego }, ) output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) + output_file_dir = SCORE_PATH / model_name write_list_of_dicts_to_file(output_file_name, result, output_file_dir) return accuracy, len(model_result) @@ -393,12 +405,18 @@ def ast_file_runner( }, ) output_file_name = f"{VERSION_PREFIX}_{test_category}_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) + output_file_dir = SCORE_PATH / model_name write_list_of_dicts_to_file(output_file_name, result, output_file_dir) return accuracy, len(model_result) +if not RESULT_PATH.exists(): + RESULT_PATH.mkdir(parents=True, exist_ok=True) +if not SCORE_PATH.exists(): + SCORE_PATH.mkdir(parents=True, exist_ok=True) + + #### Main runner function #### def runner(model_names, test_categories, api_sanity_check): @@ -408,6 +426,7 @@ def runner(model_names, test_categories, api_sanity_check): API_TESTED = False API_STATUS_ERROR_REST = None API_STATUS_ERROR_EXECUTABLE = None + HAS_REPLACED_API_CREDENTIALS = False # Before running the executable evaluation, we need to get the expected output from the ground truth. # So we need a list of all the test categories that we have ran the ground truth evaluation on. @@ -415,7 +434,7 @@ def runner(model_names, test_categories, api_sanity_check): EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] # Get a list of all entries in the folder - entries = os.scandir(INPUT_PATH) + entries = os.scandir(RESULT_PATH) # Filter out the subdirectories subdirs = [entry.path for entry in entries if entry.is_dir()] @@ -423,7 +442,7 @@ def runner(model_names, test_categories, api_sanity_check): # Traverse each subdirectory for subdir in subdirs: - model_name = subdir.split(INPUT_PATH)[1] + model_name = subdir.split(RESULT_PATH)[1] if model_names is not None and model_name not in model_names: continue @@ -495,6 +514,10 @@ def runner(model_names, test_categories, api_sanity_check): API_TESTED = True + if not HAS_REPLACED_API_CREDENTIALS: + apply_function_credential_config(input_path=PROMPT_PATH) + HAS_REPLACED_API_CREDENTIALS = True + if ( test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN and not is_rest(test_category) @@ -557,9 +580,9 @@ def runner(model_names, test_categories, api_sanity_check): # This function reads all the score files from local folder and updates the leaderboard table. # This is helpful when you only want to run the evaluation for a subset of models and test categories. 
- update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) + update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, SCORE_PATH) # Write the leaderboard table to a file - generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH, model_names, test_categories) + generate_leaderboard_csv(LEADERBOARD_TABLE, SCORE_PATH, model_names, test_categories) # Clean up the executable expected output files # They should be re-generated the next time the evaluation is run @@ -570,23 +593,13 @@ def runner(model_names, test_categories, api_sanity_check): ) print( - f"🏁 Evaluation completed. See {os.path.abspath(OUTPUT_PATH + 'data_overall.csv')} for evaluation results on BFCL V2." + f"🏁 Evaluation completed. See {SCORE_PATH / 'data_overall.csv'} for evaluation results on BFCL V2." ) print( - f"See {os.path.abspath(OUTPUT_PATH + 'data_live.csv')} and {os.path.abspath(OUTPUT_PATH + 'data_non_live.csv')} for evaluation results on BFCL V2 Live and Non-Live categories respectively." + f"See {SCORE_PATH / 'data_live.csv'} and {SCORE_PATH / 'data_non_live.csv'} for evaluation results on BFCL V2 Live and Non-Live categories respectively." ) -INPUT_PATH = "../../result/" -PROMPT_PATH = "../../data/" -POSSIBLE_ANSWER_PATH = "../../data/possible_answer/" -OUTPUT_PATH = "../../score/" - -# A dictionary to store the results -# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count -LEADERBOARD_TABLE = {} - - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Process two lists of strings.") @@ -629,5 +642,5 @@ def runner(model_names, test_categories, api_sanity_check): # We patch it here to avoid confusing the user. model_names.append(model_name.replace("/", "_")) - load_dotenv(dotenv_path="../../.env", verbose=True, override=True) # Load the .env file + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file runner(model_names, test_categories, api_sanity_check) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index e6988a2ce..7ac4c7538 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -3,13 +3,13 @@ import os import re import statistics -import subprocess import numpy as np +from bfcl._apply_function_credential_config import apply_function_credential_config +from bfcl.constant import VERSION_PREFIX from bfcl.eval_checker.constant import * from bfcl.eval_checker.executable_eval.custom_exception import BadAPIStatusError from bfcl.eval_checker.model_metadata import * -from bfcl.constant import VERSION_PREFIX from bfcl.model_handler.handler_map import handler_map from tqdm import tqdm @@ -148,12 +148,7 @@ def api_status_sanity_check_rest(): ground_truth_dummy = load_file(REST_API_GROUND_TRUTH_FILE_PATH) # Use the ground truth data to make sure the API is working correctly - command = f"cd ../.. 
; python apply_function_credential_config.py --input-path ./bfcl/eval_checker/{REST_API_GROUND_TRUTH_FILE_PATH};" - try: - subprocess.run(command, shell=True, capture_output=True, text=True, check=True) - except subprocess.CalledProcessError as e: - write_list_of_dicts_to_file(REST_API_GROUND_TRUTH_FILE_PATH, ground_truth_dummy) - raise RuntimeError(e.stderr) from e + apply_function_credential_config(input_path=REST_API_GROUND_TRUTH_FILE_PATH) ground_truth_replaced = load_file(REST_API_GROUND_TRUTH_FILE_PATH) write_list_of_dicts_to_file(REST_API_GROUND_TRUTH_FILE_PATH, ground_truth_dummy) @@ -620,7 +615,7 @@ def generate_leaderboard_csv( data_non_live.insert(0, COLUMNS_NON_LIVE) - filepath = os.path.join(output_path, "data_non_live.csv") + filepath = output_path / "data_non_live.csv" with open(filepath, "w") as f: for i, row in enumerate(data_non_live): if i < len(data_non_live) - 1: @@ -637,7 +632,7 @@ def generate_leaderboard_csv( data_live.insert(0, COLUMNS_LIVE) - filepath = os.path.join(output_path, "data_live.csv") + filepath = output_path / "data_live.csv" with open(filepath, "w") as f: for i, row in enumerate(data_live): if i < len(data_live) - 1: @@ -660,7 +655,7 @@ def generate_leaderboard_csv( data_combined.insert(0, COLUMNS_OVERALL) - filepath = os.path.join(output_path, "data_overall.csv") + filepath = output_path / "data_overall.csv" with open(filepath, "w") as f: for i, row in enumerate(data_combined): if i < len(data_combined) - 1: diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py index 922d041d4..c178dcefa 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py @@ -1,16 +1,15 @@ -from bfcl.eval_checker.constant import REAL_TIME_MATCH_ALLOWED_DIFFERENCE -from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError - -import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. -import time import json +import time - -#### Constants #### -EVAL_GROUND_TRUTH_PATH = ( - "./executable_eval/data/rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution +import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. 
+from bfcl.eval_checker.constant import ( + REAL_TIME_MATCH_ALLOWED_DIFFERENCE, + REST_EVAL_GROUND_TRUTH_PATH, ) -with open(EVAL_GROUND_TRUTH_PATH, "r") as f: +from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError + +# Load the ground truth data for the `rest` test category +with open(REST_EVAL_GROUND_TRUTH_PATH, "r") as f: EVAL_GROUND_TRUTH = f.readlines() diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py index 421e2e1cb..1e1106709 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py @@ -1,17 +1,16 @@ import json -import os import time +from bfcl.constant import RESULT_PATH, VERSION_PREFIX from bfcl.eval_checker.multi_turn_eval.multi_turn_utils import ( execute_multi_turn_func_call, is_empty_execute_response, ) from bfcl.model_handler.constant import ( - MAXIMUM_ROUND_LIMIT, DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC, DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING, + MAXIMUM_ROUND_LIMIT, ) -from bfcl.constant import VERSION_PREFIX from bfcl.model_handler.model_style import ModelStyle @@ -480,7 +479,8 @@ def decode_execute(self, result): def write(self, result): model_name_dir = self.model_name.replace("/", "_") - os.makedirs(f"./result/{model_name_dir}", exist_ok=True) + model_result_dir = RESULT_PATH / model_name_dir + model_name_dir.mkdir(parents=True, exist_ok=True) if type(result) is dict: result = [result] @@ -488,7 +488,7 @@ def write(self, result): for entry in result: test_category = entry["id"].rsplit("_", 1)[0] file_to_write = f"{VERSION_PREFIX}_{test_category}_result.json" - file_to_write = f"./result/{model_name_dir}/{file_to_write}" + file_to_write = model_result_dir / file_to_write with open(file_to_write, "a+") as f: try: f.write(json.dumps(entry) + "\n") diff --git a/berkeley-function-call-leaderboard/openfunctions_evaluation.py b/berkeley-function-call-leaderboard/openfunctions_evaluation.py index a7b929aa4..f830ce995 100644 --- a/berkeley-function-call-leaderboard/openfunctions_evaluation.py +++ b/berkeley-function-call-leaderboard/openfunctions_evaluation.py @@ -5,7 +5,15 @@ import time from concurrent.futures import ThreadPoolExecutor -from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING +from bfcl._apply_function_credential_config import apply_function_credential_config +from bfcl.constant import ( + DOTENV_PATH, + PROMPT_PATH, + RESULT_PATH, + TEST_COLLECTION_MAPPING, + TEST_FILE_MAPPING, +) +from bfcl.eval_checker.eval_runner_helper import is_executable from bfcl.model_handler.handler_map import handler_map from bfcl.model_handler.model_style import ModelStyle from dotenv import load_dotenv @@ -82,26 +90,19 @@ def parse_test_category_argument(test_category_args): def collect_test_cases(test_filename_total, model_name): model_name_dir = model_name.replace("/", "_") + model_result_dir = RESULT_PATH / model_name_dir + test_cases_total = [] for file_to_open in test_filename_total: test_cases = [] - with open("./data/" + file_to_open) as f: + with open(PROMPT_PATH / file_to_open) as f: for line in f: test_cases.append(json.loads(line)) existing_result = [] - if os.path.exists( - "./result/" - + model_name_dir - + "/" - + file_to_open.replace(".json", "_result.json") - ): - with open( - "./result/" - + model_name_dir - + "/" - + file_to_open.replace(".json", "_result.json") - ) as f: + result_file_path = 
model_result_dir / file_to_open.replace(".json", "_result.json") + if result_file_path.exists(): + with open(result_file_path) as f: for line in f: existing_result.append(json.loads(line)) @@ -194,7 +195,7 @@ def generate_results(args, model_name, test_cases_total): if __name__ == "__main__": - load_dotenv(dotenv_path="./.env", verbose=True, override=True) # Load the .env file + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file args = get_args() @@ -207,6 +208,10 @@ def generate_results(args, model_name, test_cases_total): print(f"Generating results for {args.model} on test category: {test_name_total}.") + # Apply function credential config if any of the test categories are executable + if any([is_executable(category) for category in test_name_total]): + apply_function_credential_config(input_path=PROMPT_PATH) + for model_name in args.model: if ( os.getenv("USE_COHERE_OPTIMIZATION") == "True" diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index b51cea2fa..1a6dbdc87 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "anthropic==0.31.1", "cohere==5.5.8", "google-cloud-aiplatform==1.65.0", + "pathlib", ] [tool.setuptools.packages.find] From 82859eb8e2fdca17c3f249791d4b888f991bee03 Mon Sep 17 00:00:00 2001 From: Huanzhi Mao Date: Mon, 7 Oct 2024 17:52:29 -0700 Subject: [PATCH 3/4] clean up --- .../bfcl/_apply_function_credential_config.py | 16 ++++++------- .../bfcl/constant.py | 13 ++++++---- .../bfcl/eval_checker/constant.py | 6 ++--- .../bfcl/eval_checker/eval_runner.py | 24 ++++--------------- .../bfcl/eval_checker/eval_runner_helper.py | 23 +++++++++--------- .../bfcl/model_handler/base_handler.py | 2 +- 6 files changed, 34 insertions(+), 50 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py index bfe220267..1a5059937 100644 --- a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py +++ b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py @@ -5,15 +5,8 @@ from bfcl.constant import PROMPT_PATH from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError -# Load the actual API keys, and verify that they are present ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY") PLACEHOLDERS = {} -for var in ENV_VARS: - if os.getenv(var) == "": - raise NoAPIKeyError() - - PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var) -print("All API keys are present.") def replace_placeholders(data): @@ -59,14 +52,13 @@ def process_file(input_file_path, output_file_path): f.write(modified_line) if i < len(modified_data) - 1: # Check against the length of modified_data f.write("\n") - print(f"All placeholders have been replaced for {input_file_path} 🦍.") def process_dir(input_dir, output_dir): # This function does not support nested directories # To support nested directories, refer to this commit: # https://github.com/ShishirPatil/gorilla/pull/508/commits/8b1e35590e5bce3bd52a7c6405775b1ce4a64945 - print(f"Input directory: {input_dir}") + # Get a list of all entries in the folder entries = os.scandir(input_dir) @@ -78,6 +70,11 @@ def process_dir(input_dir, output_dir): def apply_function_credential_config(input_path=None, output_path=None): + # Load the actual API keys, 
and verify that they are present + for var in ENV_VARS: + if os.getenv(var) == "": + raise NoAPIKeyError() + PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var) if input_path is None: input_path = PROMPT_PATH @@ -89,3 +86,4 @@ def apply_function_credential_config(input_path=None, output_path=None): process_dir(input_path, output_path) else: process_file(input_path, output_path) + print("All placeholders API keys have been replaced. 🦍") diff --git a/berkeley-function-call-leaderboard/bfcl/constant.py b/berkeley-function-call-leaderboard/bfcl/constant.py index f5ebe52f1..dbeaab0ec 100644 --- a/berkeley-function-call-leaderboard/bfcl/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/constant.py @@ -178,8 +178,11 @@ # Construct the full path to use by other scripts script_dir = Path(__file__).parent -RESULT_PATH = script_dir / RESULT_PATH -PROMPT_PATH = script_dir / PROMPT_PATH -POSSIBLE_ANSWER_PATH = script_dir / POSSIBLE_ANSWER_PATH -SCORE_PATH = script_dir / SCORE_PATH -DOTENV_PATH = script_dir / DOTENV_PATH +RESULT_PATH = (script_dir / RESULT_PATH).resolve() +PROMPT_PATH = (script_dir / PROMPT_PATH).resolve() +POSSIBLE_ANSWER_PATH = (script_dir / POSSIBLE_ANSWER_PATH).resolve() +SCORE_PATH = (script_dir / SCORE_PATH).resolve() +DOTENV_PATH = (script_dir / DOTENV_PATH).resolve() + +RESULT_PATH.mkdir(parents=True, exist_ok=True) +SCORE_PATH.mkdir(parents=True, exist_ok=True) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py index c9e18b392..f5d6524fd 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/constant.py @@ -108,6 +108,6 @@ # Construct the full path for other modules to use script_dir = Path(__file__).parent -REST_API_GROUND_TRUTH_FILE_PATH = script_dir / REST_API_GROUND_TRUTH_FILE_PATH -EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = script_dir / EXECTUABLE_API_GROUND_TRUTH_FILE_PATH -REST_EVAL_GROUND_TRUTH_PATH = script_dir / REST_EVAL_GROUND_TRUTH_PATH +REST_API_GROUND_TRUTH_FILE_PATH = (script_dir / REST_API_GROUND_TRUTH_FILE_PATH).resolve() +EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = (script_dir / EXECTUABLE_API_GROUND_TRUTH_FILE_PATH).resolve() +REST_EVAL_GROUND_TRUTH_PATH = (script_dir / REST_EVAL_GROUND_TRUTH_PATH).resolve() diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py index 24ce8fc91..8001fdfa6 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -1,6 +1,5 @@ import argparse -from bfcl._apply_function_credential_config import apply_function_credential_config from bfcl.constant import ( DOTENV_PATH, POSSIBLE_ANSWER_PATH, @@ -411,12 +410,6 @@ def ast_file_runner( return accuracy, len(model_result) -if not RESULT_PATH.exists(): - RESULT_PATH.mkdir(parents=True, exist_ok=True) -if not SCORE_PATH.exists(): - SCORE_PATH.mkdir(parents=True, exist_ok=True) - - #### Main runner function #### def runner(model_names, test_categories, api_sanity_check): @@ -426,7 +419,6 @@ def runner(model_names, test_categories, api_sanity_check): API_TESTED = False API_STATUS_ERROR_REST = None API_STATUS_ERROR_EXECUTABLE = None - HAS_REPLACED_API_CREDENTIALS = False # Before running the executable evaluation, we need to get the expected output from the ground truth. 
# So we need a list of all the test categories that we have ran the ground truth evaluation on. @@ -434,28 +426,24 @@ def runner(model_names, test_categories, api_sanity_check): EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] # Get a list of all entries in the folder - entries = os.scandir(RESULT_PATH) + entries = RESULT_PATH.iterdir() # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] + subdirs = [entry for entry in entries if entry.is_dir()] # Traverse each subdirectory for subdir in subdirs: - model_name = subdir.split(RESULT_PATH)[1] + model_name = subdir.relative_to(RESULT_PATH).name if model_names is not None and model_name not in model_names: continue model_name_escaped = model_name.replace("_", "/") - # Pattern to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - print(f"🦍 Model: {model_name}") # Find and process all JSON files in the subdirectory - for model_result_json in glob.glob(json_files_pattern): - + for model_result_json in subdir.glob("*.json"): test_category = extract_test_category(model_result_json) if test_categories is not None and test_category not in test_categories: continue @@ -514,10 +502,6 @@ def runner(model_names, test_categories, api_sanity_check): API_TESTED = True - if not HAS_REPLACED_API_CREDENTIALS: - apply_function_credential_config(input_path=PROMPT_PATH) - HAS_REPLACED_API_CREDENTIALS = True - if ( test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN and not is_rest(test_category) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index 7ac4c7538..77cde32f0 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -1,8 +1,9 @@ -import glob import json import os import re import statistics +from pathlib import Path +from typing import Union import numpy as np from bfcl._apply_function_credential_config import apply_function_credential_config @@ -14,7 +15,8 @@ from tqdm import tqdm -def extract_test_category(input_string): +def extract_test_category(input_string: Union[str, Path]) -> str: + input_string = str(input_string) pattern = fr".*{VERSION_PREFIX}_(\w+?)(?:_score|_result)?\.json" match = re.search(pattern, input_string) @@ -25,9 +27,8 @@ def extract_test_category(input_string): raise ValueError(f"Could not extract the test category from the input string: {input_string}") -def find_file_with_suffix(folder_path, suffix): - json_files_pattern = os.path.join(folder_path, "*.json") - for json_file in glob.glob(json_files_pattern): +def find_file_with_suffix(folder_path: Path, suffix: str) -> Path: + for json_file in folder_path.glob("*.json"): if extract_test_category(json_file) == suffix: return json_file raise FileNotFoundError(f"No JSON file found with suffix: {suffix}") @@ -789,20 +790,18 @@ def check_all_category_present(category_status, eval_models=None, eval_categorie return found_issues -def update_leaderboard_table_with_score_file(leaderboard_table, score_path): +def update_leaderboard_table_with_score_file(leaderboard_table, score_path: Path) -> None: - entries = os.scandir(score_path) + entries = score_path.iterdir() # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] + subdirs = [entry for entry in entries if entry.is_dir()] # Traverse each subdirectory for subdir in subdirs: - # Pattern 
to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - model_name = subdir.split(score_path)[1] + model_name = subdir.relative_to(score_path).name # Find and process all JSON files in the subdirectory - for model_score_json in glob.glob(json_files_pattern): + for model_score_json in subdir.glob("*.json"): metadata = load_file(model_score_json)[0] accuracy, total_count = metadata["accuracy"], metadata["total_count"] test_category = extract_test_category(model_score_json) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py index 1e1106709..7a19c26c0 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/base_handler.py @@ -480,7 +480,7 @@ def decode_execute(self, result): def write(self, result): model_name_dir = self.model_name.replace("/", "_") model_result_dir = RESULT_PATH / model_name_dir - model_name_dir.mkdir(parents=True, exist_ok=True) + model_result_dir.mkdir(parents=True, exist_ok=True) if type(result) is dict: result = [result] From 71769b116431f530f4226e6c05b7e91d485497b4 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Mon, 7 Oct 2024 21:43:27 -0700 Subject: [PATCH 4/4] better error message --- .../bfcl/_apply_function_credential_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py index 1a5059937..0cc282144 100644 --- a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py +++ b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py @@ -72,7 +72,7 @@ def process_dir(input_dir, output_dir): def apply_function_credential_config(input_path=None, output_path=None): # Load the actual API keys, and verify that they are present for var in ENV_VARS: - if os.getenv(var) == "": + if var not in os.environ or not os.getenv(var): raise NoAPIKeyError() PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var)
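
Taken together, this series turns the old standalone `apply_function_credential_config.py` CLI into an importable helper and replaces the hard-coded relative paths with `pathlib`-based constants. As a rough illustration only — a sketch assembled from the signatures shown in the diffs above, not code copied from the repository — the credential substitution can now be driven programmatically like this:

```python
# Sketch of the post-refactor flow; assumes the bfcl package layout introduced above.
from dotenv import load_dotenv

from bfcl._apply_function_credential_config import apply_function_credential_config
from bfcl.constant import DOTENV_PATH

# Load GEOCODE_API_KEY, RAPID_API_KEY, OMDB_API_KEY and EXCHANGERATE_API_KEY from the
# .env file; the helper raises NoAPIKeyError if any of them is missing or empty.
load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True)

# With no arguments, input_path defaults to PROMPT_PATH (the data/ directory) and
# output_path defaults to input_path, so "YOUR-GEOCODE-API-KEY"-style placeholders in
# every dataset file are replaced in place.
apply_function_credential_config()
```

This mirrors the call that `openfunctions_evaluation.py` now makes whenever an executable test category is selected, which is why the README no longer asks users to run the credential script by hand.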