diff --git a/.gitignore b/.gitignore index 02096f5be..e6c74d18b 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,4 @@ berkeley-function-call-leaderboard/score/ .direnv/ .venv +.cache \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/.env.example b/berkeley-function-call-leaderboard/.env.example new file mode 100644 index 000000000..b1b0a437f --- /dev/null +++ b/berkeley-function-call-leaderboard/.env.example @@ -0,0 +1,23 @@ +# [OPTIONAL] Required for downloading gated hugging face models +HUGGING_FACE_HUB_TOKEN= + +# [OPTIONAL] Required for LLM generation step +# Provide the API key for the model(s) you intend to use +OPENAI_API_KEY=sk-XXXXXX +MISTRAL_API_KEY= +FIREWORKS_API_KEY= +ANTHROPIC_API_KEY= +NVIDIA_API_KEY=nvapi-XXXXXX +GEMINI_GCP_PROJECT_ID= + +COHERE_API_KEY= +USE_COHERE_OPTIMIZATION=False # True/False + +DATABRICKS_API_KEY= +DATABRICKS_AZURE_ENDPOINT_URL= + +# [OPTIONAL] Required for evaluation of `executable` test group +RAPID_API_KEY= +EXCHANGERATE_API_KEY= +OMDB_API_KEY= +GEOCODE_API_KEY= diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md new file mode 100644 index 000000000..7479358e4 --- /dev/null +++ b/berkeley-function-call-leaderboard/CHANGELOG.md @@ -0,0 +1,42 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +* [July 3, 2024] [#489](https://github.com/ShishirPatil/gorilla/pull/489): Add new model `nvidia/nemotron-4-340b-instruct` to the leaderboard. +* [June 18, 2024] [#470](https://github.com/ShishirPatil/gorilla/pull/470): Add new model `firefunction-v2-FC` to the leaderboard. +* [June 15, 2024] [#437](https://github.com/ShishirPatil/gorilla/pull/437): Fix prompting issues for `Nexusflow-Raven-v2 (FC)`. +* [June 7, 2024] [#407](https://github.com/ShishirPatil/gorilla/pull/407), [#462](https://github.com/ShishirPatil/gorilla/pull/462): Update the AST evaluation logic to allow the use of `int` values for Python parameters expecting `float` values. This is to accommodate the Python auto-conversion feature from `int` to `float`. +* [May 14, 2024] [#426](https://github.com/ShishirPatil/gorilla/pull/426): + - Add the following new models to the leaderboard: + + `gpt-4o-2024-05-13` + + `gpt-4o-2024-05-13-FC` + + `gemini-1.5-pro-preview-0514` + + `gemini-1.5-flash-preview-0514` + - Update price for the following models: + + All Gemini Series + + `Claude-2.1 (Prompt)` and `Claude-instant-1.2 (Prompt)` + + `Mistral-large` and `Mistral-Small` + + `GPT-3.5-Turbo-0125` +* [May 8, 2024] [#406](https://github.com/ShishirPatil/gorilla/pull/406) and [#421](https://github.com/ShishirPatil/gorilla/pull/421): Update the `gemini_handler.py` to better handle parallel function calls for Gemini models. +* [May 6, 2024] [#412](https://github.com/ShishirPatil/gorilla/pull/412): Bug fix in evaluation dataset for AST categories. This includes updates to both prompts and function docs. +* [May 2, 2024] [#405](https://github.com/ShishirPatil/gorilla/pull/405): Bug fix in the possible answers for the AST Simple evaluation dataset. Prompt and function docs are not affected. +* [April 28, 2024] [#397](https://github.com/ShishirPatil/gorilla/pull/397): Add new model `snowflake/arctic` to the leaderboard. Note that there are multiple ways to inference the model, and we choose to do it via Nvidia API catalog. 
+* [April 27, 2024] [#390](https://github.com/ShishirPatil/gorilla/pull/390): Bug fix in cost and latency calculation for open-source models, which are now all calculated when serving the model with [vLLM](https://github.com/vllm-project/vllm) using 8 V100 GPUs for consistency. $$\text{Cost} = \text{Latency per 1000 function call} * (\text{8xV100 azure-pay-as-you-go-price per hour / 3600})$$ +* [April 25, 2024] [#386](https://github.com/ShishirPatil/gorilla/pull/386): Add 5 new models to the leaderboard: `meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `gemini-1.5-pro-preview-0409`, `command-r-plus`, `command-r-plus-FC`. +* [April 19, 2024] [#377](https://github.com/ShishirPatil/gorilla/pull/377): + - Bug fix for the evaluation dataset in the executable test categories. This includes updates to both prompts and function docs. + - The `evaluation_result` field has been removed to accommodate the variability in API execution results across different evaluation runs. Instead, a human-verified `ground_truth` is now included for the executable test categories. During each evaluation run, `evaluation_result` is generated anew using the `ground_truth`, and then compared against the model output. + - A stricter metric has been adopted when using the `structural_match` (aka. type match) evaluation criteria ---- For `list` results, the lengths are compared; for `dict` results, the keys are matched. This is to account for the fast-changing nature of some of the real-time API results while ensuring the evaluation remains meaningful. + - Added another evaluation criteria `real_time_match` for the executable category, which is a looser form of `exact_match` specifically for numerical execution results. The execution result must be within a certain percentage threshold (20%) from the expected result to accommodate the live updates of API responses. User can change this threshold value in `eval_checker_constant.py`. +* [April 18, 2024] [#375](https://github.com/ShishirPatil/gorilla/pull/375): A more comprehensive API sanity check is included; the APIs that are invoked during the non-REST executable evaluation process will also be checked for their availability before running the evaluation. Also, add support for the shortcut `-s` for the `--skip-api-sanity-check` flag, based on the community feedback. +* [April 16, 2024] [#366](https://github.com/ShishirPatil/gorilla/pull/366): Switch to use Anthropic's new Tool Use Beta `tools-2024-04-04` when generating Claude 3 FC series data. `gpt-4-turbo-2024-04-09` and `gpt-4-turbo-2024-04-09-FC` are also added to the leaderboard. +* [April 11, 2024] [#347](https://github.com/ShishirPatil/gorilla/pull/347): Add the 95th percentile latency to the leaderboard statistics. This metric is useful for understanding the latency distribution of the models, especially the worst-case scenario. +* [April 10, 2024] [#339](https://github.com/ShishirPatil/gorilla/pull/339): Introduce REST API sanity check for the REST executable test category. It ensures that all the API endpoints involved during the execution evaluation process are working properly. If any of them are not behaving as expected, the evaluation process will be stopped by default as the result will be inaccurate. Users can choose to bypass this check by setting the `--skip-api-sanity-check` flag or `-s` for short. +* [April 9, 2024] [#338](https://github.com/ShishirPatil/gorilla/pull/338): Bug fix in the evaluation datasets (including both prompts and function docs). 
Bug fix for possible answers as well. +* [April 8, 2024] [#330](https://github.com/ShishirPatil/gorilla/pull/330): Fixed an oversight that was introduced in [#299](https://github.com/ShishirPatil/gorilla/pull/299). For function-calling (FC) models that cannot take `float` type in input, when the parameter type is a `float`, the evaluation procedure will convert that type to `number` in the model input and mention in the parameter description that `This is a float type value.`. An additional field `format: float` will also be included in the model input to make it clear about the type. Updated the model handler for Claude, Mistral, and OSS to better parse the model output. +* [April 8, 2024] [#327](https://github.com/ShishirPatil/gorilla/pull/327): Add new model `NousResearch/Hermes-2-Pro-Mistral-7B` to the leaderboard. +* [April 3, 2024] [#309](https://github.com/ShishirPatil/gorilla/pull/309): Bug fix for evaluation dataset possible answers. Implement **string standardization** for the AST evaluation pipeline, i.e. removing white spaces and a subset of punctuations (`,./-_*^`) to make the AST evaluation more robust and accurate. Fixed AST evaluation issue for type `tuple`. Add 2 new models `meetkai/functionary-small-v2.4 (FC)`, `meetkai/functionary-medium-v2.4 (FC)` to the leaderboard. +* [April 1, 2024] [#299](https://github.com/ShishirPatil/gorilla/pull/299): Leaderboard update with new models (`Claude-3-Haiku`, `Databrick-DBRX-Instruct`), more advanced AST evaluation procedure, and updated evaluation datasets. Cost and latency statistics during evaluation are also measured. We also released the manual that our evaluation procedure is based on, available [here](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#metrics). +* [Mar 11, 2024] [#254](https://github.com/ShishirPatil/gorilla/pull/254): Leaderboard update with 3 new models: `Claude-3-Opus-20240229 (Prompt)`, `Claude-3-Sonnet-20240229 (Prompt)`, and `meetkai/functionary-medium-v2.2 (FC)` +* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `gemma`. +* [Feb 29, 2024] [#223](https://github.com/ShishirPatil/gorilla/pull/223): modifications to REST evaluation. diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md index 2a3c78b7a..812d4f34b 100644 --- a/berkeley-function-call-leaderboard/README.md +++ b/berkeley-function-call-leaderboard/README.md @@ -12,9 +12,8 @@ We present Berkeley Function Leaderboard, the **first comprehensive and executab Read more about the technical details and interesting insights in our blog post! ![image](./architecture_diagram.png) -### Install Dependencies -Before generating the leaderboard statistics, you should install dependencies using the following command: +### Install Dependencies ```bash conda create -n BFCL python=3.10 @@ -22,25 +21,6 @@ conda activate BFCL pip install -r requirements.txt # Inside ./berkeley-function-call-leaderboard pip install vllm # If you have vLLM supported GPU(s) and want to run our evaluation data against self-hosted OSS models. ``` -If you plan to evaluate on OSS models, we are using vLLM for inference and refer to https://github.com/vllm-project/vllm for detail. We recommend to inference on at least V100s, A100s, and latest GPUs that are supported by vLLM. 
- -### Checker Setup (required for Java, JavaScript test categories) -We use `tree-sitter` to do the AST parsing for Java and JavaScript test categories. Thus, you need to install `tree-sitter`. - -The git clones need to be under the `/berkeley-function-call-leaderboard/eval_checker` folder. - -```bash -cd ./eval_checker -git clone https://github.com/tree-sitter/tree-sitter-java.git -git clone https://github.com/tree-sitter/tree-sitter-javascript.git -``` - -Now, move back to `/berkeley-function-call-leaderboard` by `cd ..`, and create two symbolic links to the `tree-sitter-java` and `tree-sitter-javascript` directories. This is required to run `openfunctions_evaluation.py`. - -``` -ln -s eval_checker/tree-sitter-java tree-sitter-java -ln -s eval_checker/tree-sitter-javascript tree-sitter-javascript -``` ## Prepare Evaluation Dataset @@ -50,14 +30,12 @@ To download the evaluation dataset from huggingface, from the current directory huggingface-cli download gorilla-llm/Berkeley-Function-Calling-Leaderboard --local-dir ./data --repo-type dataset ``` - This will download our dataset to `data` repository. ## Evaluation Dataset The evaluation datasets are now stored in the `./data` folder. The possible answers are stored in the `./data/possible_answer` folder. - ## Execution Evaluation Data Post-processing Input your API keys into `function_credential_config.json`, so that the original placeholder values in questions, params, and answers will be cleaned. @@ -233,50 +211,7 @@ For Mistral large and small models, we provide evaluation on both of their `Any` For inferencing `Gemini-1.0-pro`, you need to fill in `model_handler/gemini_handler.py` with your GCP project ID that has access to Vertex AI endpoint. -For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure workspace and setup an endpoint for inference. - - -## Changelog - -* [July 3, 2024] [#489](https://github.com/ShishirPatil/gorilla/pull/489): Add new model `nvidia/nemotron-4-340b-instruct` to the leaderboard. -* [June 18, 2024] [#470](https://github.com/ShishirPatil/gorilla/pull/470): Add new model `firefunction-v2-FC` to the leaderboard. -* [June 15, 2024] [#437](https://github.com/ShishirPatil/gorilla/pull/437): Fix prompting issues for `Nexusflow-Raven-v2 (FC)`. -* [June 7, 2024] [#407](https://github.com/ShishirPatil/gorilla/pull/407), [#462](https://github.com/ShishirPatil/gorilla/pull/462): Update the AST evaluation logic to allow the use of `int` values for Python parameters expecting `float` values. This is to accommodate the Python auto-conversion feature from `int` to `float`. -* [May 14, 2024] [#426](https://github.com/ShishirPatil/gorilla/pull/426): - - Add the following new models to the leaderboard: - + `gpt-4o-2024-05-13` - + `gpt-4o-2024-05-13-FC` - + `gemini-1.5-pro-preview-0514` - + `gemini-1.5-flash-preview-0514` - - Update price for the following models: - + All Gemini Series - + `Claude-2.1 (Prompt)` and `Claude-instant-1.2 (Prompt)` - + `Mistral-large` and `Mistral-Small` - + `GPT-3.5-Turbo-0125` -* [May 8, 2024] [#406](https://github.com/ShishirPatil/gorilla/pull/406) and [#421](https://github.com/ShishirPatil/gorilla/pull/421): Update the `gemini_handler.py` to better handle parallel function calls for Gemini models. -* [May 6, 2024] [#412](https://github.com/ShishirPatil/gorilla/pull/412): Bug fix in evaluation dataset for AST categories. This includes updates to both prompts and function docs. 
-* [May 2, 2024] [#405](https://github.com/ShishirPatil/gorilla/pull/405): Bug fix in the possible answers for the AST Simple evaluation dataset. Prompt and function docs are not affected. -* [April 28, 2024] [#397](https://github.com/ShishirPatil/gorilla/pull/397): Add new model `snowflake/arctic` to the leaderboard. Note that there are multiple ways to inference the model, and we choose to do it via Nvidia API catalog. -* [April 27, 2024] [#390](https://github.com/ShishirPatil/gorilla/pull/390): Bug fix in cost and latency calculation for open-source models, which are now all calculated when serving the model with [vLLM](https://github.com/vllm-project/vllm) using 8 V100 GPUs for consistency. $$\text{Cost} = \text{Latency per 1000 function call} * (\text{8xV100 azure-pay-as-you-go-price per hour / 3600})$$ -* [April 25, 2024] [#386](https://github.com/ShishirPatil/gorilla/pull/386): Add 5 new models to the leaderboard: `meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `gemini-1.5-pro-preview-0409`, `command-r-plus`, `command-r-plus-FC`. -* [April 19, 2024] [#377](https://github.com/ShishirPatil/gorilla/pull/377): - - Bug fix for the evaluation dataset in the executable test categories. This includes updates to both prompts and function docs. - - The `evaluation_result` field has been removed to accommodate the variability in API execution results across different evaluation runs. Instead, a human-verified `ground_truth` is now included for the executable test categories. During each evaluation run, `evaluation_result` is generated anew using the `ground_truth`, and then compared against the model output. - - A stricter metric has been adopted when using the `structural_match` (aka. type match) evaluation criteria ---- For `list` results, the lengths are compared; for `dict` results, the keys are matched. This is to account for the fast-changing nature of some of the real-time API results while ensuring the evaluation remains meaningful. - - Added another evaluation criteria `real_time_match` for the executable category, which is a looser form of `exact_match` specifically for numerical execution results. The execution result must be within a certain percentage threshold (20%) from the expected result to accommodate the live updates of API responses. User can change this threshold value in `eval_checker_constant.py`. -* [April 18, 2024] [#375](https://github.com/ShishirPatil/gorilla/pull/375): A more comprehensive API sanity check is included; the APIs that are invoked during the non-REST executable evaluation process will also be checked for their availability before running the evaluation. Also, add support for the shortcut `-s` for the `--skip-api-sanity-check` flag, based on the community feedback. -* [April 16, 2024] [#366](https://github.com/ShishirPatil/gorilla/pull/366): Switch to use Anthropic's new Tool Use Beta `tools-2024-04-04` when generating Claude 3 FC series data. `gpt-4-turbo-2024-04-09` and `gpt-4-turbo-2024-04-09-FC` are also added to the leaderboard. -* [April 11, 2024] [#347](https://github.com/ShishirPatil/gorilla/pull/347): Add the 95th percentile latency to the leaderboard statistics. This metric is useful for understanding the latency distribution of the models, especially the worst-case scenario. -* [April 10, 2024] [#339](https://github.com/ShishirPatil/gorilla/pull/339): Introduce REST API sanity check for the REST executable test category. 
It ensures that all the API endpoints involved during the execution evaluation process are working properly. If any of them are not behaving as expected, the evaluation process will be stopped by default as the result will be inaccurate. Users can choose to bypass this check by setting the `--skip-api-sanity-check` flag or `-s` for short. -* [April 9, 2024] [#338](https://github.com/ShishirPatil/gorilla/pull/338): Bug fix in the evaluation datasets (including both prompts and function docs). Bug fix for possible answers as well. -* [April 8, 2024] [#330](https://github.com/ShishirPatil/gorilla/pull/330): Fixed an oversight that was introduced in [#299](https://github.com/ShishirPatil/gorilla/pull/299). For function-calling (FC) models that cannot take `float` type in input, when the parameter type is a `float`, the evaluation procedure will convert that type to `number` in the model input and mention in the parameter description that `This is a float type value.`. An additional field `format: float` will also be included in the model input to make it clear about the type. Updated the model handler for Claude, Mistral, and OSS to better parse the model output. -* [April 8, 2024] [#327](https://github.com/ShishirPatil/gorilla/pull/327): Add new model `NousResearch/Hermes-2-Pro-Mistral-7B` to the leaderboard. -* [April 3, 2024] [#309](https://github.com/ShishirPatil/gorilla/pull/309): Bug fix for evaluation dataset possible answers. Implement **string standardization** for the AST evaluation pipeline, i.e. removing white spaces and a subset of punctuations (`,./-_*^`) to make the AST evaluation more robust and accurate. Fixed AST evaluation issue for type `tuple`. Add 2 new models `meetkai/functionary-small-v2.4 (FC)`, `meetkai/functionary-medium-v2.4 (FC)` to the leaderboard. -* [April 1, 2024] [#299](https://github.com/ShishirPatil/gorilla/pull/299): Leaderboard update with new models (`Claude-3-Haiku`, `Databrick-DBRX-Instruct`), more advanced AST evaluation procedure, and updated evaluation datasets. Cost and latency statistics during evaluation are also measured. We also released the manual that our evaluation procedure is based on, available [here](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#metrics). -* [Mar 11, 2024] [#254](https://github.com/ShishirPatil/gorilla/pull/254): Leaderboard update with 3 new models: `Claude-3-Opus-20240229 (Prompt)`, `Claude-3-Sonnet-20240229 (Prompt)`, and `meetkai/functionary-medium-v2.2 (FC)` -* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `gemma`. -* [Feb 29, 2024] [#223](https://github.com/ShishirPatil/gorilla/pull/223): modifications to REST evaluation. - +For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure workspace and setup an endpoint for inference. ## Contributing @@ -296,8 +231,6 @@ To add a new model to the Function Calling Leaderboard, here are a few things yo 5. Raise a [Pull Request](https://github.com/ShishirPatil/gorilla/pulls) with your new Model Handler. We will run the model handler if an endpoint is established. If self-hosting is required and the model size is large, we might not be able to accommodate model hosting therefore an OpenAI compatible endpoint for evaluation is desired. 6. 
Feel free to join [Gorilla Discord](https://discord.gg/grXXvj9Whz) `#leaderboard` and reach out to us for any questions or concerns about adding new models. We are happy to help you! - All the leaderboard statistics and data used to train the models are released under Apache 2.0. Gorilla is an open source effort from UC Berkeley and we welcome contributors. Please email us your comments, criticisms, and questions. More information about the project can be found at [https://gorilla.cs.berkeley.edu/](https://gorilla.cs.berkeley.edu/) - diff --git a/berkeley-function-call-leaderboard/_README.md b/berkeley-function-call-leaderboard/_README.md new file mode 100644 index 000000000..f9209f480 --- /dev/null +++ b/berkeley-function-call-leaderboard/_README.md @@ -0,0 +1,177 @@ +
+# Berkeley Function Calling Leaderboard (BFCL)
+
+🤗 Dataset • 🏆 Leaderboard • 📰 Blog
+ + +## Introduction +We present Berkeley Function Leaderboard, the **first comprehensive and executable function calling evaluation for LLMs function calling**. Different from prior function calling evaluations (e.g. Anyscale function calling blog), we consider function callings of various forms, different function calling scenarios, and the executability of function calls. We also release our model [gorilla-openfunctions-v2](https://huggingface.co/gorilla-llm/gorilla-openfunctions-v2), the best open-source models so far to handle multiple languages of function calls, parallel function calls and multiple function calls. We also provide a specific debugging feature that when the provided function is not suitable for your task, the model will output an “Error Message”. + +Read more about the technical details and interesting insights in our blog post! + +![image](./architecture_diagram.png) + +## Get started + +Create a `.env` file similar to the [.env.example](.env.example) file, and fill out the values for the variables you wish to use for either open-source or proprietary LLM generation and evaluation. + +### 🚀 Installation + +> [!Tip] +> Ensure that you are using the latest versions of `setuptools`, `wheel`, and `pip` to avoid any installation issues. Run: +> ```bash +> pip install --upgrade setuptools wheel pip +> ``` + +To install the `bfcl` package from the GitHub repository, run: +```bash +$ git clone https://github.com/ShishirPatil/gorilla +$ cd berkeley-function-call-leaderboard +$ pip install -e . +``` + +Extras dependencies can be installed via: +```bash +pip install -e ".[NAME]" +``` +| Name | Use | +|-------------------|----------------------------------------------------------| +| oss_eval | For LLM generation and evaluation using open source models | +| proprietary_eval | For LLM generation and evaluation using proprietary models | +| all | Loads all extras (not recommended) | + +#### OSS eval + +We use [vllm](https://docs.vllm.ai/en/latest/index.html) to perform offline LLM inference. Installation of [vllm](https://docs.vllm.ai/en/latest/getting_started/installation.html#requirements) requires installing a CUDA-compatible PyTorch version. You can run the following command: +```bash +# Replace the CUDA version "cu118" according to your system. +# See available CUDA versions at https://pytorch.org/get-started/locally/ +# bfcl currently uses `v0.5.1` of vllm and it requires torch `v2.3.0` +$ pip install torch==2.3.0 --index-url https://download.pytorch.org/whl/cu118 +$ pip install -e ".[oss_eval]" +``` + +#### Proprietary eval + +To install dependencies for proprietary model evaluation, run: +```bash +pip install -e ".[proprietary_eval]" +``` + +## User Guide + +A comprehensive user guide detailing the full list of supported arguments is available [here](./bfcl/cli.py) and can also be accessed on the terminal by calling: +```bash +bfcl -h +``` +```text +usage: bfcl [-h] {llm_generation,evaluation} ... + +Berkeley Function Calling Leaderboard (BFCL) + +positional arguments: + {llm_generation,evaluation} + Sub-command to run + llm_generation Collect LLM responses + evaluation Run evaluation + +options: + -h, --help show this help message and exit +``` + +### LLM Generation + +To view the full list of arguments for the LLM generation sub-command, call: +```bash +bfcl llm_generation -h +``` + +#### Open Source Models + +To perform generation on an open-weights model (e.g. 
[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)) for the `ast` test group, use the following command: +```bash +bfcl llm_generation \ + --model google/gemma-7b-it \ + --model_type oss \ + --test-group ast +``` + +To provide sampling parameters, you can use: +```bash +bfcl llm_generation \ + --model google/gemma-7b-it \ + --model_type oss \ + --test-group ast \ + --temperature 0.7 \ + --top-p 1 \ + --max-tokens 1000 +``` + +To specify multiple test categories: +```bash +bfcl llm_generation \ + --model google/gemma-7b-it \ + --model_type oss \ + --test-categories rest,java,javascript +``` + +#### Proprietary Models + +To perform generation on a proprietary/hosted model (e.g. [gorilla-openfunctions-v2](https://huggingface.co/gorilla-llm/gorilla-openfunctions-v2)) for the `executable` test group, use: +```bash +bfcl llm_generation \ + --model gorilla-openfunctions-v2 \ + --model_type proprietary \ + --test-group executable +``` + +To specify multiple test categories: +```bash +bfcl llm_generation \ + --model gorilla-openfunctions-v2 \ + --model_type proprietary \ + --test-categories relevance,multiple_function,parallel_function +``` + +### Evaluation + +To view the full list of arguments for the evaluation sub-command, call: +```bash +bfcl evaluation -h +``` + +To perform evaluation of a proprietary/hosted model (e.g. [gorilla-openfunctions-v2](https://huggingface.co/gorilla-llm/gorilla-openfunctions-v2)) on all the test categories, use: +```bash +bfcl evaluation \ + --model gorilla-openfunctions-v2 \ + --model_type proprietary \ + --test-group all +``` + +#### Executable Test Category Evaluation + +To run the executable test categories, you need to provide the following API keys in the `.env` file: +```ini +RAPID_API_KEY= +EXCHANGERATE_API_KEY= +OMDB_API_KEY= +GEOCODE_API_KEY= +``` +You can use the following links to obtain the API keys: +1. Rapid API: https://rapidapi.com/hub + * Yahoo Finance: https://rapidapi.com/sparior/api/yahoo-finance15 + * Real Time Amazon Data : https://rapidapi.com/letscrape-6bRBa3QguO5/api/real-time-amazon-data + * Urban Dictionary: https://rapidapi.com/community/api/urban-dictionary + * Covid 19: https://rapidapi.com/api-sports/api/covid-193 + * Time zone by Location: https://rapidapi.com/BertoldVdb/api/timezone-by-location + + All the Rapid APIs we use have free tier usage. As a result, you need to subscribe to those API providers in order to have the executable test environment setup but it will be free of charge! +2. ExchangeRate API: https://www.exchangerate-api.com +3. OMDB API: http://www.omdbapi.com/apikey.aspx +4. 
Geocode API: https://geocode.maps.co/ \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/apply_function_credential_config.py b/berkeley-function-call-leaderboard/apply_function_credential_config.py deleted file mode 100644 index 7b6124896..000000000 --- a/berkeley-function-call-leaderboard/apply_function_credential_config.py +++ /dev/null @@ -1,77 +0,0 @@ -import json -import argparse - - -parser = argparse.ArgumentParser(description="Replace placeholders in the function credential config file.") -parser.add_argument("--input-file", help="Path to the function credential config file.", required=True) -parser.add_argument("--output-file", help="Path to the output file.", default="") -args = parser.parse_args() - -# Load the configuration with actual API keys -with open("function_credential_config.json") as f: - function_credential_config = json.load(f) - -PLACEHOLDERS = { - "YOUR-GEOCODE-API-KEY": function_credential_config[3]["GEOCODE-API-KEY"], - "YOUR-RAPID-API-KEY": function_credential_config[0]["RAPID-API-KEY"], - "YOUR-OMDB-API-KEY": function_credential_config[2]["OMDB-API-KEY"], - "YOUR-EXCHANGERATE-API-KEY": function_credential_config[1]["EXCHANGERATE-API-KEY"] -} - - -def replace_placeholders(data): - """ - Recursively replace placeholders in a nested dictionary or list using string.replace. - """ - if isinstance(data, dict): - for key, value in data.items(): - if isinstance(value, (dict, list)): - replace_placeholders(value) - elif isinstance(value, str): - for placeholder, actual_value in PLACEHOLDERS.items(): - if placeholder in value: # Check if placeholder is in the string - data[key] = value.replace(placeholder, actual_value) - elif isinstance(data, list): - for idx, item in enumerate(data): - if isinstance(item, (dict, list)): - replace_placeholders(item) - elif isinstance(item, str): - for placeholder, actual_value in PLACEHOLDERS.items(): - if placeholder in item: # Check if placeholder is in the string - data[idx] = item.replace(placeholder, actual_value) - return data - -def main(): - # Verify all values are provided - for key, value in PLACEHOLDERS.items(): - if value == "": - print(f"Please provide a value for the placeholder {key}.") - return - print("All API keys are present.") - - modified_data = [] - with open(f"{args.input_file}", 'r') as f: - lines = f.readlines() - for line in lines: - try: - data = json.loads(line) # Parse each line as a JSON object - data = replace_placeholders(data) # Replace placeholders - modified_data.append(json.dumps(data)) # Convert back to string and store - except json.JSONDecodeError: - # Handle the case where a line is not a valid JSON object - print("Invalid JSON line skipped.") - continue - - if args.output_file == "": - with open(f"{args.input_file}", 'w') as f: - for modified_line in modified_data: - f.write(modified_line + '\n') # Write each modified JSON object back to the input file - print(f"All placeholders have been replaced in {args.input_file} 🦍.") - else: - with open(f"{args.output_file}", 'w') as f: - for modified_line in modified_data: - f.write(modified_line + '\n') # Write each modified JSON object overwrite the output file - print(f"All placeholders have been replaced in {args.output_file} 🦍.") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/__init__.py b/berkeley-function-call-leaderboard/bfcl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/berkeley-function-call-leaderboard/bfcl/cli.py 
b/berkeley-function-call-leaderboard/bfcl/cli.py new file mode 100644 index 000000000..5dca55708 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/cli.py @@ -0,0 +1,148 @@ +import os +import argparse + +from dotenv import load_dotenv + +from bfcl.evaluation import evaluate +from bfcl.llm_generation import collect_model_responses +from bfcl.model_handler.base import BaseHandler +from bfcl.types import (LeaderboardCategory, Leaderboard, LeaderboardVersion, + ModelType, LeaderboardCategoryGroup) + +load_dotenv() + + +def main(): + args = _get_args() + leaderboard = _load_leaderboard(args) + model_handler = _load_model_handler(args) + + if args.command == 'llm_generation': + collect_model_responses(leaderboard, model_handler, args) + else: + evaluate(leaderboard, model_handler, args) + + +def _get_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog='bfcl', + description='Berkeley Function Calling Leaderboard (BFCL)' + ) + + subparsers = parser.add_subparsers(dest='command', required=True, help='Sub-command to run') + + # Common arguments for both benchmark and evaluation + common_parser = argparse.ArgumentParser(add_help=False) + common_parser.add_argument( + '--model', + type=str, + default='gorilla-openfunctions-v2', + help="Name of the LLM. (default: 'gorilla-openfunctions-v2')" + ) + common_parser.add_argument( + '--model-type', + type=ModelType, + choices=[mtype.value for mtype in ModelType], + default=ModelType.PROPRIETARY.value, + help="Model type: Open-source or Proprietary (default: 'proprietary')" + ) + common_parser.add_argument( + '--test-group', + type=LeaderboardCategoryGroup, + choices=[group.value for group in LeaderboardCategoryGroup], + default=None, + help='Test category group (default: None)' + ) + common_parser.add_argument( + '--test-categories', + type=str, + default=None, + help=( + 'Comma-separated list of test categories ' + f"({','.join(cat.value for cat in LeaderboardCategory)}). " + "(default: None)" + ) + ) + common_parser.add_argument( + '--version', + type=LeaderboardVersion, + default=LeaderboardVersion.V1.value, + choices=[category.value for category in LeaderboardVersion], + help="Leaderboard version. (default: 'v1')", + ) + + _add_llm_generation_args(subparsers, common_parser) + _add_evaluation_args(subparsers, common_parser) + + args = parser.parse_args() + return args + + +def _add_llm_generation_args(subparsers, common_parser): + """Add LLM generation specific arguments.""" + + benchmark_parser = subparsers.add_parser('llm_generation', parents=[common_parser], help='Collect LLM responses') + benchmark_parser.add_argument('--temperature', type=float, default=0.7, help='Temperature (default: 0.7)') + benchmark_parser.add_argument('--top-p', type=float, default=1, help='Top-p (default: 1)') + benchmark_parser.add_argument('--max-tokens', type=int, default=1000, help='Max tokens (default: 1000)') + benchmark_parser.add_argument('--num-gpus', default=1, type=int, help='No. of GPUs (default: 1)') + benchmark_parser.add_argument('--timeout', default=60, type=int, help='Timeout (default: 60)') + + +def _add_evaluation_args(subparsers, common_parser): + """Add evaluation-specific arguments.""" + + evaluator_parser = subparsers.add_parser('evaluation', parents=[common_parser], help='Run evaluation') + evaluator_parser.add_argument( + '--perform-api-sanity-check', + action='store_true', + default=False, + help='Perform the REST API status sanity check before running the evaluation. 
(default: False)', + ) + + +def _load_leaderboard(args: argparse.Namespace) -> Leaderboard: + if args.test_categories: + categories = [] + for value in args.test_categories.split(','): + if value not in LeaderboardCategory._value2member_map_: + raise ValueError(f'Invalid test category: "{value}"!') + categories.append(LeaderboardCategory(value)) + args.test_categories = categories + return Leaderboard( + test_group=args.test_group, + test_categories=args.test_categories, + version=args.version + ) + + +def _load_model_handler(args: argparse.Namespace) -> BaseHandler: + if args.model_type == ModelType.OSS: + from bfcl.model_handler.oss_model import MODEL_TO_HANDLER_CLS + elif args.model_type == ModelType.PROPRIETARY: + from bfcl.model_handler.proprietary_model import MODEL_TO_HANDLER_CLS + + if os.getenv('USE_COHERE_OPTIMIZATION') and 'command-r-plus' in args.model: + args.model += '-optimized' + + assert (handler_cls := MODEL_TO_HANDLER_CLS.get(args.model)), ( + f'Invalid model name "{args.model}"! Please select a {args.model_type.value} ' + f'model from {tuple(MODEL_TO_HANDLER_CLS)}' + ) + + # This model handler function is shared by `benchmark` and `evaluate` functions + # `evaluate` cli args doesn't required temperature, top_p and max_tokens, + # since for evaluation we won't be calling the inference method. + if hasattr(args, 'temperature'): + return handler_cls( + model_name=args.model, + temperature=args.temperature, + top_p=args.top_p, + max_tokens=args.max_tokens, + ) + else: + return handler_cls(model_name=args.model) + + +if __name__ == "__main__": + main() diff --git a/berkeley-function-call-leaderboard/bfcl/evaluation.py b/berkeley-function-call-leaderboard/bfcl/evaluation.py new file mode 100644 index 000000000..c315a1fab --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluation.py @@ -0,0 +1,36 @@ +import argparse +from pathlib import Path + +from bfcl.evaluator import LeaderboardEvaluator +from bfcl.types import Leaderboard, LeaderboardCategory +from bfcl.model_handler.base import BaseHandler + + +def evaluate( + leaderboard: Leaderboard, + model_handler: BaseHandler, + args: argparse.Namespace +) -> None: + + print('🦍 Model:', args.model) + evaluator = LeaderboardEvaluator( + model_handler=model_handler, + leaderboard=leaderboard, + perform_api_sanity_check=args.perform_api_sanity_check + ) + file_name_to_test_category = {} + for test_category in leaderboard.test_categories: + if test_category.value in (LeaderboardCategory.SQL.value, LeaderboardCategory.CHATABLE.value): + print(f'Evaluation for test category "{test_category.value}" is not currently supported!') + else: + file_name = leaderboard.get_file_name(test_category) + file_name_to_test_category[Path(file_name).stem] = test_category + + for file_path in model_handler.model_dir.glob('*.jsonl'): + test_category = file_name_to_test_category.get(file_path.stem.replace('_result', '')) + if test_category is None: + continue + evaluator(file_path, test_category) + + evaluator.generate_leaderboard_csv() + print('🏁 Evaluation completed.') \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/__init__.py new file mode 100644 index 000000000..f4a3e1fdf --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/__init__.py @@ -0,0 +1,5 @@ +from .evaluator import LeaderboardEvaluator + +__all__ = [ + 'LeaderboardEvaluator' +] \ No newline at end of file diff --git 
a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py new file mode 100644 index 000000000..c27828ec8 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py @@ -0,0 +1,7 @@ +from .executable import ExecutableChecker +from .ast import AstChecker + +__all__ = [ + 'ExecutableChecker', + 'AstChecker', +] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/__init__.py new file mode 100644 index 000000000..04ec14c4a --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/__init__.py @@ -0,0 +1,3 @@ +from .ast import AstChecker + +__all__ = ['AstChecker'] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py new file mode 100644 index 000000000..dbf257666 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py @@ -0,0 +1,551 @@ +import re +import json +from typing import List, Dict +from pathlib import Path + +from bfcl.evaluator.checker.types import CheckerResult +from bfcl.types import LeaderboardAstCategory, Leaderboard +from bfcl.model_handler import constants +from bfcl.evaluator.checker.ast import type_converter, utils + + +class AstChecker: + NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"] + PYTHON_TYPE_MAPPING = { + "string": str, + "integer": int, + "float": float, + "boolean": bool, + "array": list, + "tuple": list, + "dict": dict, + "any": str, + } + # This is the list of types that we need to recursively check its values + PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"] + + def __init__(self, model_name: str, leaderboard: Leaderboard) -> None: + self.model_name = model_name + self.leaderboard = leaderboard + self.possible_ans_dir = Path(__file__, '../../../../..').resolve() / 'data/possible_answer' + self.test_category_to_possible_ans = {} + + def load_possible_answers(self, test_category: LeaderboardAstCategory) -> None: + if test_category not in self.test_category_to_possible_ans: + file_name = self.leaderboard.get_file_name(test_category) + with open(self.possible_ans_dir / file_name, 'r') as file: + self.test_category_to_possible_ans[test_category] = [json.loads(line) for line in file] + + def __call__( + self, + idx: int, + func_description, + model_output: List, + test_category: LeaderboardAstCategory, + ) -> CheckerResult: + + language = self.get_language(test_category) + self.load_possible_answers(test_category) + possible_answers = self.test_category_to_possible_ans[test_category][idx] + + if 'multiple' in test_category.value or 'parallel' in test_category.value: + # Some formatting issues that needs to be handled + if test_category == "parallel_function": + func_description = [func_description] + return self._parallel_function_no_order_checker( + func_description, + model_output, + possible_answers, + language, + ) + else: + if len(model_output) != 1: + return CheckerResult( + is_valid=False, + error_type="simple_function_checker:wrong_count", + error_message="Wrong number of functions." 
+ ) + model_output = model_output[0] + return self._simple_function_checker( + func_description, + model_output, + possible_answers, + language, + ) + + def _parallel_function_no_order_checker( + self, + func_descriptions: List, + model_output: List, + possible_answers: Dict, + language: str, + ) -> CheckerResult: + + if len(model_output) != len(possible_answers): + return CheckerResult( + is_valid=False, + error_type='parallel_function_checker_no_order:wrong_count', + error_message='Wrong number of functions.' + ) + + func_name_list = list(possible_answers.keys()) + possible_answers_list = [{key: value} for key, value in possible_answers.items()] + matched_indices = [] + # We go throught the possible answers one by one, and eliminate the model output that matches the possible answer. + # It must be this way because we need ground truth to fetch the correct function description. + for i in range(len(possible_answers_list)): + func_description = utils.find_description(func_descriptions, func_name_list[i]) + # This should not happen. As possible_answers is the ground truth, and it should have the correct function name. + if func_description is None: + return CheckerResult( + is_valid=False, + error_type='parallel_function_checker_no_order:cannot_find_description', + error_message=f"Function doc description not found for function name: {repr(func_name_list[i])}." + ) + + all_errors = [] + for index in range(len(model_output)): + if index in matched_indices: + continue + + result = self._simple_function_checker( + func_description, + model_output[index], + possible_answers_list[i], + language, + ) + if result.is_valid: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result.error_message, + "sub_error_type": result.error_type, + "model_output_item": model_output[index], + "possible_answer_item": possible_answers_list[i], + } + } + ) + + if not result.is_valid: + considered_indices = [i for i in range(len(model_output)) if i not in matched_indices] + error_message = ( + f"Could not find a matching function among index {considered_indices} of model " + f"output for index {i} of possible answers." + ) + error_message += "\nErrors:\n" + '\n'.join(map(json.dumps, all_errors)) + return CheckerResult( + is_valid=False, + error_type="parallel_function_checker_no_order:cannot_find_match", + error_message=error_message + ) + + return CheckerResult(is_valid=True, error_type='', error_message='') + + def _simple_function_checker( + self, + func_description: dict, + model_output: dict, + possible_answer: dict, + language: str, + ) -> CheckerResult: + + language = language.lower() + possible_answer = list(possible_answer.values())[0] + # Extract function name and parameters details + func_name = func_description["name"] + param_details = func_description["parameters"]["properties"] + required_params = func_description["parameters"]["required"] + + result = CheckerResult(is_valid=True, error_type="simple_function_checker:unclear", error_message="") + func_name = utils.convert_func_name(func_name, self.model_name) + # Check if function name matches + if func_name not in model_output: + return CheckerResult( + is_valid=False, + error_type="simple_function_checker:wrong_func_name", + error_message=f"Function name {repr(func_name)} not found in model output." 
+ ) + + model_params = model_output[func_name] + # Check for required parameters in model output + for param in required_params: + if param not in model_params: + return CheckerResult( + is_valid=False, + error_type="simple_function_checker:missing_required", + error_message=f"Missing required parameter: {repr(param)}." + ) + + # Validate types and values for each parameter in model output + for param, value in model_params.items(): + if param not in param_details or param not in possible_answer: + return CheckerResult( + is_valid=False, + error_type="simple_function_checker:unexpected_param", + error_message=f"Unexpected parameter: {repr(param)}." + ) + + full_param_details = param_details[param] + expected_type_description = full_param_details["type"] # This is a string + is_variable = False + nested_type_converted = None + + if language == "java": + expected_type_converted = constants.JAVA_TYPE_CONVERSION[expected_type_description] + if expected_type_description in constants.JAVA_TYPE_CONVERSION: + if not isinstance(value, str): + return CheckerResult( + is_valid=False, + error_type="type_error:java", + error_message=f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + + if expected_type_description in self.NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = constants.JAVA_TYPE_CONVERSION[nested_type] + value = type_converter.java.java_type_converter(value, expected_type_description, nested_type) + else: + value = type_converter.java.java_type_converter(value, expected_type_description) + elif language == "javascript": + expected_type_converted = constants.JS_TYPE_CONVERSION[expected_type_description] + if expected_type_description in constants.JS_TYPE_CONVERSION: + if not isinstance(value, str): + return CheckerResult( + is_valid=False, + error_type="type_error:js", + error_message=f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + + if expected_type_description in self.NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = constants.JS_TYPE_CONVERSION[nested_type] + value = type_converter.javascript.js_type_converter(value, expected_type_description, nested_type) + else: + value = type_converter.javascript.js_type_converter(value, expected_type_description) + elif language == "python": + expected_type_converted = self.PYTHON_TYPE_MAPPING[expected_type_description] + if expected_type_description in self.PYTHON_NESTED_TYPE_CHECK_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = self.PYTHON_TYPE_MAPPING[nested_type] + + # We convert all tuple value to list when the expected type is tuple. + # The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load(). + # This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future. + if expected_type_description == "tuple" and type(value) == tuple: + value = list(value) + + # Allow python auto conversion from int to float + if ( + language == "python" + and expected_type_description == "float" + and type(value) == int + ): + value = float(value) + + # Type checking + # In fact, we only check for Python here. 
+ # Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct. + type_check_result = AstChecker.type_checker( + param, + value, + possible_answer[param], + expected_type_description, + expected_type_converted, + nested_type_converted, + ) + if not type_check_result.is_valid: + return type_check_result + is_variable = type_check_result.is_variable + + # It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable. + # We can just treat the variable as a string and use the normal flow. + if not is_variable: + # Special handle for dictionaries + if expected_type_converted == dict: + result = AstChecker.dict_checker(param, value, possible_answer[param]) + if not result.is_valid: + return result + continue + + # Special handle for list of dictionaries + elif expected_type_converted == list and nested_type_converted == dict: + result = AstChecker.list_dict_checker(param, value, possible_answer[param]) + if not result.is_valid: + return result + continue + + # Special handle for strings + elif expected_type_converted == str: + # We don't check for case sensitivity for string, as long as it's not a variable + result = AstChecker.string_checker(param, value, possible_answer[param]) + if not result.is_valid: + return result + continue + + elif expected_type_converted == list: + result = AstChecker.list_checker(param, value, possible_answer[param]) + if not result.is_valid: + return result + continue + + # Check if the value is within the possible answers + if value not in possible_answer[param]: + result.is_valid = False + result.error_message = ( + f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}." + ) + result.error_type = "value_error:others" + return result + + # Check for optional parameters not provided but allowed + for param in possible_answer: + if param not in model_params and "" not in possible_answer[param]: + result.is_valid = False + result.error_message = f"Optional parameter {repr(param)} not provided and not marked as optional." + result.error_type = "simple_function_checker:missing_optional" + return result + + return result + + @staticmethod + def type_checker( + param: str, + value, + possible_answer: List, + expected_type_description: str, + expected_type_converted, + nested_type_converted, + ) -> CheckerResult: + # NOTE: This type checker only supports nested type checking for one level deep. + # We didn't implement recursive type checking for nested types, as it's not needed for + # the current use case and it's very complex. + + result = CheckerResult( + is_valid=True, + error_type="type_error:simple", + error_message='', + is_variable=True + ) + is_variable = False + # check for the case where a variable is used instead of a actual value. + # use the type in possible_answer as the expected type + possible_answer_type = utils.get_possible_answer_type(possible_answer) + # if possible_answer only contains optional parameters, we can't determine the type + if possible_answer_type != None: + # we are being precise here. 
+ # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer + if possible_answer_type != expected_type_converted: + is_variable = True + + # value is the same type as in function description + if type(value) == expected_type_converted: + # We don't need to do recursive check for simple types + if nested_type_converted == None: + result.is_variable = is_variable + return result + else: + for possible_answer_item in possible_answer: + flag = True # Each parameter should match to at least one possible answer type. + # Here, we assume that each item should be the same type. We could also relax it. + if type(possible_answer_item) == list: + for value_item in value: + checker_result = AstChecker.type_checker( + param, + value_item, + possible_answer_item, + str(nested_type_converted), + nested_type_converted, + None, + ) + if not checker_result.is_valid: + flag = False + break + + if flag: + return CheckerResult( + is_valid=True, + error_type='', + error_message='', + is_variable=is_variable + ) + + result.is_valid = False + result.error_type = "type_error:nested" + result.error_message = ( + f"Nested type checking failed for parameter {repr(param)}. " + f'Expected outer type {expected_type_description} with inner type ' + f'{str(nested_type_converted)}. Parameter value: {repr(value)}.' + ) + + # value is not as expected, check for the case where a variable is used instead of a actual value + # use the type in possible_answer as the expected type + possible_answer_type = utils.get_possible_answer_type(possible_answer) + # if possible_answer only contains optional parameters, we can't determine the type + if possible_answer_type is not None: + # we are being precise here. + # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer + if type(value) == possible_answer_type: + result.is_variable = True + return result + + return CheckerResult( + is_valid=False, + error_type='type_error:simple', + error_message=f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + + def string_checker(param: str, model_output: str, possible_answer: List) -> CheckerResult: + standardize_possible_answer = [] + standardize_model_output = AstChecker.standardize_string(model_output) + for i in range(len(possible_answer)): + if type(possible_answer[i]) == str: + standardize_possible_answer.append(AstChecker.standardize_string(possible_answer[i])) + + if standardize_model_output not in standardize_possible_answer: + return CheckerResult( + is_valid=False, + error_type="value_error:string", + error_message=f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive." 
+ ) + + return CheckerResult(is_valid=True, error_type='', error_message='',) + + @staticmethod + def list_checker(param: str, model_output: List, possible_answer: List) -> CheckerResult: + # Convert the tuple to a list + standardize_model_output = list(model_output) + + # If the element in the list is a string, we need to standardize it + for i in range(len(standardize_model_output)): + if type(standardize_model_output[i]) == str: + standardize_model_output[i] = AstChecker.standardize_string(model_output[i]) + + standardize_possible_answer = [] + # We also need to standardize the possible answers + for i in range(len(possible_answer)): + standardize_possible_answer.append([]) + for j in range(len(possible_answer[i])): + if type(possible_answer[i][j]) == str: + standardize_possible_answer[i].append(AstChecker.standardize_string(possible_answer[i][j])) + else: + standardize_possible_answer[i].append(possible_answer[i][j]) + + if standardize_model_output not in standardize_possible_answer: + return CheckerResult( + is_valid=False, + error_type="value_error:list/tuple", + error_message=f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}." + ) + + return CheckerResult(is_valid=True, error_type='', error_message='') + + @staticmethod + def dict_checker(param: str, model_output: Dict, possible_answers: List) -> CheckerResult: + # This function works for simple dictionaries, as well as dictionaries with nested dictionaries + result = CheckerResult(is_valid=False, error_type='dict_checker:unclear', error_message='') + for i in range(len(possible_answers)): + if possible_answers[i] == "": + continue + + result = CheckerResult(is_valid=False, error_type='dict_checker:unclear', error_message='') + flag = True + possible_answer = possible_answers[i] + # possible_answer is a single dictionary + if len(model_output.keys()) != len(possible_answer.keys()): + result.is_valid = False + result.error_message = "Wrong number of parameters for dictionary." + result.error_type = "value_error:dict_items" + flag = False + continue + + for key, value in model_output.items(): + if key not in possible_answer: + result.is_valid = False + result.error_message = f"Unexpected parameter: '{key}'." + result.error_type = "value_error:dict_key" + flag = False + break + + expected_values = possible_answer[key] + if isinstance(expected_values, dict): + result = AstChecker.dict_checker(param, value, [expected_values]) + if not result.is_valid: + flag = False + break + else: + standardize_value = value + # If the value is a string, we need to standardize it + if type(value) == str: + standardize_value = AstChecker.standardize_string(value) + # We also need to standardize the possible answers + standardize_possible_answer = [] + for i in range(len(possible_answer[key])): + if type(possible_answer[key][i]) == str: + standardize_possible_answer.append( + AstChecker.standardize_string(possible_answer[key][i]) + ) + else: + standardize_possible_answer.append(possible_answer[key][i]) + + if standardize_value not in standardize_possible_answer: + result.is_valid = False + result.error_message = f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}." 
+ result.error_type = "value_error:dict_value" + flag = False + break + if flag: + return CheckerResult(is_valid=True, error_type='', error_message='') + + return result + + @staticmethod + def list_dict_checker(param: str, model_output: List, possible_answers: List) -> CheckerResult: + # This function takes in a list of dictionaries and checks if each dictionary is valid + # The order of the dictionaries in the list must match the order of the possible answers + result = CheckerResult(is_valid=False, error_type='list_dict_checker:unclear', error_message='') + for answer_index in range(len(possible_answers)): + flag = True # True means so far, all dictionaries are valid + + # Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers + if len(model_output) != len(possible_answers[answer_index]): + result.is_valid = False + result.error_message = "Wrong number of dictionaries in the list." + result.error_type = "value_error:list_dict_count" + flag = False + continue + + for dict_index in range(len(model_output)): + result = AstChecker.dict_checker( + param, + model_output[dict_index], + [possible_answers[answer_index][dict_index]], + ) + if not result.is_valid: + flag = False + break + if flag: + return CheckerResult(is_valid=True, error_type='', error_message='') + + return result + + @staticmethod + def standardize_string(input_string: str): + # This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase + # It will also convert all the single quotes to double quotes + # This is used to compare the model output with the possible answers + # We don't want to punish model for answer like April 1, 2024 vs April 1,2024, vs April 1 2024 + regex_string = r"[ \,\.\/\-\_\*\^]" + return re.sub(regex_string, "", input_string).lower().replace("'", '"') + + @staticmethod + def get_language(test_category: LeaderboardAstCategory) -> str: + if test_category.value == LeaderboardAstCategory.JAVA.value: + language = 'java' + elif test_category.value == LeaderboardAstCategory.JAVASCRIPT.value: + language = 'javascript' + else: + language = 'python' + return language \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/__init__.py new file mode 100644 index 000000000..a8cecd5cf --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/__init__.py @@ -0,0 +1,4 @@ +from . import java +from . 
import javascript + +__all__ = ['java', 'javascript'] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/eval_checker/java_type_converter.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/java.py similarity index 99% rename from berkeley-function-call-leaderboard/eval_checker/java_type_converter.py rename to berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/java.py index 973aaa0e2..ab16ac310 100644 --- a/berkeley-function-call-leaderboard/eval_checker/java_type_converter.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/java.py @@ -1,6 +1,7 @@ import re from typing import List, Dict, Union -from model_handler.constant import JAVA_TYPE_CONVERSION + +from bfcl.model_handler.constants import JAVA_TYPE_CONVERSION def java_type_converter(value, expected_type, nested_type=None): diff --git a/berkeley-function-call-leaderboard/eval_checker/js_type_converter.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/javascript.py similarity index 99% rename from berkeley-function-call-leaderboard/eval_checker/js_type_converter.py rename to berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/javascript.py index 93a4de6c0..d85fc0a42 100644 --- a/berkeley-function-call-leaderboard/eval_checker/js_type_converter.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/javascript.py @@ -1,5 +1,6 @@ import re -from model_handler.constant import JS_TYPE_CONVERSION + +from bfcl.model_handler.constants import JS_TYPE_CONVERSION def js_type_converter(value, expected_type, nested_type=None): diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/utils.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/utils.py new file mode 100644 index 000000000..15b703dfa --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/utils.py @@ -0,0 +1,31 @@ +import re + +from bfcl.model_handler.constants import UNDERSCORE_TO_DOT + + +def convert_func_name(function_name, model_name: str): + model_name_escaped = model_name.replace("_", "/") + if "." in function_name: + if model_name_escaped in UNDERSCORE_TO_DOT: + # OAI does not support "." in the function name so we replace it with "_". + # ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name. 
+ # This happens for OpenAI, Mistral, and Google models + return re.sub(r"\.", "_", function_name) + return function_name + +def find_description(func_descriptions, name): + # If func_descriptions is a list, this is the multiple or multiple_parallel case + if type(func_descriptions) == list: + for func_description in func_descriptions: + if func_description["name"] in name: + return func_description + return None + else: + # This is the parallel case, there is no need to loop through the list, as there is only one function + return func_descriptions + +def get_possible_answer_type(possible_answer: list): + for answer in possible_answer: + if answer != "": # Optional parameter + return type(answer) + return None \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/__init__.py new file mode 100644 index 000000000..2276fd33e --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/__init__.py @@ -0,0 +1,3 @@ +from .executable import ExecutableChecker + +__all__ = ['ExecutableChecker'] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exceptions.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exceptions.py new file mode 100644 index 000000000..3504862d8 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exceptions.py @@ -0,0 +1,10 @@ +class NoAPIKeyError(Exception): + def __init__(self): + self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." + super().__init__(self.message) + + +class BadAPIStatusError(Exception): + def __init__(self, errors, error_rate): + self.errors = errors + self.error_rate = error_rate \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/eval_checker/executable_python_function.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exec_python_functions.py similarity index 95% rename from berkeley-function-call-leaderboard/eval_checker/executable_python_function.py rename to berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exec_python_functions.py index e1f5a4665..93ed6d5b6 100644 --- a/berkeley-function-call-leaderboard/eval_checker/executable_python_function.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exec_python_functions.py @@ -1,17 +1,16 @@ -import json +import os import math -import requests -from custom_exception import NoAPIKeyError import time -api_key = {} -with open("../function_credential_config.json") as f: - data = json.loads(f.read()) - for item in data: - for k, v in item.items(): - if v == "": - raise NoAPIKeyError() - api_key[k] = v +import requests +from dotenv import load_dotenv + +load_dotenv() + +# Make sure the env variables are populated +env_vars = ('GEOCODE_API_KEY', 'RAPID_API_KEY', 'OMDB_API_KEY', 'EXCHANGERATE_API_KEY') +for var in env_vars: + assert (api_key := os.getenv(var)), f'Please provide your {var} in the `.env` file.' 
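As a quick illustration of the AST-checker helpers above (the import path follows this diff's package layout; the example values are made up):

    from bfcl.evaluator.checker.ast.utils import get_possible_answer_type

    # "" marks an optional parameter in possible_answer, so it is skipped when inferring the type
    assert get_possible_answer_type(["", "monthly"]) is str
    assert get_possible_answer_type([""]) is None  # only optional entries, so the type cannot be determined

    # convert_func_name only rewrites dots for models listed in UNDERSCORE_TO_DOT,
    # e.g. "geometry.circle_area" -> "geometry_circle_area"; all other models keep the original name.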
def calculate_triangle_area(base, height): @@ -338,7 +337,7 @@ def get_coordinates_from_city(city_name): """ time.sleep(2) # To avoid rate limiting url = "https://geocode.maps.co/search" - params = {"q": city_name, "api_key": api_key["GEOCODE-API-KEY"]} + params = {"q": city_name, "api_key": os.getenv("GEOCODE_API_KEY")} response = requests.get(url, params=params) if response.status_code == 200: @@ -363,7 +362,7 @@ def convert_currency(amount, from_currency, to_currency): Returns: float: The converted amount in the target currency. """ - key = api_key["EXCHANGERATE-API-KEY"] + key = os.getenv("EXCHANGERATE_API_KEY") base_url = f"https://v6.exchangerate-api.com/v6/{key}/latest/{from_currency}" response = requests.get(base_url) @@ -390,7 +389,7 @@ def find_term_on_urban_dictionary(term): querystring = {"term": term} headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), "X-RapidAPI-Host": "mashape-community-urban-dictionary.p.rapidapi.com", } @@ -438,7 +437,7 @@ def get_covid_death_by_country(country): querystring = {"country": country} headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), "X-RapidAPI-Host": "covid-193.p.rapidapi.com", } @@ -460,7 +459,7 @@ def get_active_covid_case_by_country(country): querystring = {"country": country} headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), "X-RapidAPI-Host": "covid-193.p.rapidapi.com", } @@ -475,7 +474,7 @@ def get_rating_by_amazon_ASIN(ASIN): url = "https://real-time-amazon-data.p.rapidapi.com/product-details" querystring = {"asin": ASIN, "country": "US"} headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), "X-RapidAPI-Host": "real-time-amazon-data.p.rapidapi.com", } @@ -497,7 +496,7 @@ def get_price_by_amazon_ASIN(ASIN): url = "https://real-time-amazon-data.p.rapidapi.com/product-details" querystring = {"asin": ASIN, "country": "US"} headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), "X-RapidAPI-Host": "real-time-amazon-data.p.rapidapi.com", } @@ -519,7 +518,7 @@ def get_product_name_by_amazon_ASIN(ASIN): url = "https://real-time-amazon-data.p.rapidapi.com/product-details" querystring = {"asin": ASIN, "country": "US"} headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), "X-RapidAPI-Host": "real-time-amazon-data.p.rapidapi.com", } @@ -548,7 +547,7 @@ def get_company_name_by_stock_name(stock_name): querystring = {"search": stock_name} headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), "X-RapidAPI-Host": "yahoo-finance15.p.rapidapi.com", } @@ -570,7 +569,7 @@ def get_stock_price_by_stock_name(stock_name): querystring = {"ticker": stock_name} headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), "X-RapidAPI-Host": "yahoo-finance15.p.rapidapi.com", } @@ -598,7 +597,7 @@ def get_stock_history(stock_name, interval, diffandsplits="true"): } headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), "X-RapidAPI-Host": "yahoo-finance15.p.rapidapi.com", } @@ -648,7 +647,7 @@ def get_time_zone_by_coord(long, lat): querystring = {"lat": lat, "lon": long, "c": "1", "s": "0"} headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), 
"X-RapidAPI-Host": "timezone-by-location.p.rapidapi.com", } @@ -842,7 +841,7 @@ def get_movie_rating(movie_name): movie_name (str): The name of the movie. """ url = "http://www.omdbapi.com/" - params = {"t": movie_name, "apikey": api_key["OMDB-API-KEY"]} + params = {"t": movie_name, "apikey": os.getenv('OMDB_API_KEY')} response = requests.get(url, params=params) return response.json()["Rated"] @@ -854,7 +853,7 @@ def get_movie_director(movie_name): movie_name (str): The name of the movie. """ url = "http://www.omdbapi.com/" - params = {"t": movie_name, "apikey": api_key["OMDB-API-KEY"]} + params = {"t": movie_name, "apikey": os.getenv('OMDB_API_KEY')} response = requests.get(url, params=params) return response.json()["Director"] diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/executable.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/executable.py new file mode 100644 index 000000000..7a446fa51 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/executable.py @@ -0,0 +1,444 @@ +import os +import time +import json +from pathlib import Path +from typing import Dict, List + +from tqdm import tqdm + +from bfcl.types import LeaderboardExecutableCategory +from bfcl.evaluator.utils import display_api_status_error +from bfcl.evaluator.checker.types import CheckerResult +from bfcl.evaluator.checker.executable.exceptions import BadAPIStatusError, NoAPIKeyError + + + +class ExecutableChecker: + REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 + + def __init__(self, cache_dir: str) -> None: + self.cache_dir = cache_dir + self.data_dir = Path(__file__, '../../../../..').resolve() / 'data' + self.rest_api_ground_truth_file_path = self.data_dir / 'api_status_check_ground_truth_REST.jsonl' + self.executable_ground_truth_file_path = self.data_dir / 'api_status_check_ground_truth_executable.jsonl' + + self.rest_eval_response_v5_file_path = self.data_dir / 'rest-eval-response_v5.jsonl' + with open(self.rest_eval_response_v5_file_path, 'r') as file: + self.rest_eval_response_data = [json.loads(line) for line in file] + + self._cached_exec_api_ground_truth_results = {} + + def perform_api_sanity_checks(self) -> None: + print("---- Sanity checking API status ----") + rest_api_error = executable_api_error = None + try: + self.rest_api_status_sanity_check() + except BadAPIStatusError as e: + rest_api_error = e + try: + self.executable_api_status_sanity_check() + except BadAPIStatusError as e: + executable_api_error = e + display_api_status_error(rest_api_error, executable_api_error, display_success=True) + + def rest_api_status_sanity_check(self) -> None: + # Use the ground truth data to make sure the API is working correctly + ground_truth_replaced = self._get_updated_rest_ground_truth_data() + correct_count = 0 + errors = [] + for idx, data in tqdm( + enumerate(ground_truth_replaced), + total=len(ground_truth_replaced), + desc="API Status Test (REST)", + ): + result = self.rest_executable_checker(data["ground_truth"], self.rest_eval_response_data[idx]) + if result.is_valid: + correct_count += 1 + else: + errors.append((data, result.model_dump())) + + if correct_count != len(ground_truth_replaced): + raise BadAPIStatusError(errors, f"{len(ground_truth_replaced) - correct_count} / {len(ground_truth_replaced)}") + + def executable_api_status_sanity_check(self) -> None: + with open(self.executable_ground_truth_file_path, 'r') as file: + ground_truth = [json.loads(line) for line in file] + + output_file_path = self.cache_dir / 
self.executable_ground_truth_file_path.name + if output_file_path.exists(): + with open(output_file_path, 'r') as file: + for line in file: + content = json.loads(line) + self._cached_exec_api_ground_truth_results[content['idx']] = content + + correct_count = 0 + errors = [] + for data in tqdm(ground_truth, total=len(ground_truth), desc="API Status Test (Non-REST)"): + idx = data['idx'] + if idx not in self._cached_exec_api_ground_truth_results: + self._cached_exec_api_ground_truth_results[idx] = data + result = self._simple_executable_checker( + data["ground_truth"][0], + data["execution_result"][0], + data["execution_result_type"][0], + True, + idx=idx + ) + if result.is_valid: + correct_count += 1 + else: + errors.append((data, result.model_dump())) + + # Save/update cache + with open(output_file_path, 'w') as file: + for _, v in sorted(self._cached_exec_api_ground_truth_results.items(), key=lambda x: x[0]): + file.write(json.dumps(v) + '\n') + + if correct_count != len(ground_truth): + raise BadAPIStatusError(errors, f"{len(ground_truth) - correct_count} / {len(ground_truth)}") + + def executable_checker( + self, + decoded_result: List, + func_description: Dict, + test_category: LeaderboardExecutableCategory + ) -> CheckerResult: + if 'multiple' in test_category.value or 'parallel' in test_category.value: + return self._parallel_no_order_executable_checker( + decoded_result, + func_description["execution_result"], + func_description["execution_result_type"], + ) + + else: + if len(decoded_result) != 1: + return CheckerResult( + is_valid=False, + error_type="simple_exec_checker:wrong_count", + error_message="Wrong number of functions." + ) + return self._simple_executable_checker( + decoded_result[0], + func_description["execution_result"][0], + func_description["execution_result_type"][0], + False, + ) + + def _get_updated_rest_ground_truth_data(self) -> List[Dict]: + output_file_path = self.cache_dir / self.rest_api_ground_truth_file_path.name + if output_file_path.exists(): + with open(output_file_path, 'r') as file: + modified_data = [json.loads(line) for line in file] + print(f'Loaded cached REST API ground truth file with replaced placeholders from "{output_file_path}" 🦍.') + else: + placeholders = {} + env_vars = ('GEOCODE_API_KEY', 'RAPID_API_KEY', 'OMDB_API_KEY', 'EXCHANGERATE_API_KEY') + for var in env_vars: + assert (api_key := os.getenv(var)), f'Please provide your {var} in the `.env` file.' 
+ placeholders['YOUR-' + var.replace('_', '-')] = api_key + print("All API keys are present.") + + def replace_placeholders(data): + if isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, (dict, list)): + replace_placeholders(value) + elif isinstance(value, str): + for placeholder, actual_value in placeholders.items(): + if placeholder in value: # Check if placeholder is in the string + data[key] = value.replace(placeholder, actual_value) + elif isinstance(data, list): + for idx, item in enumerate(data): + if isinstance(item, (dict, list)): + replace_placeholders(item) + elif isinstance(item, str): + for placeholder, actual_value in placeholders.items(): + if placeholder in item: # Check if placeholder is in the string + data[idx] = item.replace(placeholder, actual_value) + return data + + modified_data = [] + with open(self.rest_api_ground_truth_file_path, 'r') as file: + for line in file: + try: + data = replace_placeholders(json.loads(line)) + modified_data.append(data) + except json.JSONDecodeError: + # Handle the case where a line is not a valid JSON object + print('Invalid JSON line!') + + with open(output_file_path, 'w') as f: + for modified_line in modified_data: + f.write(json.dumps(modified_line) + '\n') + print(f'Saved REST API ground truth file with replaced placeholders at {output_file_path} 🦍.') + + return modified_data + + def rest_executable_checker(self, func_call, eval_ground_truth) -> CheckerResult: + if "https://geocode.maps.co" in func_call: + time.sleep(2) + func_call = func_call.replace("requests_get", "requests.get") + try: + response = {} + exec("import requests;response=" + func_call, response) + response = response['response'] + except Exception as e: + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:execution_error", + error_message=f"Execution failed. {str(e)}" + ) + try: + if response.status_code != 200: + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_status_code", + error_message=f"Execution result status code is not 200, got {response.status_code}", + ) + except Exception as e: + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:cannot_get_status_code", + error_message=f"Cannot get status code of the response. Error: {str(e)}", + ) + try: + if isinstance(eval_ground_truth, dict): + if isinstance(response.json(), dict): + if set(eval_ground_truth.keys()) == set(response.json().keys()): + return CheckerResult(is_valid=True, error_type="", error_message="") + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_key", + error_message="Key inconsistency" + ) + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_type", + error_message=f"Expected dictionary, but got {type(response.json())}" + ) + elif isinstance(eval_ground_truth, list): + if isinstance(response.json(), list): + if len(eval_ground_truth) != len(response.json()): + return CheckerResult( + is_valid=False, + error_type="value_error:exec_result_rest_count", + error_message="Response list length inconsistency." 
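The REST checker above deliberately validates response structure rather than live values: for a dict response it compares key sets, and for a list response it compares lengths and per-item key sets. For example (illustrative data):

    ground_truth = {"lat": 37.87, "lon": -122.27, "display_name": "Berkeley"}
    live_response = {"lat": 37.8715, "lon": -122.2730, "display_name": "Berkeley, Alameda County, CA"}
    # Same key set -> the live response passes, even though the numeric values have drifted.
    assert set(ground_truth.keys()) == set(live_response.keys())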
+ ) + else: + for i in range(len(eval_ground_truth)): + if set(eval_ground_truth[i]) != set(response.json()[i]): + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_key", + error_message="Key inconsistency" + ) + + return CheckerResult(is_valid=True, error_type="", error_message="") + else: + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_type", + error_message=f"Expected list, but got {type(response.json())}" + ) + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_type", + error_message=f"Expected dict or list, but got {type(response.json())}" + ) + except Exception as e: + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:response_format_error", + error_message=f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}", + ) + + def _simple_executable_checker( + self, + function_call: str, + expected_result, + expected_result_type: str, + is_sanity_check=False, + idx: int | None = None + ) -> CheckerResult: + result = CheckerResult(is_valid=True, error_type="executable_checker:unclear", error_message="") + exec_output = None + try: + if idx is not None: + exec_output = self._cached_exec_api_ground_truth_results[idx].get('exec_output') + if exec_output is None: + exec_dict = {} + # TODO: Instead of importing all the functions, we can use regex to extract + # the function name from the `function_call` and only import that function. + exec( + "from bfcl.evaluator.checker.executable.exec_python_functions import *" + "\nresult=" + function_call, + exec_dict, + ) + exec_output = exec_dict["result"] + if idx is not None: + self._cached_exec_api_ground_truth_results[idx]['exec_output'] = exec_output + except NoAPIKeyError as e: + raise e + except Exception as e: + return CheckerResult( + is_valid=False, + error_type="executable_checker:execution_error", + error_message=f"Error in execution: {repr(function_call)}. Error: {str(e)}" + ) + + # We need to special handle the case where the execution result is a tuple and convert it to a list + # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json + if isinstance(exec_output, tuple): + exec_output = list(exec_output) + + if expected_result_type == "exact_match": + if exec_output != expected_result: + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result", + error_message=f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}.", + model_executed_output=exec_output + ) + + elif expected_result_type == "real_time_match": + # Allow for 5% difference + if (type(expected_result) == float or type(expected_result) == int) and ( + type(exec_output) == float or type(exec_output) == int + ): + if not ( + expected_result * (1 - ExecutableChecker.REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + <= exec_output + <= expected_result * (1 + ExecutableChecker.REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + ): + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_real_time", + error_message=( + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, " + f"but got: {exec_output}. {ExecutableChecker.REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed." 
+ ), + model_executed_output=exec_output + ) + else: + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_real_time", + error_message=( + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, " + f"but got: {exec_output}. Type needs to be float or int for real time match criteria." + ), + model_executed_output=exec_output + ) + else: + # Structural match + pattern_match_result = self._pattern_matcher(exec_output, expected_result, function_call, is_sanity_check) + if not pattern_match_result.is_valid: + return pattern_match_result + + return result + + def _parallel_no_order_executable_checker( + self, + decoded_result: List, + expected_exec_result: List, + expected_exec_result_type: List + ) -> CheckerResult: + if len(decoded_result) != len(expected_exec_result): + return CheckerResult( + is_valid=False, + error_type="value_error:exec_result_count", + error_message=f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}." + ) + + matched_indices = [] + for i in range(len(expected_exec_result)): + all_errors = [] + for index in range(len(decoded_result)): + if index in matched_indices: + continue + + result = self._simple_executable_checker( + decoded_result[index], + expected_exec_result[i], + expected_exec_result_type[i], + False, + ) + + if result.is_valid: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result.error_message, + "sub_error_type": result.error_type, + "model_executed_output": ( + result.model_executed_output if hasattr(result, "model_executed_output") else None + ), + } + } + ) + + if not result.is_valid: + considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices] + error_message = ( + f"Could not find a matching function among index {considered_indices} of model " + f"output for index {i} of possible answers." + ) + error_message += "\nErrors:\n" + '\n'.join(map(json.dumps, all_errors)) + return CheckerResult( + is_valid=False, + error_type="executable_checker:cannot_find_match", + error_message=error_message + ) + return CheckerResult(is_valid=True, error_type="executable_checker:unclear", error_message="") + + @staticmethod + def _pattern_matcher(exec_output, expected_result, function_call, is_sanity_check) -> CheckerResult: + result = CheckerResult(is_valid=True, error_type="executable_checker:unclear", error_message="") + if type(exec_output) != type(expected_result): + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type", + error_message=f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}", + model_executed_output=exec_output + ) + if isinstance(exec_output, dict): + # We loose the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one. + # This happens when the key is a timestamp or a random number. + if is_sanity_check: + if len(exec_output) != len(expected_result): + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type:dict_length", + error_message=f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. 
Expected length: {len(expected_result)}, but got: {len(exec_output)}.", + model_executed_output=exec_output + ) + else: + return result + + for key in expected_result: + if key not in exec_output: + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type:dict_key_not_found", + error_message=f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output.", + model_executed_output=exec_output + ) + for key in exec_output: + if key not in expected_result: + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type:dict_extra_key", + error_message=f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output.", + model_executed_output=exec_output + ) + if isinstance(exec_output, list): + if len(exec_output) != len(expected_result): + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type:list_length", + error_message=f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}.", + model_executed_output=exec_output + ) + return result \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/types.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/types.py new file mode 100644 index 000000000..b1fb68a88 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/types.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + + +class CheckerResult(BaseModel): + is_valid: bool + error_type: str + error_message: str + + class Config: + extra = 'allow' \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py b/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py new file mode 100644 index 000000000..bde259c15 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py @@ -0,0 +1,450 @@ +INPUT_PRICE_PER_MILLION_TOKEN = { + "claude-3-opus-20240229-FC": 15, + "claude-3-opus-20240229": 15, + "claude-3-sonnet-20240229-FC": 3, + "claude-3-sonnet-20240229": 3, + "claude-3-haiku-20240307-FC": 0.25, + "claude-3-haiku-20240307": 0.25, + "claude-3-5-sonnet-20240620-FC": 3, + "claude-3-5-sonnet-20240620": 3, + "claude-2.1": 8, + "claude-instant-1.2": 0.8, + "mistral-large-2402-FC-Any": 4, + "mistral-large-2402-FC-Auto": 4, + "mistral-medium-2312": 2.7, + "mistral-small-2402-FC-Any": 1, + "mistral-small-2402-FC-Auto": 1, + "mistral-small-2402": 1, + "mistral-tiny-2312": 0.25, + "gpt-4o-2024-05-13-FC": 5, + "gpt-4o-2024-05-13": 5, + "gpt-4-1106-preview-FC": 10, + "gpt-4-1106-preview": 10, + "gpt-4-0125-preview": 10, + "gpt-4-0125-preview-FC": 10, + "gpt-4-turbo-2024-04-09-FC": 10, + "gpt-4-turbo-2024-04-09": 10, + "gpt-4-0613": 30, + "gpt-4-0613-FC": 30, + "gpt-3.5-turbo-0125": 0.5, + "gpt-3.5-turbo-0125-FC": 0.5, + "gemini-1.0-pro": 0.5, + "gemini-1.5-pro-preview-0409": 3.5, + "gemini-1.5-pro-preview-0514": 3.5, + "gemini-1.5-flash-preview-0514": 0.35, + "databricks-dbrx-instruct": 2.25, + "command-r-plus-FC": 3, + "command-r-plus": 3, + "command-r-plus-FC-optimized": 3, + "command-r-plus-optimized": 3, +} + +OUTPUT_PRICE_PER_MILLION_TOKEN = { + "claude-3-opus-20240229-FC": 75, + "claude-3-opus-20240229": 75, + "claude-3-sonnet-20240229-FC": 15, + "claude-3-sonnet-20240229": 15, + 
"claude-3-5-sonnet-20240620-FC": 15, + "claude-3-5-sonnet-20240620": 15, + "claude-3-haiku-20240307-FC": 1.25, + "claude-3-haiku-20240307": 1.25, + "claude-2.1": 24, + "claude-instant-1.2": 2.4, + "mistral-large-2402-FC-Any": 12, + "mistral-large-2402-FC-Auto": 12, + "mistral-small-2402": 3, + "mistral-medium-2312": 8.1, + "mistral-small-2402-FC-Any": 3, + "mistral-small-2402-FC-Auto": 3, + "mistral-tiny-2312": 0.25, + "gpt-4o-2024-05-13-FC": 15, + "gpt-4o-2024-05-13": 15, + "gpt-4-turbo-2024-04-09-FC": 30, + "gpt-4-turbo-2024-04-09": 30, + "gpt-4-1106-preview": 30, + "gpt-4-1106-preview-FC": 30, + "gpt-4-0125-preview-FC": 30, + "gpt-4-0125-preview": 30, + "gpt-4-0613": 60, + "gpt-4-0613-FC": 60, + "gpt-3.5-turbo-0125": 1.5, + "gpt-3.5-turbo-0125-FC": 1.5, + "gemini-1.0-pro": 1.5, + "gemini-1.5-pro-preview-0409": 10.50, + "gemini-1.5-pro-preview-0514": 10.50, + "gemini-1.5-flash-preview-0514": 0.53, + "databricks-dbrx-instruct": 6.75, + "command-r-plus-FC": 15, + "command-r-plus": 15, + "command-r-plus-FC-optimized": 15, + "command-r-plus-optimized": 15, +} + +# The latency of the open-source models are hardcoded here. +# Because we do batching when generating the data, so the latency is not +# accurate from the result data. +# This is the latency for the whole batch of data, when using 8 V100 GPUs. +OSS_LATENCY = { + "deepseek-ai/deepseek-coder-6.7b-instruct": 909, + "google/gemma-7b-it": 95, + "NousResearch/Hermes-2-Pro-Mistral-7B": 135, + "meta-llama/Meta-Llama-3-8B-Instruct": 73, + "meta-llama/Meta-Llama-3-70B-Instruct": 307, + "gorilla-openfunctions-v2": 83, + "THUDM/glm-4-9b-chat": 223 +} + +# Price got from Azure, 22.032 per hour for 8 V100, Pay As You Go Total Price +# Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/ +V100_x8_PRICE_PER_HOUR = 22.032 + +NO_COST_MODELS = [ + "Nexusflow-Raven-v2", + "firefunction-v1-FC", + "firefunction-v2-FC", + "meetkai/functionary-medium-v2.4-FC", + "meetkai/functionary-small-v2.2-FC", + "meetkai/functionary-small-v2.4-FC", + "snowflake/arctic", + "nvidia/nemotron-4-340b-instruct", + "THUDM/glm-4-9b-chat", +] + +MODEL_METADATA_MAPPING = { + "gpt-4o-2024-05-13-FC": [ + "GPT-4o-2024-05-13 (FC)", + "https://openai.com/index/hello-gpt-4o/", + "OpenAI", + "Proprietary", + ], + "gpt-4o-2024-05-13": [ + "GPT-4o-2024-05-13 (Prompt)", + "https://openai.com/index/hello-gpt-4o/", + "OpenAI", + "Proprietary", + ], + "gpt-4-1106-preview-FC": [ + "GPT-4-1106-Preview (FC)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-1106-preview": [ + "GPT-4-1106-Preview (Prompt)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-0125-preview-FC": [ + "GPT-4-0125-Preview (FC)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-0125-preview": [ + "GPT-4-0125-Preview (Prompt)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-turbo-2024-04-09-FC": [ + "GPT-4-turbo-2024-04-09 (FC)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-turbo-2024-04-09": [ + "GPT-4-turbo-2024-04-09 (Prompt)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gorilla-openfunctions-v2": [ + "Gorilla-OpenFunctions-v2 (FC)", + "https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html", + "Gorilla LLM", + "Apache 
2.0", + ], + "claude-3-opus-20240229-FC": [ + "Claude-3-Opus-20240229 (FC tools-2024-04-04)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-opus-20240229": [ + "Claude-3-Opus-20240229 (Prompt)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "mistral-medium-2312": [ + "Mistral-Medium-2312 (Prompt)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-small-2402": [ + "Mistral-Small-2402 (Prompt)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-large-2402": [ + "Mistral-Large-2402 (Prompt)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "claude-3-sonnet-20240229-FC": [ + "Claude-3-Sonnet-20240229 (FC tools-2024-04-04)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-sonnet-20240229": [ + "Claude-3-Sonnet-20240229 (Prompt)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-haiku-20240307-FC": [ + "Claude-3-Haiku-20240307 (FC tools-2024-04-04)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-haiku-20240307": [ + "Claude-3-Haiku-20240307 (Prompt)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-5-sonnet-20240620-FC": [ + "Claude-3.5-Sonnet-20240620 (FC)", + "https://www.anthropic.com/news/claude-3-5-sonnet", + "Anthropic", + "Proprietary", + ], + "claude-3-5-sonnet-20240620": [ + "Claude-3.5-Sonnet-20240620 (Prompt)", + "https://www.anthropic.com/news/claude-3-5-sonnet", + "Anthropic", + "Proprietary", + ], + "gpt-3.5-turbo-0125-FC": [ + "GPT-3.5-Turbo-0125 (FC)", + "https://platform.openai.com/docs/models/gpt-3-5-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-3.5-turbo-0125": [ + "GPT-3.5-Turbo-0125 (Prompting)", + "https://platform.openai.com/docs/models/gpt-3-5-turbo", + "OpenAI", + "Proprietary", + ], + "meetkai/functionary-small-v2.2-FC": [ + "Functionary-Small-v2.2 (FC)", + "https://huggingface.co/meetkai/functionary-small-v2.2", + "MeetKai", + "MIT", + ], + "meetkai/functionary-medium-v2.2-FC": [ + "Functionary-Medium-v2.2 (FC)", + "https://huggingface.co/meetkai/functionary-medium-v2.2", + "MeetKai", + "MIT", + ], + "meetkai/functionary-small-v2.4-FC": [ + "Functionary-Small-v2.4 (FC)", + "https://huggingface.co/meetkai/functionary-small-v2.4", + "MeetKai", + "MIT", + ], + "meetkai/functionary-medium-v2.4-FC": [ + "Functionary-Medium-v2.4 (FC)", + "https://huggingface.co/meetkai/functionary-medium-v2.4", + "MeetKai", + "MIT", + ], + "claude-2.1": [ + "Claude-2.1 (Prompt)", + "https://www.anthropic.com/news/claude-2-1", + "Anthropic", + "Proprietary", + ], + "mistral-tiny-2312": [ + "Mistral-tiny-2312 (Prompt)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "claude-instant-1.2": [ + "Claude-instant-1.2 (Prompt)", + "https://www.anthropic.com/news/releasing-claude-instant-1-2", + "Anthropic", + "Proprietary", + ], + "mistral-small-2402-FC-Auto": [ + "Mistral-small-2402 (FC Auto)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-large-2402-FC-Any": [ + "Mistral-large-2402 (FC Any)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-small-2402-FC-Any": [ + "Mistral-small-2402 (FC 
Any)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-large-2402-FC-Auto": [ + "Mistral-large-2402 (FC Auto)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "Nexusflow-Raven-v2": [ + "Nexusflow-Raven-v2 (FC)", + "https://huggingface.co/Nexusflow/NexusRaven-V2-13B", + "Nexusflow", + "Apache 2.0", + ], + "firefunction-v1-FC": [ + "FireFunction-v1 (FC)", + "https://huggingface.co/fireworks-ai/firefunction-v1", + "Fireworks", + "Apache 2.0", + ], + "firefunction-v2-FC": [ + "FireFunction-v2 (FC)", + "https://huggingface.co/fireworks-ai/firefunction-v2", + "Fireworks", + "Apache 2.0", + ], + "gemini-1.5-pro-preview-0514": [ + "Gemini-1.5-Pro-Preview-0514 (FC)", + "https://deepmind.google/technologies/gemini/pro/", + "Google", + "Proprietary", + ], + "gemini-1.5-flash-preview-0514": [ + "Gemini-1.5-Flash-Preview-0514 (FC)", + "https://deepmind.google/technologies/gemini/flash/", + "Google", + "Proprietary", + ], + "gemini-1.5-pro-preview-0409": [ + "Gemini-1.5-Pro-Preview-0409 (FC)", + "https://deepmind.google/technologies/gemini/#introduction", + "Google", + "Proprietary", + ], + "gemini-1.0-pro": [ + "Gemini-1.0-Pro-001 (FC)", + "https://deepmind.google/technologies/gemini/#introduction", + "Google", + "Proprietary", + ], + "gpt-4-0613-FC": [ + "GPT-4-0613 (FC)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-0613": [ + "GPT-4-0613 (Prompt)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "deepseek-ai/deepseek-coder-6.7b-instruct": [ + "Deepseek-v1.5 (Prompt)", + "https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5", + "Deepseek", + "Deepseek License", + ], + "google/gemma-7b-it": [ + "Gemma-7b-it (Prompt)", + "https://blog.google/technology/developers/gemma-open-models/", + "Google", + "gemma-terms-of-use", + ], + "glaiveai/glaive-function-calling-v1": [ + "Glaive-v1 (FC)", + "https://huggingface.co/glaiveai/glaive-function-calling-v1", + "Glaive", + "cc-by-sa-4.0", + ], + "databricks-dbrx-instruct": [ + "DBRX-Instruct (Prompt)", + "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm", + "Databricks", + "Databricks Open Model", + ], + "NousResearch/Hermes-2-Pro-Mistral-7B": [ + "Hermes-2-Pro-Mistral-7B (FC)", + "https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B", + "NousResearch", + "apache-2.0", + ], + "meta-llama/Meta-Llama-3-8B-Instruct": [ + "Meta-Llama-3-8B-Instruct (Prompt)", + "https://llama.meta.com/llama3", + "Meta", + "Meta Llama 3 Community", + ], + "meta-llama/Meta-Llama-3-70B-Instruct": [ + "Meta-Llama-3-70B-Instruct (Prompt)", + "https://llama.meta.com/llama3", + "Meta", + "Meta Llama 3 Community", + ], + "command-r-plus-FC": [ + "Command-R-Plus (FC) (Original)", + "https://txt.cohere.com/command-r-plus-microsoft-azure", + "Cohere For AI", + "cc-by-nc-4.0", + ], + "command-r-plus": [ + "Command-R-Plus (Prompt) (Original)", + "https://txt.cohere.com/command-r-plus-microsoft-azure", + "Cohere For AI", + "cc-by-nc-4.0", + ], + "command-r-plus-FC-optimized": [ + "Command-R-Plus (FC) (Optimized)", + "https://txt.cohere.com/command-r-plus-microsoft-azure", + "Cohere For AI", + "cc-by-nc-4.0", + ], + "command-r-plus-optimized": [ + "Command-R-Plus (Prompt) (Optimized)", + "https://txt.cohere.com/command-r-plus-microsoft-azure", + "Cohere For AI", + "cc-by-nc-4.0", + ], + "snowflake/arctic": [ + 
"Snowflake/snowflake-arctic-instruct (Prompt)", + "https://huggingface.co/Snowflake/snowflake-arctic-instruct", + "Snowflake", + "apache-2.0", + ], + "nvidia/nemotron-4-340b-instruct": [ + "Nemotron-4-340b-instruct (Prompt)", + "https://huggingface.co/nvidia/nemotron-4-340b-instruct", + "NVIDIA", + "nvidia-open-model-license" + ], + "THUDM/glm-4-9b-chat": [ + "GLM-4-9b-Chat (FC)", + "https://huggingface.co/THUDM/glm-4-9b-chat", + "THUDM", + "glm-4" + ] +} \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py new file mode 100644 index 000000000..1a1036558 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py @@ -0,0 +1,474 @@ +import json +import warnings +from pathlib import Path +from typing import List, Dict, Any + +import pandas as pd +from tqdm import tqdm +from pydantic import BaseModel + +import bfcl.types as types +from bfcl.model_handler.base import BaseHandler +from bfcl.evaluator.metrics import LeaderboardModelMetrics +from bfcl.evaluator import checker, utils as evaluator_utils +from bfcl.evaluator.constants import MODEL_METADATA_MAPPING + + +class FailedResult(BaseModel): + example_id: str + test_category: str + is_valid: bool + error_type: str + error_message: str + llm_response: str + decoded_result: Any | None = None + + class Config: + extra = 'allow' + + +class LeaderboardEvaluator: + def __init__( + self, + model_handler: BaseHandler, + leaderboard: types.Leaderboard, + perform_api_sanity_check: bool + ) -> None: + self.model_name = model_handler.model_name + self.model_handler = model_handler + self.leaderboard = leaderboard + self.perform_api_sanity_check = perform_api_sanity_check + self.test_category_to_data = leaderboard.load_test_data() + + self._executable_checker = None + self._ast_checker = None + self._model_metrics = LeaderboardModelMetrics(self.model_name) + self._test_category_to_metrics = {} + + def __call__(self, file_path: Path, test_category) -> None: + model_responses = self.model_handler.load_model_responses(file_path.name) + if model_responses is None: + print(f'Skipping evaluation of test category "{test_category.value}" due to empty model responses!') + return + + print('🔍 Running test:', test_category.value) + self._model_metrics(model_responses) + + result = None + if test_category.value == types.LeaderboardCategory.RELEVANCE.value: + result = self.run_relevance_evaluator(model_responses) + elif test_category.value in types.LeaderboardExecutableCategory: + if self._executable_checker is None: + self._executable_checker = checker.ExecutableChecker(self.leaderboard.cache_dir) + if self.perform_api_sanity_check: + self._executable_checker.perform_api_sanity_checks() + result = self.run_executable_evaluator(test_category, model_responses) + elif test_category.value in types.LeaderboardAstCategory: + if self._ast_checker is None: + self._ast_checker = checker.AstChecker(self.model_name, self.leaderboard) + result = self.run_ast_evaluator(test_category, model_responses) + + if result: + accuracy = result['accuracy'] + self._test_category_to_metrics[test_category.value] = dict( + accuracy=accuracy, + total_count=result['total_count'] + ) + print(f"✅ Test completed: {test_category.value} | 🎯 Accuracy: {accuracy:.4f}") + + def generate_leaderboard_csv(self) -> None: + metrics = self._test_category_to_metrics + C = types.LeaderboardCategory + + python_simple_ast = metrics.get(C.SIMPLE.value, dict(accuracy=0, 
total_count=0)) + python_multiple_ast = metrics.get(C.MULTIPLE_FUNCTION.value, dict(accuracy=0, total_count=0)) + python_parallel_ast = metrics.get(C.PARALLEL_FUNCTION.value, dict(accuracy=0, total_count=0)) + python_parallel_multiple_ast = metrics.get(C.PARALLEL_MULTIPLE_FUNCTION.value, dict(accuracy=0, total_count=0)) + python_simple_exec = metrics.get(C.EXECUTABLE_SIMPLE.value, dict(accuracy=0, total_count=0)) + python_multiple_exec = metrics.get(C.EXECUTABLE_MULTIPLE_FUNCTION.value, dict(accuracy=0, total_count=0)) + python_parallel_exec = metrics.get(C.EXECUTABLE_PARALLEL_FUNCTION.value, dict(accuracy=0, total_count=0)) + python_parallel_multiple_exec = metrics.get(C.EXECUTABLE_PARALLEL_MULTIPLE_FUNCTION.value, dict(accuracy=0, total_count=0)) + java_simple_ast = metrics.get(C.JAVA.value, dict(accuracy=0, total_count=0)) + javascript_simple_ast = metrics.get(C.JAVASCRIPT.value, dict(accuracy=0, total_count=0)) + rest_simple_exec = metrics.get(C.REST.value, dict(accuracy=0, total_count=0)) + relevance = metrics.get(C.RELEVANCE.value, dict(accuracy=0, total_count=0)) + + simple_ast = evaluator_utils.calculate_weighted_accuracy( + [python_simple_ast, java_simple_ast, javascript_simple_ast] + ) + multiple_ast = python_multiple_ast + parallel_ast = python_parallel_ast + parallel_multiple_ast = python_parallel_multiple_ast + simple_exec = evaluator_utils.calculate_weighted_accuracy( + [python_simple_exec, rest_simple_exec] + ) + multiple_exec = python_multiple_exec + parallel_exec = python_parallel_exec + parallel_multiple_exec = python_parallel_multiple_exec + + summary_ast = evaluator_utils.calculate_unweighted_accuracy( + [simple_ast, multiple_ast, parallel_ast, parallel_multiple_ast] + ) + summary_exec = evaluator_utils.calculate_unweighted_accuracy( + [simple_exec, multiple_exec, parallel_exec, parallel_multiple_exec] + ) + overall_accuracy = evaluator_utils.calculate_weighted_accuracy( + [ + simple_ast, + multiple_ast, + parallel_ast, + parallel_multiple_ast, + simple_exec, + multiple_exec, + parallel_exec, + parallel_multiple_exec, + relevance, + ] + ) + + # if overall_accuracy["total_count"] != 1700: + # print("-" * 100) + # print(f"❗️Warning: Total count for {self.model_name} is {overall_accuracy['total_count']}") + + # Model metrics - cost, mean_latency, std_latency, p95_latency + model_metrics = self._model_metrics.compute() + model_metadata = MODEL_METADATA_MAPPING.get(self.model_name) + if model_metadata is None: + warnings.warn( + f'Metadata not found for the model "{self.model_name}"! ' + 'Please add your model metadata in the `MODEL_METADATA_MAPPING` variable ' + 'in the `bfcl/evaluator/constants.py` file.' + ) + + f_acc = lambda acc: "{:.2f}%".format(acc * 100) + rv_f_acc = lambda acc_str: float(acc_str.replace('%', '')) / 100 + + row = { + "Rank": 0, # Temporary value of 0. Updated below. 
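The `calculate_weighted_accuracy` and `calculate_unweighted_accuracy` helpers used above live in `bfcl/evaluator/utils.py`; a plausible sketch of the arithmetic the summary columns rely on (an assumption about those helpers, not their actual implementation):

    from typing import Dict, List

    def calculate_weighted_accuracy(entries: List[Dict]) -> Dict:
        # Assumed behaviour: weight each category's accuracy by its example count.
        total = sum(e["total_count"] for e in entries)
        correct = sum(e["accuracy"] * e["total_count"] for e in entries)
        return dict(accuracy=correct / total if total else 0, total_count=total)

    def calculate_unweighted_accuracy(entries: List[Dict]) -> Dict:
        # Assumed behaviour: plain average over categories, counts summed for bookkeeping.
        total = sum(e["total_count"] for e in entries)
        accuracy = sum(e["accuracy"] for e in entries) / len(entries) if entries else 0
        return dict(accuracy=accuracy, total_count=total)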
+ "Overall Acc": f_acc(overall_accuracy["accuracy"]), + "Model": model_metadata[0] if model_metadata else self.model_name, + "Model Link": model_metadata[1] if model_metadata else "N/A", + "Organization": model_metadata[2] if model_metadata else "N/A", + "License": model_metadata[3] if model_metadata else "N/A", + "AST Summary": f_acc(summary_ast["accuracy"]), + "Exec Summary": f_acc(summary_exec["accuracy"]), + "Simple Function AST": f_acc(simple_ast["accuracy"]), + "Python Simple Function AST": f_acc(python_simple_ast["accuracy"]), + "Java Simple Function AST": f_acc(java_simple_ast["accuracy"]), + "JavaScript Simple Function AST": f_acc(javascript_simple_ast["accuracy"]), + "Multiple Functions AST": f_acc(multiple_ast["accuracy"]), + "Parallel Functions AST": f_acc(parallel_ast["accuracy"]), + "Parallel Multiple AST": f_acc(parallel_multiple_ast["accuracy"]), + "Simple Function Exec": f_acc(simple_exec["accuracy"]), + "Python Simple Function Exec": f_acc(python_simple_exec["accuracy"]), + "REST Simple Function Exec": f_acc(rest_simple_exec["accuracy"]), + "Multiple Functions Exec": f_acc(multiple_exec["accuracy"]), + "Parallel Functions Exec": f_acc(parallel_exec["accuracy"]), + "Parallel Multiple Exec": f_acc(parallel_multiple_exec["accuracy"]), + "Relevance Detection": f_acc(relevance["accuracy"]), + "Cost ($ Per 1k Function Calls)": str(model_metrics['cost']), + "Latency Mean (s)": str(model_metrics['mean_latency']), + "Latency Standard Deviation (s)": str(model_metrics['std_latency']), + "Latency 95th Percentile (s)": str(model_metrics['p95_latency']), + } + + df_new = pd.DataFrame([row]) + file_path = self.model_handler.result_dir / 'BFCL_leaderboard_result.csv' + if file_path.exists(): + print('Found existing BFCL leaderboard file! Loading...') + existing_df = pd.read_csv(file_path, dtype=str) + + # Check if model name already exists + if df_new["Model"].iloc[0] in existing_df["Model"].values: + print('Model already exists. Overwriting the row...') + existing_df.loc[existing_df["Model"] == df_new["Model"].iloc[0], :] = df_new.values + else: + print('Appending new model to the existing dataframe...') + existing_df = pd.concat((existing_df, df_new), ignore_index=True) + df = existing_df + else: + print('No existing BFCL leaderboard file found. Creating a new one...') + df = df_new + + df["Overall Acc"] = df["Overall Acc"].apply(rv_f_acc) + df.sort_values("Overall Acc", ascending=False, inplace=True) + df["Overall Acc"] = df["Overall Acc"].apply(f_acc) + df['Rank'] = list(range(1, len(df) + 1)) + + df.to_csv(file_path, index=False) + print(f'🔒 Saved BFCL leaderboard result at "{file_path}".') + + def run_relevance_evaluator(self, model_responses: List[Dict]) -> Dict: + """Run function relevance detection. + + In relevance detection, we design a scenario where none of the provided functions + are relevant and supposed to be invoked. 
We expect the model's output to be no + function call.""" + + failed_model_responses = [] + correct_count = 0 + for response in tqdm(model_responses, total=len(model_responses), desc="Evaluating"): + model_response = response['response'] + success = False + decoded_result = None + try: + decoded_result = self.model_handler.decode_ast(model_response, language='python') + success = evaluator_utils.is_empty_output(decoded_result) + except Exception: + success = True + + if success: + correct_count += 1 + else: + result = FailedResult( + example_id=response['id'], + test_category=types.LeaderboardCategory.RELEVANCE.value, + is_valid=False, + error_type='relevance_error:decoder_success', + error_message='Valid syntax. Successfully decode AST when it should not.', + llm_response=model_response, + decoded_result=decoded_result, + ) + failed_model_responses.append(result) + + result = dict( + accuracy=correct_count / len(model_responses), + correct_count=correct_count, + total_count=len(model_responses), + failed_model_responses=failed_model_responses, + ) + self._save_scores(types.LeaderboardCategory.RELEVANCE, result) + return result + + def run_executable_evaluator( + self, + test_category: types.LeaderboardCategory, + model_responses: List[Dict] + ) -> Dict: + """Run executable function/API evaluator. + + Invoke function or API for the documentation provided. The accuracy + is measured by actually running the function call with function + source code loaded.""" + + test_data = self.test_category_to_data[test_category] + assert len(model_responses) == len(test_data) + test_example_id_to_data = {} + if test_category.value != types.LeaderboardExecutableCategory.REST.value: + print(f"---- Getting real-time execution result from ground truth for '{test_category.value}' ----") + exec_dict = {} + for item in tqdm(test_data, desc="Getting Executable Expected Output"): + execution_result = item.get('execution_result') + if execution_result is None or not all(execution_result): # Check if cached value is None then try again. + execution_result = [] + ground_truth = item["ground_truth"] + for i in range(len(ground_truth)): + exec( + "from bfcl.evaluator.checker.executable.exec_python_functions import *" + + "\nresult=" + ground_truth[i], + exec_dict, + ) + execution_result.append(exec_dict["result"]) + item["execution_result"] = execution_result + test_example_id_to_data[item['id']] = item + + # Save the test dataset with the added `execution_result` key + # TODO: Decide if you want to cache the execution results or not. + # Edge case: We don't validate the `execution_result` value, hence if the user didn't setup the + # environment variables correctly and we get incorrect `execution_result` from the + # `exec_python_functions`, those values will be cached. 
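To make the ground-truth execution step above concrete: each ground-truth entry is a plain Python call string that is executed against the wrappers in `exec_python_functions`, and whatever it returns becomes the `execution_result` that the model's output is later compared against. Roughly (the call string and its return value here are illustrative):

    exec_dict = {}
    ground_truth_call = "calculate_triangle_area(base=10, height=5)"  # illustrative ground-truth entry
    exec(
        "from bfcl.evaluator.checker.executable.exec_python_functions import *"
        "\nresult=" + ground_truth_call,
        exec_dict,
    )
    execution_result = exec_dict["result"]  # e.g. 25.0 if the wrapper computes 0.5 * base * height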
+ file_path = self.leaderboard.test_data_cache_dir / self.leaderboard.get_file_name(test_category) + with open(file_path, 'w') as file: + for line in test_data: + file.write(json.dumps(line) + '\n') + + print(f"---- Ground truth real-time execution result obtained for '{test_category.value}' 🌟 ----") + + failed_model_responses = [] + correct_count = 0 + for idx, response in tqdm(enumerate(model_responses), total=len(model_responses), desc="Evaluating"): + model_response = response['response'] + try: + decoded_result = self.model_handler.decode_execute(model_response) + except Exception as e: + result = FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_type='executable_decoder:decoder_failed', + error_message=f"Failed to decode executable. {str(e)}", + llm_response=model_response, + decoded_result=decoded_result, + ) + failed_model_responses.append(result) + continue + + if test_category.value == types.LeaderboardExecutableCategory.REST.value: + # REST is always single-functioned. Therefore we take the first one and pass + # it to the REST checker. + if not evaluator_utils.is_rest_format_output(decoded_result): + result = FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_type='executable_decoder:rest_wrong_output_format', + error_message=( + 'Did not output in the specified format. Note: the model_result is wrapped in a ' + 'string to ensure json serializability.' + ), + llm_response=str(model_response), + decoded_result=str(decoded_result), + ) + failed_model_responses.append(result) + continue + + checker_result = self._executable_checker.rest_executable_checker( + decoded_result[0], + eval_ground_truth=self._executable_checker.rest_eval_response_data[idx] + ) + else: + if not evaluator_utils.is_executable_format_output(decoded_result): + result = FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_type='executable_decoder:wrong_output_format', + error_message=( + 'Did not output in the specified format. Note: the model_result is wrapped in a ' + 'string to ensure json serializability.' 
+ ), + llm_response=str(model_response), + decoded_result=str(decoded_result), + ) + failed_model_responses.append(result) + continue + + checker_result = self._executable_checker.executable_checker( + decoded_result, + test_example_id_to_data[response['id']], + test_category + ) + + if checker_result.is_valid: + correct_count += 1 + else: + result = FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=checker_result.is_valid, + error_type=checker_result.error_type, + error_message=checker_result.error_message, + llm_response=model_response, + decoded_result=decoded_result, + ) + if hasattr(checker_result, "model_executed_output"): + result.model_executed_output = checker_result.model_executed_output + failed_model_responses.append(result) + + result = dict( + accuracy=correct_count / len(model_responses), + correct_count=correct_count, + total_count=len(model_responses), + failed_model_responses=failed_model_responses, + ) + self._save_scores(test_category, result) + return result + + def run_ast_evaluator( + self, + test_category: types.LeaderboardCategory, + model_responses: List[Dict] + ) -> Dict: + + self._ast_checker.load_possible_answers(test_category) + test_data = self.test_category_to_data[test_category] + possible_answers = self._ast_checker.test_category_to_possible_ans[test_category] + language = self._ast_checker.get_language(test_category) + assert len(model_responses) == len(test_data) == len(possible_answers), ( + "No. of the model responses does not match the no. of test data or " + "no. of possible answers. Please check the input files for completeness." + ) + + test_example_id_to_data = {data['id']: data for data in test_data} + failed_model_responses = [] + correct_count = 0 + for idx, response in tqdm(enumerate(model_responses), total=len(model_responses), desc="Evaluating"): + model_result_item = response['response'] + possible_answer_item = possible_answers[idx] + + try: + model_result_item_raw = model_result_item + model_result_item = self.model_handler.decode_ast(model_result_item, language) + except Exception as e: + failed_model_responses.append( + FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_message=f"Invalid syntax. Failed to decode AST. {str(e)}", + error_type="ast_decoder:decoder_failed", + llm_response=model_result_item_raw, + possible_answer=possible_answer_item, + ) + ) + continue + + decoder_output_valid = evaluator_utils.is_function_calling_format_output(model_result_item) + if not decoder_output_valid: + failed_model_responses.append( + FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_message="Did not output in the specified format. 
Note: the model_result is wrapped in a string to ensure json serializability.", + error_type="ast_decoder:decoder_wrong_output_format", + llm_response=str(model_result_item_raw), + decoded_result=str(model_result_item), + possible_answer=possible_answer_item, + ) + ) + continue + + checker_result = self._ast_checker( + idx, + test_example_id_to_data[response['id']]['function'], + model_result_item, + test_category, + ) + + if checker_result.is_valid: + correct_count += 1 + else: + failed_model_responses.append( + FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=checker_result.is_valid, + error_message=checker_result.error_message, + error_type=checker_result.error_type, + llm_response=model_result_item_raw, + decoded_result=model_result_item, + possible_answer=possible_answer_item, + ) + ) + + result = dict( + accuracy=correct_count / len(model_responses), + correct_count=correct_count, + total_count=len(model_responses), + failed_model_responses=failed_model_responses, + ) + self._save_scores(test_category, result) + return result + + def _save_scores(self, test_category, result) -> None: + if ( + (failed_model_responses := result.get('failed_model_responses')) + and isinstance(failed_model_responses[0], FailedResult) + ): + result['failed_model_responses'] = [rp.model_dump() for rp in failed_model_responses] + + file_name = self.leaderboard.get_file_name(test_category).replace('.json', '_score.json') + file_path = self.model_handler.model_dir / file_name + file_path.write_text(json.dumps(result, indent=2)) + print(f'Saved {test_category.value} evaluation result at "{file_path}".') \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/metrics.py b/berkeley-function-call-leaderboard/bfcl/evaluator/metrics.py new file mode 100644 index 000000000..d021260c9 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/metrics.py @@ -0,0 +1,75 @@ +from typing import Dict, List + +import numpy as np + +from bfcl.evaluator import constants + + +class LeaderboardModelMetrics: + def __init__(self, model_name: str) -> None: + self.model_name = model_name + self._init_metrics() + + def _init_metrics(self) -> None: + self._metrics = dict( + cost=dict(input_tokens=[], output_tokens=[]), + latency=[], + ) + + def reset(self) -> None: + self._init_metrics() + + def compute(self) -> Dict: + cost = mean_latency = std_latency = p95_latency = 'N/A' + if ( + self.model_name in constants.INPUT_PRICE_PER_MILLION_TOKEN + and len(self._metrics['cost']['input_tokens']) > 0 + and len(self._metrics['cost']['output_tokens']) > 0 + ): + mean_input_tokens = np.mean(self._metrics['cost']['input_tokens']) + mean_output_tokens = np.mean(self._metrics['cost']['output_tokens']) + cost = ( + mean_input_tokens * constants.INPUT_PRICE_PER_MILLION_TOKEN[self.model_name] + + mean_output_tokens * constants.OUTPUT_PRICE_PER_MILLION_TOKEN[self.model_name] + ) / 1000 + + if self.model_name in constants.OSS_LATENCY: + mean_latency = round(constants.OSS_LATENCY[self.model_name] / 1700, 2) + cost = mean_latency * 1000 * constants.V100_x8_PRICE_PER_HOUR / 3600 + elif len(self._metrics['latency']) != 0: + mean_latency = np.mean(self._metrics['latency']) + std_latency = np.std(self._metrics['latency']) + p95_latency = np.percentile(self._metrics['latency'], 95) + mean_latency = round(mean_latency, 2) + std_latency = round(std_latency, 2) + p95_latency = round(p95_latency, 2) + + if self.model_name not in constants.INPUT_PRICE_PER_MILLION_TOKEN: + cost 
= sum(self._metrics['latency']) * constants.V100_x8_PRICE_PER_HOUR / 3600 + cost = round(cost, 2) + + if self.model_name in constants.NO_COST_MODELS: + cost = 'N/A' + elif isinstance(cost, float): + cost = round(cost, 2) + + computed_metrics = dict( + cost=cost, + mean_latency=mean_latency, + std_latency=std_latency, + p95_latency=p95_latency + ) + return computed_metrics + + def __call__(self, model_responses: List[Dict]) -> None: + for response in model_responses: + if (latency := response.get('latency')): + self._metrics['latency'].append(latency) + if latency > 60: + print("*" * 100) + print(f"❗️Warning: Latency for a model '{self.model_name}' response is {latency:.4f}.") + print("*" * 100) + if (input_tokens := response.get('input_tokens')) and input_tokens != 0: + self._metrics['cost']['input_tokens'].append(input_tokens) + if (output_tokens := response.get('output_tokens')) and output_tokens != 0: + self._metrics['cost']['output_tokens'].append(output_tokens) diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py new file mode 100644 index 000000000..54c7b09f9 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py @@ -0,0 +1,88 @@ +def is_empty_output(decoded_output) -> bool: + # This function is a patch to the ast decoder for relevance detection. + # Sometimes the ast decoder will parse successfully, but the input doesn't + # really have a function call. + # [], [{}], and anything that is not in function calling format is considered + # empty (and thus should be marked as correct). + if ( + not is_function_calling_format_output(decoded_output) + or len(decoded_output) == 0 + or (len(decoded_output) == 1 and len(decoded_output[0]) == 0) + ): + return True + +def is_function_calling_format_output(decoded_output): + # Ensure the output is a list of dictionaries + if isinstance(decoded_output, list): + for item in decoded_output: + if not isinstance(item, dict): + return False + return True + return False + +def display_api_status_error(rest_error, executable_error, display_success=False): + if not rest_error and not executable_error: + if display_success: + print("🟢 All API Status Test Passed!") + return None + + RED_FONT = "\033[91m" + RESET = "\033[0m" + + print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n") + + if rest_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test group (REST). Please contact API provider.\n") + print(f"{rest_error.error_rate} APIs affected:\n") + for data, status in rest_error.errors: + print(f" - Test Case: {data['ground_truth']}") + print(f" Error Type: {status['error_type']}\n") + + if executable_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test group (Non-REST). 
Please contact API provider.\n") + print(f"{executable_error.error_rate} APIs affected:\n") + for data, status in executable_error.errors: + print(f" - Test Case: {data['ground_truth'][0]}") + print(f" Error Type: {status['error_type']}\n") + + print(f"{RED_FONT}{'-' * 100}\n{RESET}") + +def is_rest_format_output(decoded_output): + # Ensure the output is a list of one string + if type(decoded_output) == list: + if len(decoded_output) == 1 and type(decoded_output[0]) == str: + return True + return False + +def is_executable_format_output(decoded_output): + # Ensure the output is a list of strings (one or more strings) + if type(decoded_output) == list: + if len(decoded_output) == 0: + return False + for item in decoded_output: + if type(item) != str: + return False + return True + return False + +def calculate_weighted_accuracy(accuracy_dict_list): + total_count = 0 + total_accuracy = 0 + for accuracy_dict in accuracy_dict_list: + total_count += accuracy_dict["total_count"] + total_accuracy += accuracy_dict["accuracy"] * accuracy_dict["total_count"] + + if total_count == 0: + return {"accuracy": 0, "total_count": 0} + + return {"accuracy": total_accuracy / total_count, "total_count": total_count} + +def calculate_unweighted_accuracy(accuracy_dict_list): + total_accuracy = 0 + for accuracy_dict in accuracy_dict_list: + total_accuracy += accuracy_dict["accuracy"] + + if len(accuracy_dict_list) == 0: + return {"accuracy": 0, "total_count": 0} + + return {"accuracy": total_accuracy / len(accuracy_dict_list), "total_count": 0} \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/llm_generation.py b/berkeley-function-call-leaderboard/bfcl/llm_generation.py new file mode 100644 index 000000000..68d355f72 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/llm_generation.py @@ -0,0 +1,54 @@ +import json +import argparse + +from tqdm import tqdm + +from bfcl.types import Leaderboard +from bfcl.model_handler.base import ModelStyle, BaseHandler + + +def collect_model_responses( + leaderboard: Leaderboard, + model_handler: BaseHandler, + args: argparse.Namespace +) -> None: + + test_category_to_data = leaderboard.load_test_data() + get_file_name = lambda cat: leaderboard.get_file_name(cat).replace('.json', '_result.jsonl') + print('Getting model responses...') + if model_handler.model_style == ModelStyle.OSS_MODEL: + # Combine all samples to use GPUs efficiently + test_inputs = sum(test_category_to_data.values(), []) + combined_responses = model_handler.inference(inputs=test_inputs, num_gpus=args.num_gpus) + # Collect all the responses for each test category + test_category_to_responses = {} + for response in combined_responses: + test_category_to_responses.setdefault(response['test_category'], []).append(response) + # Save responses for each test category + for test_category, responses in test_category_to_responses.items(): + model_handler.write(responses, file_name=get_file_name(test_category)) + else: + # Proprietary models + for test_category, test_inputs in test_category_to_data.items(): + # Check if model responses are already available for the test category + file_name = get_file_name(test_category) + responses = model_handler.load_model_responses(file_name) + if responses is not None and len(responses) == len(test_inputs): + continue + response_ids = set(rp['id'] for rp in responses) if responses else None + file_path = model_handler.model_dir / file_name + with open(file_path, 'a+') as file: + for test_input in tqdm(test_inputs, total=len(test_inputs), 
desc=f'{test_category.value}'): + if response_ids and test_input['id'] in response_ids: + continue + # TODO: Handle rate limits + try: + response, metadata = model_handler.inference( + prompt=test_input['question'], + functions=test_input['function'], + test_category=test_category, + ) + row = dict(id=test_input['id'], response=response, **metadata) + file.write(json.dumps(row) + '\n') + except Exception as e: + print('Failed to get response! Error:', e) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/base.py b/berkeley-function-call-leaderboard/bfcl/model_handler/base.py new file mode 100644 index 000000000..5db2ea6d6 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/base.py @@ -0,0 +1,77 @@ +import json +from typing import List, Dict +from pathlib import Path +from enum import Enum +from abc import ABC, abstractmethod + + +class ModelStyle(str, Enum): + GORILLA = "gorilla" + OPENAI = "openai" + ANTHROPIC_FC = "claude" + ANTHROPIC_PROMPT = "claude" + MISTRAL = "mistral" + GOOGLE = "google" + COHERE = "cohere" + FIREWORK_AI = "firework_ai" + NEXUS = "nexus" + OSS_MODEL = "oss_model" + + +class BaseHandler(ABC): + model_style: str + + def __init__( + self, + model_name: str, + temperature: float = 0.7, + top_p: int = 1, + max_tokens: int = 1000, + ) -> None: + self.model_name = model_name + self.temperature = temperature + self.top_p = top_p + self.max_tokens = max_tokens + + self.result_dir = Path.cwd() / 'result' + self.result_dir.mkdir(exist_ok=True) + self.model_dir = self.result_dir / self.model_name.replace('/', '--') + self.model_dir.mkdir(exist_ok=True) + + @classmethod + @abstractmethod + def supported_models(cls) -> List[str]: + pass + + @abstractmethod + def inference(self): + """Fetch response from the model.""" + pass + + @abstractmethod + def decode_ast(self, result, language): + """Takes raw model output and converts it to the standard AST checker input.""" + pass + + @abstractmethod + def decode_execute(self, result): + """Takes raw model output and converts it to the standard execute checker input.""" + pass + + def write(self, responses: List[Dict], file_name: str) -> None: + """Write the model responses to the file.""" + + file_path = self.model_dir / file_name + with open(file_path, 'w') as file: + for response in responses: + file.write(json.dumps(response) + '\n') + print(f'Saved model responses at "{file_path}".') + + def load_model_responses(self, file_name: str) -> List[Dict] | None: + """Load the model responses if available.""" + + file_path = self.model_dir / file_name + if file_path.exists(): + with open(file_path, 'r') as f: + result = [json.loads(line) for line in f] + return result diff --git a/berkeley-function-call-leaderboard/model_handler/constant.py b/berkeley-function-call-leaderboard/bfcl/model_handler/constants.py similarity index 63% rename from berkeley-function-call-leaderboard/model_handler/constant.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/constants.py index ee34d8cff..d57d3e67f 100644 --- a/berkeley-function-call-leaderboard/model_handler/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/constants.py @@ -1,17 +1,18 @@ -USE_COHERE_OPTIMIZATION = False +import os -SYSTEM_PROMPT_FOR_CHAT_MODEL = """ - You are an expert in composing functions. 
You are given a question and a set of possible functions. - Based on the question, you will need to make one or more function/tool calls to achieve the purpose. - If none of the function can be used, point it out. If the given question lacks the parameters required by the function, - also point it out. You should only return the function call in tools call sections. - """ +USE_COHERE_OPTIMIZATION = os.getenv('USE_COHERE_OPTIMIZATION', False) + +SYSTEM_PROMPT_FOR_CHAT_MODEL = """\ +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the function can be used, point it out. If the given question lacks the parameters required by the function, +also point it out. You should only return the function call in tools call sections.""" + +USER_PROMPT_FOR_CHAT_MODEL = """\ +Questions:{user_input}\nHere is a list of functions in JSON format that you can invoke:\n{functions}. +Should you decide to return the function call(s),Put it in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)]\n +NO other text MUST be included.""" -USER_PROMPT_FOR_CHAT_MODEL = """ - Questions:{user_prompt}\nHere is a list of functions in JSON format that you can invoke:\n{functions}. - Should you decide to return the function call(s),Put it in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)]\n - NO other text MUST be included. -""" GORILLA_TO_OPENAPI = { "integer": "integer", "number": "number", @@ -144,19 +145,3 @@ "command-r-plus-FC", "command-r-plus-FC-optimized", ] - -TEST_CATEGORIES = { - "executable_simple": "gorilla_openfunctions_v1_test_executable_simple.json", - "executable_parallel_function": "gorilla_openfunctions_v1_test_executable_parallel_function.json", - "executable_multiple_function": "gorilla_openfunctions_v1_test_executable_multiple_function.json", - "executable_parallel_multiple_function": "gorilla_openfunctions_v1_test_executable_parallel_multiple_function.json", - "simple": "gorilla_openfunctions_v1_test_simple.json", - "relevance": "gorilla_openfunctions_v1_test_relevance.json", - "parallel_function": "gorilla_openfunctions_v1_test_parallel_function.json", - "multiple_function": "gorilla_openfunctions_v1_test_multiple_function.json", - "parallel_multiple_function": "gorilla_openfunctions_v1_test_parallel_multiple_function.json", - "java": "gorilla_openfunctions_v1_test_java.json", - "javascript": "gorilla_openfunctions_v1_test_javascript.json", - "rest": "gorilla_openfunctions_v1_test_rest.json", - "sql": "gorilla_openfunctions_v1_test_sql.json", -} diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py new file mode 100644 index 000000000..9c73992ba --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py @@ -0,0 +1,19 @@ +from .deepseek import DeepseekHandler +from .gemma import GemmaHandler +from .glaive import GlaiveHandler +from .hermes import HermesHandler +from .llama import LlamaHandler + +__all__ = [ + 'DeepseekHandler', + 'GemmaHandler', + 'GlaiveHandler', + 'HermesHandler', + 'LlamaHandler', +] + +MODEL_TO_HANDLER_CLS = {} +for handler_name in __all__: + handler_class = globals()[handler_name] + for model in handler_class.supported_models(): + MODEL_TO_HANDLER_CLS[model] = handler_class \ No newline at 
end of file diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py new file mode 100644 index 000000000..f27b1d2af --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py @@ -0,0 +1,95 @@ +import json +from typing import List, Dict + +import ray +import torch +from vllm import LLM, SamplingParams + +from bfcl.model_handler import utils +from bfcl.model_handler.base import BaseHandler, ModelStyle + + +class OssModelHandler(BaseHandler): + model_style = ModelStyle.OSS_MODEL + system_message = 'You are a helpful assistant with access to the following functions. Use them if required -' + prompt_template = 'SYSTEM: {system_message}\n{functions}\nUSER: {user_input}\nASSISTANT: ' + + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: + super().__init__(model_name, temperature, top_p, max_tokens) + self.sampling_params = SamplingParams( + temperature=self.temperature, + max_tokens=self.max_tokens, + top_p=self.top_p + ) + self._init_model() + + @classmethod + def supported_models(cls): + raise NotImplementedError + + def _init_model(self) -> None: + ray.init(ignore_reinit_error=True, num_cpus=8) + + def get_prompt(self, user_input, functions) -> str: + if isinstance(functions, list): + functions = json.dumps(functions) + return self.prompt_template.format( + system_message=self.system_message, + functions=functions, + user_input=user_input, + ) + + def inference(self, inputs, num_gpus) -> List[Dict]: + chunk_size = len(inputs) // num_gpus + futures = [] + for i in range(0, len(inputs), chunk_size): + futures.append( + self._batch_generate.remote( + inputs[i: i + chunk_size], + self.model_name, + self.sampling_params, + get_prompt_func=self.get_prompt, + ) + ) + responses = [] + for future in futures: + responses.extend(ray.get(future)) + return responses + + def decode_ast(self, result, language="python"): + func = result + if " " == func[0]: + func = func[1:] + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decode_output = utils.ast_parse(func, language) + return decode_output + + def decode_execute(self, result): + return result + + @ray.remote(num_gpus=1) + @torch.inference_mode() + def _batch_generate( + inputs, + model_path, + sampling_params: SamplingParams, + get_prompt_func + ): + prompts = [] + for _input in inputs: + test_category = _input['test_category'] + prompt = utils.augment_prompt_by_languge(_input['question'], test_category) + functions = utils.language_specific_pre_processing(_input['function'], test_category, False) + prompts.append(get_prompt_func(prompt, functions)) + + print(f'Getting responses for {len(prompts)} samples...') + llm = LLM(model=model_path, dtype='float16', trust_remote_code=True) + outputs = llm.generate(prompts, sampling_params) + responses = [ + dict(id=_input['id'], test_category=_input['test_category'], response=output.outputs[0].text) + for output, _input in zip(outputs, inputs) + ] + return responses diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/deepseek.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/deepseek.py new file mode 100644 index 000000000..32f93b081 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/deepseek.py @@ -0,0 +1,45 @@ +import re + +from bfcl.model_handler.utils import ast_parse +from bfcl.model_handler.oss_model.base import 
OssModelHandler + + +class DeepseekHandler(OssModelHandler): + system_message = ( + 'You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n' + '### Instruction:\n' + 'You are a helpful assistant with access to the following functions. Use them if required -' + ) + prompt_template = ( + '{system_message}\n' + '{functions}\n' + 'Here is the question you need to answer:\n' + '{user_input}\n' + 'Your job is to solve the above question using ONLY and strictly ONE line of python code given the above functions. If you think no function should be invoked return "[]".\n' + 'If you think one or more function should be invoked, return the function call in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)] wrapped in python code' + '### Response:\n' + ) + + @classmethod + def supported_models(cls): + return [ + 'deepseek-ai/deepseek-coder-6.7b-instruct', + ] + + def decode_ast(self, result, language="python"): + function_call = result.split("```")[1] + matches = re.findall(r"\[[^\]]*\]", function_call) + decoded_output = ast_parse(matches[0], language) + return decoded_output + + def decode_execute(self, result): + function_call = result.split("```")[1] + matches = re.findall(r"\[[^\]]*\]", function_call) + decoded_output = ast_parse(matches[0]) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/gemma.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/gemma.py new file mode 100644 index 000000000..95a698c25 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/gemma.py @@ -0,0 +1,44 @@ +import re + +from bfcl.model_handler.utils import ast_parse +from bfcl.model_handler.oss_model.base import OssModelHandler + + +class GemmaHandler(OssModelHandler): + prompt_template = ( + 'user\n' + '{system_message}\n' + '{functions}\n' + 'Here is the question you need to answer:\n' + '{user_input}\n' + 'Your job is to solve the above question using ONLY and strictly ONE line of python code given the above functions. 
If you think no function should be invoked return "[]".\n' + 'If you think one or more function should be invoked, return the function call in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)] wrapped in python code' + '\n' + 'model\n' + ) + + @classmethod + def supported_models(cls): + return [ + 'google/gemma-7b-it', + ] + + def decode_ast(self, result, language="python"): + match = re.search(r"\[(.*)\]", result, re.DOTALL) + raw_input = match.group(1) + func = "[" + raw_input + "]" + decoded_output = ast_parse(func, language=language) + return decoded_output + + def decode_execute(self, result): + match = re.search(r"\[(.*)\]", result, re.DOTALL) + raw_input = match.group(1) + func = "[" + raw_input + "]" + decoded_output = ast_parse(func) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/glaive.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/glaive.py new file mode 100644 index 000000000..08c28915b --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/glaive.py @@ -0,0 +1,32 @@ +import json + +from bfcl.model_handler.utils import convert_to_function_call +from bfcl.model_handler.oss_model.base import OssModelHandler + + +class GlaiveHandler(OssModelHandler): + prompt_template = 'SYSTEM: {system_message}\n{functions}\nUSER: {user_input}\n' + + @classmethod + def supported_models(cls): + return [ + 'glaiveai/glaive-function-calling-v1', + ] + + def decode_ast(self, result, language="python"): + function_call = result.split("")[-1] + function_call = function_call.replace("'", "") + decoded_function = json.loads(function_call) + for key, value in decoded_function["arguments"].items(): + if language.lower() != "python": + # all values of the json are casted to string for java and javascript + decoded_function["arguments"][key] = str(decoded_function["arguments"][key]) + decoded_result = [{decoded_function["name"]: decoded_function["arguments"]}] + return decoded_result + + def decode_execute(self, result): + function_call = result.split("")[-1] + function_call = function_call.replace("'", "") + decoded_function = json.loads(function_call) + decoded_result = [{decoded_function["name"]: decoded_function["arguments"]}] + return convert_to_function_call(decoded_result) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hermes.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hermes.py new file mode 100644 index 000000000..cbb316646 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hermes.py @@ -0,0 +1,96 @@ +import json + +from bfcl.model_handler.constants import GORILLA_TO_OPENAPI +from bfcl.model_handler.utils import convert_to_tool +from bfcl.model_handler.oss_model.base import OssModelHandler + + +class HermesHandler(OssModelHandler): + prompt_template = ( + '<|im_start|>system\n' + 'You are a function calling AI model. You are provided with function signatures within XML tags. ' + "You may call one or more functions to assist with the user query. Don't make assumptions about what values to " + 'plug into functions. Here are the available tools: {functions} Use the following pydantic model ' + 'json schema for each tool call you will make: {pydantic_func_schema}. 
' + 'For each function call return a json object with function name and arguments within XML tags as follows:\n' + '{{"arguments": , "name": }}<|im_end|>' + '<|im_start|>user\n{user_input}<|im_end|>' + ) + + @classmethod + def supported_models(cls): + return [ + 'NousResearch/Hermes-2-Pro-Mistral-7B', + ] + + def get_prompt(self, user_input, functions, test_category) -> str: + # Hermes use Langchain to OpenAI conversion. It does not use tool call but function call. + function = convert_to_tool(function, GORILLA_TO_OPENAPI, self.model_style, test_category, True) + pydantic_func_schema = { + "properties": { + "arguments": { + "title": "Arguments", + "type": "object" + }, + "name": { + "title": "Name", + "type": "string" + } + }, + "required": ["arguments", "name"], + "title": "FunctionCall", + "type": "object" + } + return self.prompt_template.format( + pydantic_func_schema=pydantic_func_schema, + functions=functions, + user_input=user_input, + ) + + def decode_ast(self, result, language="python"): + lines = result.split("\n") + flag = False + func_call = [] + for line in lines: + if "" == line: + flag = True + elif "" == line: + flag = False + else: + if flag: + line = line.replace("'", '"') + tool_result = json.loads(line) + if language.lower() != "python": + # all values of the json are casted to string for java and javascript + for key in tool_result["arguments"]: + tool_result["arguments"][key] = str( + tool_result["arguments"][key] + ) + func_call.append({tool_result["name"]: tool_result["arguments"]}) + flag = False + return func_call + + def decode_execute(self, result): + lines = result.split("\n") + flag = False + function_call_list = [] + for line in lines: + if "" == line: + flag = True + elif "" == line: + flag = False + else: + if flag: + line = line.replace("'", '"') + tool_result = json.loads(line) + function_call_list.append( + {tool_result["name"]: tool_result["arguments"]} + ) + flag = False + execution_list = [] + for function_call in function_call_list: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k,v in value.items()])})" + ) + return execution_list diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py new file mode 100644 index 000000000..26a1d3cb3 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py @@ -0,0 +1,45 @@ +from bfcl.model_handler import constants +from bfcl.model_handler.utils import ast_parse +from bfcl.model_handler.oss_model.base import OssModelHandler + + +class LlamaHandler(OssModelHandler): + system_message = constants.SYSTEM_PROMPT_FOR_CHAT_MODEL + prompt_template = ( + '<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_message}<|eot_id|><|start_header_id|>' + f'user<|end_header_id|>{constants.USER_PROMPT_FOR_CHAT_MODEL}' + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>' + ) + + @classmethod + def supported_models(cls): + return [ + 'meta-llama/Meta-Llama-3-8B-Instruct', + 'meta-llama/Meta-Llama-3-70B-Instruct', + ] + + def decode_ast(self, result, language="python"): + func = result + func = func.replace("\n", "") # remove new line characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decoded_output = ast_parse(func, language) + return decoded_output + + def decode_execute(self, result): + func = result + func = func.replace("\n", "") # remove new line 
characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decode_output = ast_parse(func) + execution_list = [] + for function_call in decode_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/parser/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/__init__.py new file mode 100644 index 000000000..6f4d4295c --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/__init__.py @@ -0,0 +1,7 @@ +from .java_parser import parse_java_function_call +from .javascript_parser import parse_javascript_function_call + +__all__ = [ + 'parse_java_function_call', + 'parse_javascript_function_call', +] diff --git a/berkeley-function-call-leaderboard/model_handler/java_parser.py b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/java_parser.py similarity index 95% rename from berkeley-function-call-leaderboard/model_handler/java_parser.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/parser/java_parser.py index 71118c0a9..40a9ebd09 100644 --- a/berkeley-function-call-leaderboard/model_handler/java_parser.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/java_parser.py @@ -1,13 +1,8 @@ -import json from tree_sitter import Language, Parser +import tree_sitter_java -Language.build_library( - # Store the library in the `build` directory - "build/tree_sitter.so", - # Include one or more languages - ["./tree-sitter-java"], -) -JAVA_LANGUAGE = Language("build/tree_sitter.so", "java") + +JAVA_LANGUAGE = Language(tree_sitter_java.language(), "java") parser = Parser() parser.set_language(JAVA_LANGUAGE) diff --git a/berkeley-function-call-leaderboard/model_handler/js_parser.py b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/javascript_parser.py similarity index 92% rename from berkeley-function-call-leaderboard/model_handler/js_parser.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/parser/javascript_parser.py index a3b60130a..2e1f83142 100644 --- a/berkeley-function-call-leaderboard/model_handler/js_parser.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/javascript_parser.py @@ -1,10 +1,8 @@ -import json from tree_sitter import Language, Parser +import tree_sitter_javascript -# Load your language grammar and create a parser -Language.build_library("build/tree_sitter_js.so", ["./tree-sitter-javascript"]) -JS_LANGUAGE = Language("build/tree_sitter_js.so", "javascript") +JS_LANGUAGE = Language(tree_sitter_javascript.language(), "javascript") parser = Parser() parser.set_language(JS_LANGUAGE) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py new file mode 100644 index 000000000..cca90711e --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py @@ -0,0 +1,34 @@ +from .anthropic import AnthropicFCHandler, AnthropicPromptHandler +from .cohere import CohereHandler +from .databricks import DatabricksHandler +from .firework_ai import FireworkAIHandler +from .functionary import FunctionaryHandler +from .gemini import GeminiHandler +from .gorilla import GorillaHandler +from .mistral import MistralHandler +from .nexus import NexusHandler +from .nvidia import 
NvidiaHandler +from .openai import OpenAIHandler +from .snowflake import SnowflakeHandler + +__all__ = [ + 'AnthropicFCHandler', + 'AnthropicPromptHandler', + 'CohereHandler', + 'DatabricksHandler', + 'FireworkAIHandler', + 'FunctionaryHandler', + 'GeminiHandler', + 'GorillaHandler', + 'MistralHandler', + 'NexusHandler', + 'NvidiaHandler', + 'OpenAIHandler', + 'SnowflakeHandler', +] + +MODEL_TO_HANDLER_CLS = {} +for handler_name in __all__: + handler_class = globals()[handler_name] + for model in handler_class.supported_models(): + MODEL_TO_HANDLER_CLS[model] = handler_class \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/__init__.py new file mode 100644 index 000000000..a0c393bde --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/__init__.py @@ -0,0 +1,7 @@ +from .handler import AnthropicFCHandler +from .prompt_handler import AnthropicPromptHandler + +__all__ = [ + 'AnthropicFCHandler', + 'AnthropicPromptHandler', +] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/handler.py new file mode 100644 index 000000000..09762720d --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/handler.py @@ -0,0 +1,81 @@ +import json +import time + +from anthropic.types import TextBlock, ToolUseBlock + +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import ModelStyle +from bfcl.model_handler.proprietary_model.anthropic.prompt_handler import AnthropicPromptHandler + + +class AnthropicFCHandler(AnthropicPromptHandler): + model_style = ModelStyle.ANTHROPIC_FC + + @classmethod + def supported_models(cls): + return [ + 'claude-3-opus-20240229-FC', + 'claude-3-sonnet-20240229-FC', + 'claude-3-5-sonnet-20240620-FC', + 'claude-3-haiku-20240307-FC', + ] + + def inference(self, prompt, functions, test_category): + if "FC" not in self.model_name: + return super().inference(prompt, functions, test_category) + + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, True) + if type(functions) is not list: + functions = [functions] + claude_tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_OPENAPI, self.model_style, test_category, True + ) + message = [{"role": "user", "content": prompt}] + start_time = time.time() + + response = self.client.messages.create( + model=self.model_name.strip("-FC"), + max_tokens=self.max_tokens, + tools=claude_tool, + messages=message, + ) + latency = time.time() - start_time + text_outputs = [] + tool_call_outputs = [] + for content in response.content: + if isinstance(content, TextBlock): + text_outputs.append(content.text) + elif isinstance(content, ToolUseBlock): + tool_call_outputs.append({content.name: json.dumps(content.input)}) + result = tool_call_outputs if tool_call_outputs else text_outputs[0] + return result, {"input_tokens": response.usage.input_tokens, "output_tokens": response.usage.output_tokens, "latency": latency} + + def decode_ast(self, result, language="python"): + if "FC" not in self.model_name: + decoded_output = utils.ast_parse(result,language) + else: + decoded_output = [] 
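+            # FC mode: `result` is the list built in `inference()` above, i.e. one single-key dict per
+            # tool call mapping the tool name to its JSON-encoded arguments. Each entry is decoded back
+            # into a Python dict (with argument values stringified for Java/JavaScript) for the AST checker.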
+ for invoked_function in result: + name = list(invoked_function.keys())[0] + params = json.loads(invoked_function[name]) + if language.lower() != "python": + for key in params: + params[key] = str(params[key]) + decoded_output.append({name: params}) + return decoded_output + + def decode_execute(self, result): + if "FC" not in self.model_name: + decoded_output = utils.ast_parse(result) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list + else: + function_call = utils.convert_to_function_call(result) + return function_call diff --git a/berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/prompt_handler.py similarity index 71% rename from berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/prompt_handler.py index 04ab78ef2..5d4934e6c 100644 --- a/berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/prompt_handler.py @@ -1,109 +1,53 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - convert_to_tool, - ast_parse, - augment_prompt_by_languge, - language_specific_pre_processing, - construct_tool_use_system_prompt, - _function_calls_valid_format_and_invoke_extraction, - _convert_value, -) -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, - GORILLA_TO_PYTHON, -) -import os, time +import os +import time + from anthropic import Anthropic +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle + -class ClaudePromptingHandler(BaseHandler): +class AnthropicPromptHandler(BaseHandler): + model_style = ModelStyle.ANTHROPIC_PROMPT + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Anthropic_Prompt - self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) - def _get_claude_function_calling_response(self, prompt, functions, test_category): - input_tool = convert_to_tool( - functions, GORILLA_TO_PYTHON, self.model_style, test_category, True - ) - system_prompt = construct_tool_use_system_prompt(input_tool) - start = time.time() - response = self.client.messages.create( - model=self.model_name.strip("-FC"), - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, - system=system_prompt, - messages=[{"role": "user", "content": prompt}], - ) - latency = time.time() - start - result = [] - if ( - "invokes" - not in _function_calls_valid_format_and_invoke_extraction( - response.content[0].text - ).keys() - ): - return "Error", {"input_tokens": 0, "output_tokens": 0, "latency": latency} - for invoked_function in _function_calls_valid_format_and_invoke_extraction( - response.content[0].text - )["invokes"]: - name = invoked_function["tool_name"] - select_func = None - for func in input_tool: - if func["name"] == name: - select_func = func - break - if select_func is None: - result.append({}) - continue - param_dict = {} - for param in 
invoked_function["parameters_with_values"]: - param_name = param[0] - param_value = param[1] - try: - param_type = select_func["parameters"]["properties"][param_name][ - "type" - ] - except: - param_type = "str" - param_value = _convert_value(param_value, param_type) - param_dict[param_name] = param_value - result.append({name: param_dict}) - metadata = {} - metadata["input_tokens"] = response.usage.input_tokens - metadata["output_tokens"] = response.usage.output_tokens - metadata["latency"] = latency - return result, metadata + @classmethod + def supported_models(cls): + return [ + 'claude-instant-1.2', + 'claude-2.1', + 'claude-3-opus-20240229', + 'claude-3-sonnet-20240229', + 'claude-3-5-sonnet-20240620', + 'claude-3-haiku-20240307', + ] def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) + prompt = utils.augment_prompt_by_languge(prompt, test_category) if "FC" in self.model_name: - functions = language_specific_pre_processing(functions, test_category, True) + functions = utils.language_specific_pre_processing(functions, test_category, True) result, metadata = self._get_claude_function_calling_response( prompt, functions, test_category ) return result, metadata else: start = time.time() - functions = language_specific_pre_processing( - functions, test_category, False - ) + functions = utils.language_specific_pre_processing(functions, test_category, False) response = self.client.messages.create( model=self.model_name, max_tokens=self.max_tokens, temperature=self.temperature, top_p=self.top_p, - system=SYSTEM_PROMPT_FOR_CHAT_MODEL, + system=constants.SYSTEM_PROMPT_FOR_CHAT_MODEL, messages=[ { "role": "user", - "content": USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + "content": constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), } ], ) @@ -115,9 +59,9 @@ def inference(self, prompt, functions, test_category): result = response.content[0].text return result, metadata - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): if "FC" in self.model_name: - if language == "Python": + if language.lower() == "python": return result else: # result is a list of dictionaries, make sure each value of dictionary is string @@ -134,7 +78,7 @@ def decode_ast(self, result, language="Python"): func = "[" + func if not func.endswith("]"): func = func + "]" - decode_output = ast_parse(func, language) + decode_output = utils.ast_parse(func, language) return decode_output def decode_execute(self, result): @@ -158,7 +102,7 @@ def decode_execute(self, result): func = "[" + func if not func.endswith("]"): func = func + "]" - decode_output = ast_parse(func) + decode_output = utils.ast_parse(func) execution_list = [] for function_call in decode_output: for key, value in function_call.items(): @@ -166,3 +110,51 @@ def decode_execute(self, result): f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" ) return execution_list + + def _get_claude_function_calling_response(self, prompt, functions, test_category): + input_tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_PYTHON, self.model_style, test_category, True + ) + system_prompt = utils.construct_tool_use_system_prompt(input_tool) + start = time.time() + response = self.client.messages.create( + model=self.model_name.strip("-FC"), + max_tokens=self.max_tokens, + temperature=self.temperature, + top_p=self.top_p, + system=system_prompt, + 
messages=[{"role": "user", "content": prompt}], + ) + latency = time.time() - start + result = [] + out = utils.function_calls_valid_format_and_invoke_extraction(response.content[0].text) + if "invokes" not in out.keys(): + return "Error", {"input_tokens": 0, "output_tokens": 0, "latency": latency} + for invoked_function in out["invokes"]: + name = invoked_function["tool_name"] + select_func = None + for func in input_tool: + if func["name"] == name: + select_func = func + break + if select_func is None: + result.append({}) + continue + param_dict = {} + for param in invoked_function["parameters_with_values"]: + param_name = param[0] + param_value = param[1] + try: + param_type = select_func["parameters"]["properties"][param_name][ + "type" + ] + except: + param_type = "str" + param_value = utils.convert_value(param_value, param_type) + param_dict[param_name] = param_value + result.append({name: param_dict}) + metadata = {} + metadata["input_tokens"] = response.usage.input_tokens + metadata["output_tokens"] = response.usage.output_tokens + metadata["latency"] = latency + return result, metadata diff --git a/berkeley-function-call-leaderboard/model_handler/cohere_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/cohere.py similarity index 63% rename from berkeley-function-call-leaderboard/model_handler/cohere_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/cohere.py index def3be47e..74038ce02 100644 --- a/berkeley-function-call-leaderboard/model_handler/cohere_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/cohere.py @@ -1,86 +1,88 @@ import os - -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - augment_prompt_by_languge, - language_specific_pre_processing, - convert_to_tool, - ast_parse, - convert_to_function_call, -) -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, - GORILLA_TO_PYTHON, -) import time + import cohere -from model_handler.constant import USE_COHERE_OPTIMIZATION +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle -class CohereHandler(BaseHandler): - client: cohere.Client +OPTIMIZED_PREAMBLE = """## Task & Context +You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you can use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.COHERE +When a question is irrelevant or unrelated to the available tools you should choose to directly answer. This is especially important when the question or available tools are about specialist subject like math or biology or physics: DO NOT ANSWER UNRELATED QUESTIONS. - self.client = cohere.Client(api_key=os.getenv("COHERE_API_KEY")) +## Style Guide +Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. +""" - # System prompt for function calling. 
- if USE_COHERE_OPTIMIZATION: - self.preamble = """## Task & Context - You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you can use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. +PREAMBLE = """## Task & Context +You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. - When a question is irrelevant or unrelated to the available tools you should choose to directly answer. This is especially important when the question or available tools are about specialist subject like math or biology or physics: DO NOT ANSWER UNRELATED QUESTIONS. +## Style Guide +Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. +""" - ## Style Guide - Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. - """ - else: - self.preamble = """ - ## Task & Context - You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. - ## Style Guide - Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. 
- """ +class CohereHandler(BaseHandler): + model_style = ModelStyle.COHERE + + def __init__( + self, + model_name, + temperature=0.7, + top_p=1, + max_tokens=1000, + use_cohere_optimization: bool = constants.USE_COHERE_OPTIMIZATION + ) -> None: + + super().__init__(model_name, temperature, top_p, max_tokens) + self.use_cohere_optimization = use_cohere_optimization + self.client = cohere.Client(api_key=os.getenv("COHERE_API_KEY")) + self.preamble = OPTIMIZED_PREAMBLE if use_cohere_optimization else PREAMBLE + + @classmethod + def supported_models(cls): + return [ + 'command-r-plus', + 'command-r-plus-FC', + 'command-r-plus-optimized', + 'command-r-plus-FC-optimized', + ] def inference(self, prompt, functions, test_category): if "FC" not in self.model_name: - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing( + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing( functions, test_category, False ) - message = USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ) + message = constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)) start_time = time.time() response = self.client.chat( message=message, model=self.model_name, temperature=self.temperature, max_tokens=self.max_tokens, - preamble=SYSTEM_PROMPT_FOR_CHAT_MODEL, + preamble=constants.SYSTEM_PROMPT_FOR_CHAT_MODEL, ) latency = time.time() - start_time result = response.text else: - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, True) + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, True) if type(functions) is not list: functions = [functions] message = prompt # Convert JSON schema into R+ compatible function calls. 
- cohere_tool = convert_to_tool( - functions, GORILLA_TO_PYTHON, self.model_style, test_category, True + cohere_tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_PYTHON, self.model_style, test_category, True ) start_time = time.time() if len(cohere_tool) > 0: try: - if USE_COHERE_OPTIMIZATION: + if self.use_cohere_optimization: response = self.client.chat( message=message, model=self.model_name.replace("-FC", ""), @@ -129,13 +131,13 @@ def inference(self, prompt, functions, test_category): metadata["latency"] = latency return result, metadata - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): if "FC" not in self.model_name: if not result.startswith("["): result = "[" + result if not result.endswith("]"): result = result + "]" - decoded_output = ast_parse(result, language) + decoded_output = utils.ast_parse(result, language) else: decoded_output = [] for invoked_function in result: @@ -144,7 +146,7 @@ def decode_ast(self, result, language="Python"): if language == "Python": pass else: - if USE_COHERE_OPTIMIZATION: + if self.use_cohere_optimization: # all values of the json are cast to string for java and javascript for key, value in params.items(): value = str(value) @@ -165,7 +167,7 @@ def decode_execute(self, result): result = "[" + result if not result.endswith("]"): result = result + "]" - decoded_output = ast_parse(result) + decoded_output = utils.ast_parse(result) execution_list = [] for function_call in decoded_output: for key, value in function_call.items(): diff --git a/berkeley-function-call-leaderboard/model_handler/databricks_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/databricks.py similarity index 57% rename from berkeley-function-call-leaderboard/model_handler/databricks_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/databricks.py index fa1201c6a..5b53c5765 100644 --- a/berkeley-function-call-leaderboard/model_handler/databricks_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/databricks.py @@ -1,42 +1,41 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import language_specific_pre_processing, ast_parse -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, - GORILLA_TO_OPENAPI, -) +import os +import re import time + from openai import OpenAI -import re + +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle class DatabricksHandler(BaseHandler): + model_style = ModelStyle.OPENAI + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - self.model_name = model_name - self.model_style = ModelStyle.OpenAI - self.temperature = temperature - self.top_p = top_p - self.max_tokens = max_tokens + super().__init__(model_name, temperature, top_p, max_tokens) + assert (api_key := os.getenv('DATABRICKS_API_KEY')), \ + 'Please provide your `DATABRICKS_API_KEY` in the .env file.' + assert (base_url := os.getenv('DATABRICKS_AZURE_ENDPOINT_URL')), \ + 'Please provide your `DATABRICKS_AZURE_ENDPOINT_URL` in the .env file.' + self.client = OpenAI(api_key=api_key, base_url=base_url) - # NOTE: To run the Databricks model, you need to provide your own Databricks API key and your own Azure endpoint URL. 
- self.client = OpenAI( - api_key="{YOUR_DATABRICKS_API_KEY}", - base_url="{YOUR_DATABRICKS_AZURE_ENDPOINT_URL}", - ) + @classmethod + def supported_models(cls): + return [ + 'databricks-dbrx-instruct', + ] def inference(self, prompt, functions, test_category): - functions = language_specific_pre_processing(functions, test_category, False) + functions = utils.language_specific_pre_processing(functions, test_category, False) if type(functions) is not list: functions = [functions] message = [ - {"role": "system", "content": SYSTEM_PROMPT_FOR_CHAT_MODEL}, + {"role": "system", "content": constants.SYSTEM_PROMPT_FOR_CHAT_MODEL}, { "role": "user", - "content": "Questions:" - + USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + "content": "Questions:" + constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), }, ] start_time = time.time() @@ -55,7 +54,7 @@ def inference(self, prompt, functions, test_category): metadata["latency"] = latency return result, metadata - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): func = re.sub(r"'([^']*)'", r"\1", result) func = func.replace("\n ", "") if not func.startswith("["): @@ -65,12 +64,12 @@ def decode_ast(self, result, language="Python"): if func.startswith("['"): func = func.replace("['", "[") try: - decode_output = ast_parse(func, language) + decode_output = utils.ast_parse(func, language) except: - decode_output = ast_parse(result, language) + decode_output = utils.ast_parse(result, language) return decode_output - def decode_execute(self, result, language="Python"): + def decode_execute(self, result, language="python"): func = re.sub(r"'([^']*)'", r"\1", result) func = func.replace("\n ", "") if not func.startswith("["): @@ -80,9 +79,9 @@ def decode_execute(self, result, language="Python"): if func.startswith("['"): func = func.replace("['", "[") try: - decode_output = ast_parse(func, language) + decode_output = utils.ast_parse(func, language) except: - decode_output = ast_parse(result, language) + decode_output = utils.ast_parse(result, language) execution_list = [] for function_call in decode_output: for key, value in function_call.items(): diff --git a/berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/firework_ai.py similarity index 67% rename from berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/firework_ai.py index 74895ef73..e4643c165 100644 --- a/berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/firework_ai.py @@ -1,38 +1,32 @@ -import json import os import time -from model_handler.constant import GORILLA_TO_OPENAPI -from model_handler.gpt_handler import OpenAIHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import convert_to_tool, language_specific_pre_processing from openai import OpenAI +from bfcl.model_handler.constants import GORILLA_TO_OPENAPI +from bfcl.model_handler.base import ModelStyle +from bfcl.model_handler.proprietary_model.openai import OpenAIHandler +from bfcl.model_handler.utils import convert_to_tool, language_specific_pre_processing + + class FireworkAIHandler(OpenAIHandler): - def __init__(self, model_name, temperature=0.0, top_p=1, max_tokens=1000) -> None: - 
super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.FIREWORK_AI - self.temperature = 0.0 + model_style = ModelStyle.FIREWORK_AI + def __init__(self, model_name, temperature=0.0, top_p=1, max_tokens=1000) -> None: + super().__init__(model_name=model_name, temperature=0.0, top_p=top_p, max_tokens=max_tokens) self.client = OpenAI( base_url="https://api.fireworks.ai/inference/v1", - api_key=os.getenv("FIRE_WORKS_API_KEY"), + api_key=os.getenv("FIREWORKS_API_KEY"), ) - def write(self, result, file_to_open): - # This method is used to write the result to the file. - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists(f"./result/{self.model_name}"): - os.mkdir(f"./result/{self.model_name}") - with open( - f"./result/{self.model_name}/" - + file_to_open.replace(".json", "_result.json"), - "a+", - ) as f: - f.write(json.dumps(result) + "\n") - + @classmethod + def supported_models(cls): + return [ + 'firefunction-v1-FC', + 'firefunction-v2-FC', + ] + def inference(self, prompt, functions, test_category): functions = language_specific_pre_processing(functions, test_category, True) if type(functions) is not list: diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/functionary.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/functionary.py new file mode 100644 index 000000000..7f34a8566 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/functionary.py @@ -0,0 +1,25 @@ +from openai import OpenAI + +from bfcl.model_handler.proprietary_model.openai import OpenAIHandler + + +# For setup instructions, please refer to https://github.com/MeetKai/functionary +class FunctionaryHandler(OpenAIHandler): + def __init__( + self, + model_name: str, + temperature: float = 0.7, + top_p: int = 1, + max_tokens: int = 1000, + ) -> None: + super().__init__(model_name, temperature, top_p, max_tokens) + self.client = OpenAI(base_url="http://localhost:8000/v1", api_key="functionary") + + @classmethod + def supported_models(cls): + return [ + 'meetkai/functionary-small-v2.2-FC', + 'meetkai/functionary-medium-v2.2-FC', + 'meetkai/functionary-small-v2.4-FC', + 'meetkai/functionary-medium-v2.4-FC', + ] diff --git a/berkeley-function-call-leaderboard/model_handler/gemini_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py similarity index 61% rename from berkeley-function-call-leaderboard/model_handler/gemini_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py index 328ba399b..0707b21e6 100644 --- a/berkeley-function-call-leaderboard/model_handler/gemini_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py @@ -1,25 +1,74 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - convert_to_tool, - convert_to_function_call, - augment_prompt_by_languge, - language_specific_pre_processing, -) -from model_handler.constant import GORILLA_TO_OPENAPI -import subprocess, requests, json, time +import subprocess +import time +import json +import os + +import requests + +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle class GeminiHandler(BaseHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: + model_style = 
ModelStyle.GOOGLE + + def __init__( + self, + model_name, + temperature=0.7, + top_p=1, + max_tokens=1000, + gcp_project_id: str | None = None + ) -> None: + super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Google + if gcp_project_id is None: + gcp_project_id = os.getenv('GEMINI_GCP_PROJECT_ID') + + assert gcp_project_id, ( + '`gcp_project_id` cannot be empty! To run the gemini model, you need to provide ' + 'your own GCP project ID, which can be found in the GCP console.' + ) + self.api_url = ( + f'https://us-central1-aiplatform.googleapis.com/v1beta1/projects/{gcp_project_id}/locations/us-central1/publishers/google/models/' + + self.model_name + + ":generateContent" + ) - def _query_gemini(self, user_query, functions): - """ - Query Gemini Pro model. - """ + @classmethod + def supported_models(cls): + return [ + 'gemini-1.0-pro', + 'gemini-1.5-pro-preview-0409', + 'gemini-1.5-pro-preview-0514', + 'gemini-1.5-flash-preview-0514', + ] + + def inference(self, prompt, functions, test_category): + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, True) + gemini_tool = utils.convert_to_tool(functions, constants.GORILLA_TO_OPENAPI, self.model_style, test_category, True) + result, metadata = self._query_gemini(prompt, gemini_tool) + return result, metadata + + def decode_ast(self, result, language="python"): + if type(result) is not list: + result = [result] + decoded_output = [] + for invoked_function in result: + name = list(invoked_function.keys())[0] + params = json.loads(invoked_function[name]) + if language != "Python": + for key in params: + params[key] = str(params[key]) + decoded_output.append({name: params}) + return decoded_output + + def decode_execute(self, result): + return utils.convert_to_function_call(result) + def _query_gemini(self, user_query, functions): token = subprocess.run( "gcloud auth print-access-token", check=False, @@ -41,19 +90,12 @@ def _query_gemini(self, user_query, functions): }, "tools": {"function_declarations": functions}, } - - # NOTE: To run the gemini model, you need to provide your own GCP project ID, which can be found in the GCP console. 
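# Hedged sketch, not part of the diff: the new GeminiHandler builds the Vertex
# AI endpoint once in __init__ from GEMINI_GCP_PROJECT_ID and reuses it for
# every call, fetching a short-lived access token via `gcloud`. The standalone
# snippet below shows roughly how such a request is assembled; the project id,
# model name, prompt, and payload shape are simplified placeholders rather
# than the handler's exact request body.
import json
import os
import subprocess

import requests

project_id = os.getenv("GEMINI_GCP_PROJECT_ID", "my-gcp-project")  # placeholder fallback
model_name = "gemini-1.0-pro"
api_url = (
    "https://us-central1-aiplatform.googleapis.com/v1beta1/"
    f"projects/{project_id}/locations/us-central1/"
    f"publishers/google/models/{model_name}:generateContent"
)

# The handler shells out to gcloud for an OAuth access token.
token = subprocess.run(
    "gcloud auth print-access-token",
    shell=True,
    capture_output=True,
    text=True,
).stdout.strip()

payload = {"contents": {"role": "user", "parts": {"text": "What's the weather in Boston?"}}}
response = requests.post(
    api_url,
    headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
    data=json.dumps(payload),
)
result = json.loads(response.content)
print(result.get("usageMetadata", {}))  # prompt/candidates token counts, when present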
- API_URL = "https://us-central1-aiplatform.googleapis.com/v1beta1/projects/{YOUR_GCP_PROJECT_ID_HERE}/locations/us-central1/publishers/google/models/" + self.model_name + ":generateContent" headers = { "Authorization": "Bearer " + token, "Content-Type": "application/json", } start = time.time() - response = requests.post( - API_URL, - headers=headers, - data=json.dumps(json_data), - ) + response = requests.post(self.api_url, headers=headers, data=json.dumps(json_data)) latency = time.time() - start result = json.loads(response.content) if "error" in result: @@ -77,14 +119,10 @@ def _query_gemini(self, user_query, functions): parts.append(part["text"]) result = parts metatdata = {} - metatdata["input_tokens"] = json.loads(response.content)["usageMetadata"][ - "promptTokenCount" - ] - metatdata["output_tokens"] = json.loads(response.content)["usageMetadata"][ - "candidatesTokenCount" - ] + metatdata["input_tokens"] = json.loads(response.content)["usageMetadata"]["promptTokenCount"] + metatdata["output_tokens"] = json.loads(response.content)["usageMetadata"]["candidatesTokenCount"] metatdata["latency"] = latency - except Exception as e: + except Exception: result = "Parsing error: " + json.dumps(result) metatdata = { "input_tokens": 0, @@ -92,28 +130,3 @@ def _query_gemini(self, user_query, functions): "latency": latency, } return result, metatdata - - def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, True) - gemini_tool = convert_to_tool( - functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True - ) - result, metadata = self._query_gemini(prompt, gemini_tool) - return result, metadata - - def decode_ast(self, result, language="Python"): - if type(result) is not list: - result = [result] - decoded_output = [] - for invoked_function in result: - name = list(invoked_function.keys())[0] - params = json.loads(invoked_function[name]) - if language != "Python": - for key in params: - params[key] = str(params[key]) - decoded_output.append({name: params}) - return decoded_output - - def decode_execute(self, result): - return convert_to_function_call(result) diff --git a/berkeley-function-call-leaderboard/model_handler/gorilla_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gorilla.py similarity index 73% rename from berkeley-function-call-leaderboard/model_handler/gorilla_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gorilla.py index 70fe0e54a..1585cfd4c 100644 --- a/berkeley-function-call-leaderboard/model_handler/gorilla_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gorilla.py @@ -1,17 +1,51 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - ast_parse, - augment_prompt_by_languge, - language_specific_pre_processing, -) -import requests, json, re, time +import json +import time + +import requests + +from bfcl.model_handler import utils +from bfcl.model_handler.base import BaseHandler, ModelStyle class GorillaHandler(BaseHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Gorilla + model_style = ModelStyle.GORILLA + + @classmethod + def supported_models(cls): + return [ + 'gorilla-openfunctions-v0', + 
'gorilla-openfunctions-v2', + ] + + def inference(self, prompt, functions, test_category): + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, False) + if type(functions) is not list: + functions = [functions] + try: + result, metadata = self._get_gorilla_response(prompt, functions) + except KeyboardInterrupt: + raise KeyboardInterrupt + except: + result = "Error" + metadata = {"input_tokens": 0, "output_tokens": 0, "latency": 0} + return result, metadata + + def decode_ast(self, result, language="python"): + func = "[" + result + "]" + decoded_output = utils.ast_parse(func, language) + return decoded_output + + def decode_execute(self, result): + func = "[" + result + "]" + decoded_output = utils.ast_parse(func) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list def _get_gorilla_response(self, prompt, functions): requestData = { @@ -40,33 +74,3 @@ def _get_gorilla_response(self, prompt, functions): metadata["latency"] = latency directCode = jsonResponse["choices"][0]["message"]["content"] return directCode, metadata - - def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, False) - if type(functions) is not list: - functions = [functions] - try: - result, metadata = self._get_gorilla_response(prompt, functions) - except KeyboardInterrupt: - raise KeyboardInterrupt - except: - result = "Error" - metadata = {"input_tokens": 0, "output_tokens": 0, "latency": 0} - return result, metadata - - def decode_ast(self, result, language="Python"): - func = "[" + result + "]" - decoded_output = ast_parse(func, language) - return decoded_output - - def decode_execute(self, result): - func = "[" + result + "]" - decoded_output = ast_parse(func) - execution_list = [] - for function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/mistral_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/mistral.py similarity index 63% rename from berkeley-function-call-leaderboard/model_handler/mistral_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/mistral.py index cb43a93fd..b2f5a94f5 100644 --- a/berkeley-function-call-leaderboard/model_handler/mistral_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/mistral.py @@ -1,44 +1,44 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, - GORILLA_TO_OPENAPI, -) -from model_handler.utils import ( - convert_to_tool, - ast_parse, - convert_to_function_call, - augment_prompt_by_languge, - language_specific_pre_processing, -) +import time +import os +import json + from mistralai.client import MistralClient from mistralai.models.chat_completion import ChatMessage -import os, time, json + +from bfcl.model_handler import utils, constants +from bfcl.model_handler.base import BaseHandler, ModelStyle class 
MistralHandler(BaseHandler): + model_style = ModelStyle.MISTRAL + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Mistral - self.client = MistralClient(api_key=os.getenv("MISTRAL_API_KEY")) + @classmethod + def supported_models(cls): + return [ + 'mistral-tiny-2312', + 'mistral-small-2402', + 'mistral-small-2402-FC-Any', + 'mistral-small-2402-FC-Auto', + 'mistral-medium-2312', + 'mistral-large-2402', + 'mistral-large-2402-FC-Any', + 'mistral-large-2402-FC-Auto', + ] + def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) + prompt = utils.augment_prompt_by_languge(prompt, test_category) if "FC" in self.model_name: - functions = language_specific_pre_processing(functions, test_category, True) - tool = convert_to_tool( - functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True + functions = utils.language_specific_pre_processing(functions, test_category, True) + tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_OPENAPI, self.model_style, test_category, True ) - message = [ - ChatMessage(role="user", content=prompt), - ] + message = [ChatMessage(role="user", content=prompt)] start = time.time() - if "Any" in self.model_name: - tool_choice = "any" - else: - tool_choice = "auto" + tool_choice = "any" if "Any" in self.model_name else "auto" chat_response = self.client.chat( model=self.model_name.replace("-FC-Any", "").replace("-FC-Auto", ""), messages=message, @@ -56,16 +56,13 @@ def inference(self, prompt, functions, test_category): except: result = chat_response.choices[0].message.content else: - functions = language_specific_pre_processing( - functions, test_category, False - ) + functions = utils.language_specific_pre_processing(functions, test_category, False) message = [ - ChatMessage(role="system", content=SYSTEM_PROMPT_FOR_CHAT_MODEL), + ChatMessage(role="system", content=constants.SYSTEM_PROMPT_FOR_CHAT_MODEL), ChatMessage( role="user", - content=USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + content=constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), ), ] start = time.time() @@ -84,7 +81,7 @@ def inference(self, prompt, functions, test_category): } return result, metadata - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): if "FC" in self.model_name: decoded_output = [] for invoked_function in result: @@ -102,21 +99,19 @@ def decode_ast(self, result, language="Python"): func = "[" + func if not func.endswith("]"): func = func + "]" - decoded_output = ast_parse(func, language) + decoded_output = utils.ast_parse(func, language) return decoded_output def decode_execute(self, result): if "FC" in self.model_name: - function_call = convert_to_function_call(result) + function_call = utils.convert_to_function_call(result) return function_call else: func = result func = func.replace("\\_", "_") - decode_output = ast_parse(func) + decode_output = utils.ast_parse(func) execution_list = [] for function_call in decode_output: for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) + execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})") return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/nexus_handler.py 
b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nexus.py similarity index 67% rename from berkeley-function-call-leaderboard/model_handler/nexus_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nexus.py index 5dfa8ecdb..6a6591a29 100644 --- a/berkeley-function-call-leaderboard/model_handler/nexus_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nexus.py @@ -1,32 +1,89 @@ -from model_handler.model_style import ModelStyle -from model_handler.handler import BaseHandler -from model_handler.utils import ( - ast_parse, - augment_prompt_by_languge, - language_specific_pre_processing, -) -import requests, time +import time +import requests + +from bfcl.model_handler import utils +from bfcl.model_handler.base import BaseHandler, ModelStyle + + +FUNCTION_TEMPLATE = '''Function: +def {func_name}({func_args}) -> None: + """ + {description} + + Parameters: + {param_descriptions} + """ + +''' + +OUT_OF_DOMAIN_FUNCTION = '''Function: +def out_of_domain(user_query: str) -> str: + """ + This function is designed to handle out-of-domain queries from the user. + If the user provides any input user query that is out of the domain of the other APIs provided above, + this function should be used with the input user query as the string. + + - user_query (str): The input string that is out of domain. + + Returns nothing. + """ + +''' class NexusHandler(BaseHandler): + model_style = ModelStyle.NEXUS + def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None: - temperature = 0.001 - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.NEXUS + super().__init__(model_name=model_name, temperature=0.001, top_p=top_p, max_tokens=max_tokens) - def generate_functions_from_dict(self, func_dicts): - func_template = """ - Function: - def {func_name}({func_args}) -> None: - \"\"\" - {description} + @classmethod + def supported_models(cls): + return [ + 'Nexusflow-Raven-v2', + ] - Parameters: - {param_descriptions} - \"\"\" + def inference(self, prompt, functions, test_category): + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, False) + raven_prompt = self._format_raven_function(prompt, functions) + result, metadata = self._query_raven(raven_prompt) + return result, metadata - """ + def decode_ast(self, result, language="python"): + if result.endswith(";"): + result = result[:-1] + result = result.replace(";", ",") + func = "[" + result + "]" + decoded_output = utils.ast_parse(func, language) + if "out_of_domain" in result: + return "irrelevant" + return decoded_output + + def decode_execute(self, result): + if result.endswith(";"): + result = result[:-1] + result = result.replace(";", ",") + func = "[" + result + "]" + decoded_output = utils.ast_parse(func) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list + + def _format_raven_function(self, user_prompt, functions): + """Nexus-Raven requires a specific format for the function description. 
This + function formats the function description in the required format.""" + + raven_prompt = "\n".join(self._generate_functions_from_dict(functions)) + "\n\n" + raven_prompt += "Setting: Allowed to issue multiple calls with semicolon\n" + raven_prompt += "User Query:" + user_prompt.replace("\n", "") + "" + return raven_prompt + def _generate_functions_from_dict(self, func_dicts): functions = [] for func_dict in func_dicts: func_name = func_dict['name'] @@ -43,7 +100,6 @@ def {func_name}({func_args}) -> None: param_type = f"""String[{', '.join(f"'{e}'" for e in details['enum'])}]""" param_type = param_type.replace("string", "str").replace("number", "float").replace("integer", "int").replace("object", "dict").replace("array", "list").replace("boolean", "bool") - type_hint = param_type if param in required_params: @@ -63,7 +119,7 @@ def {func_name}({func_args}) -> None: func_args = ', '.join(func_args_list) param_descriptions_str = '\n '.join(param_descriptions) - function_str = func_template.format( + function_str = FUNCTION_TEMPLATE.format( func_name=func_name, func_args=func_args, description=description, @@ -72,50 +128,16 @@ def {func_name}({func_args}) -> None: functions.append(function_str) - functions.append( - ''' - Function: - def out_of_domain(user_query: str) -> str: - """ - This function is designed to handle out-of-domain queries from the user. - If the user provides any input user query that is out of the domain of the other APIs provided above, - this function should be used with the input user query as the string. - - - user_query (str): The input string that is out of domain. - - Returns nothing. - """ - - ''') - + functions.append(OUT_OF_DOMAIN_FUNCTION) return functions - - def _format_raven_function(self, user_prompt, functions): - """ - Nexus-Raven requires a specific format for the function description. - This function formats the function description in the required format. - """ - raven_prompt = "\n".join(self.generate_functions_from_dict(functions)) + "\n\n" - raven_prompt += "Setting: Allowed to issue multiple calls with semicolon\n" - raven_prompt += "User Query:" + user_prompt.replace("\n", "") + "" - return raven_prompt - - - def _query_raven(self, prompt): - """ - Query Nexus-Raven. - """ - - API_URL = "http://nexusraven.nexusflow.ai" + api_url = "http://nexusraven.nexusflow.ai" headers = {"Content-Type": "application/json"} def query(payload): - """ - Sends a payload to a TGI endpoint. 
- """ - response = requests.post(API_URL, headers=headers, json=payload) + """Sends a payload to a TGI endpoint.""" + response = requests.post(api_url, headers=headers, json=payload) return response.json() start = time.time() @@ -135,34 +157,4 @@ def query(payload): call = output[0]["generated_text"].replace("Call:", "").strip() return call, {"input_tokens": 0, "output_tokens": 0, "latency": latency} - def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, False) - raven_prompt = self._format_raven_function(prompt, functions) - result, metadata = self._query_raven(raven_prompt) - return result, metadata - - def decode_ast(self, result, language="Python"): - if result.endswith(";"): - result = result[:-1] - result = result.replace(";", ",") - func = "[" + result + "]" - decoded_output = ast_parse(func, language) - if "out_of_domain" in result: - return "irrelevant" - - return decoded_output - - def decode_execute(self, result): - if result.endswith(";"): - result = result[:-1] - result = result.replace(";", ",") - func = "[" + result + "]" - decoded_output = ast_parse(func) - execution_list = [] - for function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list + diff --git a/berkeley-function-call-leaderboard/model_handler/nvidia_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nvidia.py similarity index 56% rename from berkeley-function-call-leaderboard/model_handler/nvidia_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nvidia.py index dc49b794b..02b29418d 100644 --- a/berkeley-function-call-leaderboard/model_handler/nvidia_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nvidia.py @@ -1,42 +1,41 @@ -import time,os,json +import time +import os + from openai import OpenAI -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ast_parse -from model_handler.utils import ( - augment_prompt_by_languge, - language_specific_pre_processing, -) -from model_handler.constant import ( - USER_PROMPT_FOR_CHAT_MODEL, - SYSTEM_PROMPT_FOR_CHAT_MODEL, -) + +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle + class NvidiaHandler(BaseHandler): + model_style = ModelStyle.OPENAI + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - self.model_name = model_name - self.temperature = temperature - self.top_p = top_p - self.max_tokens = max_tokens - self.model_style = ModelStyle.OpenAI + super().__init__(model_name, temperature, top_p, max_tokens) self.client = OpenAI( - base_url = "https://integrate.api.nvidia.com/v1", - api_key = os.getenv("NVIDIA_API_KEY") + base_url="https://integrate.api.nvidia.com/v1", + api_key=os.getenv("NVIDIA_API_KEY") ) + + @classmethod + def supported_models(cls): + return [ + 'nvidia/nemotron-4-340b-instruct', + ] + def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt,test_category) - functions = language_specific_pre_processing(functions,test_category,False) + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = 
utils.language_specific_pre_processing(functions, test_category, False) message = [ { "role": "system", - "content": SYSTEM_PROMPT_FOR_CHAT_MODEL, + "content": constants.SYSTEM_PROMPT_FOR_CHAT_MODEL, }, { "role": "user", - "content": "Questions:" - + USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + "content": "Questions:" + constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), }, ] start_time = time.time() @@ -53,18 +52,8 @@ def inference(self, prompt, functions, test_category): output_token = response.usage.completion_tokens metadata = {"input_tokens": input_token, "output_tokens": output_token, "latency": latency} return result, metadata - - def write(self, result, file_to_open): - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists("./result/" + self.model_name.replace("/", "_")): - os.mkdir("./result/" + self.model_name.replace("/", "_")) - with open( - "./result/" + self.model_name.replace("/", "_") + "/" + file_to_open.replace(".json", "_result.json"), "a+" - ) as f: - f.write(json.dumps(result) + "\n") - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): result = result.replace("\n", "") if not result.startswith("["): result = "[ " + result @@ -76,10 +65,10 @@ def decode_ast(self, result, language="Python"): result = result.replace("','", ", ") if result.endswith("']"): result = result.replace("']", "]") - decode_output = ast_parse(result, language) + decode_output = utils.ast_parse(result, language) return decode_output - def decode_execute(self, result, language="Python"): + def decode_execute(self, result, language="python"): result = result.replace("\n", "") if not result.startswith("["): result = "[ " + result @@ -91,7 +80,7 @@ def decode_execute(self, result, language="Python"): result = result.replace("','", ", ") if result.endswith("']"): result = result.replace("']", "]") - decode_output = ast_parse(result, language) + decode_output = utils.ast_parse(result, language) execution_list = [] for function_call in decode_output: for key, value in function_call.items(): diff --git a/berkeley-function-call-leaderboard/model_handler/gpt_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py similarity index 65% rename from berkeley-function-call-leaderboard/model_handler/gpt_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py index f8e4de938..354f4fe98 100644 --- a/berkeley-function-call-leaderboard/model_handler/gpt_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py @@ -1,42 +1,50 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - convert_to_tool, - convert_to_function_call, - augment_prompt_by_languge, - language_specific_pre_processing, - ast_parse, -) -from model_handler.constant import ( - GORILLA_TO_OPENAPI, - GORILLA_TO_PYTHON, - USER_PROMPT_FOR_CHAT_MODEL, - SYSTEM_PROMPT_FOR_CHAT_MODEL, -) +import time +import os +import json + from openai import OpenAI -import os, time, json + +from bfcl.model_handler import utils, constants +from bfcl.model_handler.base import BaseHandler, ModelStyle class OpenAIHandler(BaseHandler): + model_style = ModelStyle.OPENAI + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: super().__init__(model_name, temperature, top_p, max_tokens) - 
self.model_style = ModelStyle.OpenAI self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) - def inference(self, prompt,functions,test_category): + @classmethod + def supported_models(cls): + return [ + 'gpt-4o-2024-05-13', + 'gpt-4o-2024-05-13-FC', + 'gpt-4-turbo-2024-04-09', + 'gpt-4-turbo-2024-04-09-FC', + 'gpt-4-1106-preview', + 'gpt-4-1106-preview-FC', + 'gpt-4-0125-preview', + 'gpt-4-0125-preview-FC', + 'gpt-4-0613', + 'gpt-4-0613-FC', + 'gpt-3.5-turbo-0125', + 'gpt-3.5-turbo-0125-FC', + ] + + def inference(self, prompt, functions, test_category): if "FC" not in self.model_name: - prompt = augment_prompt_by_languge(prompt,test_category) - functions = language_specific_pre_processing(functions,test_category,False) + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, False) message = [ { "role": "system", - "content": SYSTEM_PROMPT_FOR_CHAT_MODEL, + "content": constants.SYSTEM_PROMPT_FOR_CHAT_MODEL, }, { "role": "user", - "content": USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + "content": constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), }, ] start_time = time.time() @@ -50,13 +58,13 @@ def inference(self, prompt,functions,test_category): latency = time.time() - start_time result = response.choices[0].message.content else: - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, True) + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, True) if type(functions) is not list: functions = [functions] message = [{"role": "user", "content": prompt}] - oai_tool = convert_to_tool( - functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True + oai_tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_OPENAPI, self.model_style, test_category, True ) start_time = time.time() if len(oai_tool) > 0: @@ -90,26 +98,24 @@ def inference(self, prompt,functions,test_category): metadata["latency"] = latency return result,metadata - def decode_ast(self,result,language="Python"): + def decode_ast(self, result, language="python"): if "FC" not in self.model_name: - decoded_output = ast_parse(result,language) + decoded_output = utils.ast_parse(result,language) else: decoded_output = [] for invoked_function in result: name = list(invoked_function.keys())[0] params = json.loads(invoked_function[name]) - if language == "Python": - pass - else: + if language.lower() != "python": # all values of the json are casted to string for java and javascript for key in params: params[key] = str(params[key]) decoded_output.append({name: params}) return decoded_output - def decode_execute(self,result): + def decode_execute(self, result): if "FC" not in self.model_name: - decoded_output = ast_parse(result) + decoded_output = utils.ast_parse(result) execution_list = [] for function_call in decoded_output: for key, value in function_call.items(): @@ -118,5 +124,5 @@ def decode_execute(self,result): ) return execution_list else: - function_call = convert_to_function_call(result) + function_call = utils.convert_to_function_call(result) return function_call diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/snowflake.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/snowflake.py new file mode 100644 index 
000000000..5c93b5fd5 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/snowflake.py @@ -0,0 +1,10 @@ +from bfcl.model_handler.proprietary_model.nvidia import NvidiaHandler + + +class SnowflakeHandler(NvidiaHandler): + + @classmethod + def supported_models(cls): + return [ + 'snowflake/arctic', + ] diff --git a/berkeley-function-call-leaderboard/model_handler/utils.py b/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py similarity index 90% rename from berkeley-function-call-leaderboard/model_handler/utils.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/utils.py index 4844f9fcc..a2a37c07e 100644 --- a/berkeley-function-call-leaderboard/model_handler/utils.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py @@ -1,9 +1,11 @@ -import re, ast, builtins, ast, json -from model_handler.model_style import ModelStyle -from model_handler.constant import JAVA_TYPE_CONVERSION, JS_TYPE_CONVERSION -from model_handler.java_parser import parse_java_function_call -from model_handler.js_parser import parse_javascript_function_call -from model_handler.constant import GORILLA_TO_OPENAPI, USE_COHERE_OPTIMIZATION +import re +import ast +import builtins +import json + +from bfcl.model_handler import parser +from bfcl.model_handler.base import ModelStyle +from bfcl.model_handler import constants def _cast_to_openai_type(properties, mapping, test_category): @@ -12,7 +14,7 @@ def _cast_to_openai_type(properties, mapping, test_category): properties[key]["type"] = "string" else: var_type = value["type"] - if mapping == GORILLA_TO_OPENAPI and var_type == "float": + if mapping == constants.GORILLA_TO_OPENAPI and var_type == "float": properties[key]["format"] = "float" properties[key]["description"] += " This is a float type value." if var_type in mapping: @@ -58,13 +60,13 @@ def convert_to_tool( ): oai_tool = [] for item in functions: - if "." in item["name"] and ( - model_style == ModelStyle.OpenAI - or model_style == ModelStyle.Mistral - or model_style == ModelStyle.Google - or model_style == ModelStyle.OSSMODEL - or model_style == ModelStyle.Anthropic_FC - or model_style == ModelStyle.COHERE + if "." in item["name"] and model_style in ( + ModelStyle.OPENAI, + ModelStyle.MISTRAL, + ModelStyle.GOOGLE, + ModelStyle.ANTHROPIC_FC, + ModelStyle.COHERE, + ModelStyle.OSS_MODEL, ): # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name. item["name"] = re.sub(r"\.", "_", item["name"]) @@ -73,33 +75,29 @@ def convert_to_tool( item["parameters"]["properties"], mapping, test_category ) # When Java and Javascript, for OpenAPI compatible models, let it become string. 
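# Hedged sketch, not part of the diff: as the comment above notes, for the
# "java" and "javascript" test categories convert_to_tool downgrades
# language-specific parameter types to plain "string" before handing the
# schema to OpenAPI-compatible models. The toy mapping below stands in for
# constants.JAVA_TYPE_CONVERSION (the real table lives in
# bfcl/model_handler/constants.py); the function name is illustrative.
TOY_JAVA_TYPE_CONVERSION = {"HashMap": dict, "ArrayList": list, "long": int}


def stringify_java_params(properties: dict) -> dict:
    """Replace Java-specific parameter types with 'string', in place."""
    for key, value in properties.items():
        if value["type"] in TOY_JAVA_TYPE_CONVERSION:
            properties[key]["type"] = "string"
    return properties


example = {"cache": {"type": "HashMap", "description": "A key/value store."}}
print(stringify_java_params(example))
# -> {'cache': {'type': 'string', 'description': 'A key/value store.'}}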
- if ( - model_style - in [ - ModelStyle.OpenAI, - ModelStyle.Mistral, - ModelStyle.Google, - ModelStyle.Anthropic_Prompt, - ModelStyle.Anthropic_FC, - ModelStyle.FIREWORK_AI, - ModelStyle.OSSMODEL, - ModelStyle.COHERE, - ] - and stringify_parameters + if stringify_parameters and model_style in ( + ModelStyle.OPENAI, + ModelStyle.MISTRAL, + ModelStyle.GOOGLE, + ModelStyle.ANTHROPIC_FC, + ModelStyle.ANTHROPIC_PROMPT, + ModelStyle.FIREWORK_AI, + ModelStyle.COHERE, + ModelStyle.OSS_MODEL, ): properties = item["parameters"]["properties"] if test_category == "java": for key, value in properties.items(): - if value["type"] in JAVA_TYPE_CONVERSION: + if value["type"] in constants.JAVA_TYPE_CONVERSION: properties[key]["type"] = "string" elif test_category == "javascript": for key, value in properties.items(): - if value["type"] in JS_TYPE_CONVERSION: + if value["type"] in constants.JS_TYPE_CONVERSION: properties[key]["type"] = "string" - if model_style == ModelStyle.Anthropic_FC: + if model_style == ModelStyle.ANTHROPIC_FC: item["input_schema"] = item["parameters"] del item["parameters"] - if model_style == ModelStyle.Google: + if model_style == ModelStyle.GOOGLE: # Remove fields that are not supported by Gemini today. for params in item["parameters"]["properties"].values(): if "default" in params: @@ -113,7 +111,7 @@ def convert_to_tool( params["description"] += "The additional properties:" +str(params["additionalProperties"]) del params["additionalProperties"] if model_style == ModelStyle.COHERE: - if USE_COHERE_OPTIMIZATION: + if constants.USE_COHERE_OPTIMIZATION: if "required" not in item["parameters"]: item["parameters"]["required"] = [] for param_name, params in item["parameters"]["properties"].items(): @@ -181,11 +179,11 @@ def convert_to_tool( if "properties" in params: params["description"] += " Dictionary properties: " + str(params["properties"]) del params["properties"] - if model_style in [ - ModelStyle.Anthropic_Prompt, - ModelStyle.Google, - ModelStyle.OSSMODEL, - ]: + if model_style in ( + ModelStyle.ANTHROPIC_PROMPT, + ModelStyle.GOOGLE, + ModelStyle.OSS_MODEL, + ): oai_tool.append(item) elif model_style == ModelStyle.COHERE: parameter = item["parameters"]["properties"] @@ -204,11 +202,11 @@ def convert_to_tool( "parameter_definitions": parameter_definitions, } ) - elif model_style in [ - ModelStyle.OpenAI, - ModelStyle.Mistral, + elif model_style in ( + ModelStyle.OPENAI, + ModelStyle.MISTRAL, ModelStyle.FIREWORK_AI, - ]: + ): oai_tool.append({"type": "function", "function": item}) return oai_tool @@ -250,20 +248,20 @@ def convert_value(value, type_str): return value -def ast_parse(input_str, language="Python"): - if language == "Python": +def ast_parse(input_str, language="python"): + if language.lower() == "python": parsed = ast.parse(input_str, mode="eval") extracted = [] for elem in parsed.body.elts: assert isinstance(elem, ast.Call) extracted.append(resolve_ast_by_type(elem)) return extracted - elif language == "Java": - return parse_java_function_call( + elif language.lower() == "java": + return parser.parse_java_function_call( input_str[1:-1] ) # Remove the [ and ] from the string - elif language == "JavaScript": - return parse_javascript_function_call(input_str[1:-1]) + elif language.lower() == "javascript": + return parser.parse_javascript_function_call(input_str[1:-1]) else: raise NotImplementedError(f"Unsupported language: {language}") @@ -311,7 +309,7 @@ def resolve_ast_by_type(value): elif isinstance(value, ast.Name): output = value.id elif isinstance(value, 
ast.Call): - if len(value.keywords)==0: + if len(value.keywords) == 0: output = ast.unparse(value) else: output = resolve_ast_call(value) @@ -341,12 +339,12 @@ def augment_prompt_by_languge(prompt, test_category): return prompt -def language_specific_pre_processing(function, test_category, string_param): - if type(function) is dict: - function = [function] - if len(function) == 0: - return function - for item in function: +def language_specific_pre_processing(functions, test_category, string_param): + if isinstance(functions, (dict, str)): + functions = [functions] + if len(functions) == 0: + return functions + for item in functions: properties = item["parameters"]["properties"] if test_category == "java": for key, value in properties.items(): @@ -371,7 +369,7 @@ def language_specific_pre_processing(function, test_category, string_param): + value["type"] + " in string representation." ) - return function + return functions def construct_tool_use_system_prompt(tools): @@ -446,7 +444,7 @@ def construct_format_parameters_prompt(parameters): return constructed_prompt -def _function_calls_valid_format_and_invoke_extraction(last_completion): +def function_calls_valid_format_and_invoke_extraction(last_completion): """Check if the function call follows a valid format and extract the attempted function calls if so. Does not check if the tools actually exist or if they are called with the requisite params.""" # Check if there are any of the relevant XML tags present that would indicate an attempted function call. @@ -562,7 +560,7 @@ def _function_calls_valid_format_and_invoke_extraction(last_completion): } -def _convert_value(value, type_str): +def convert_value(value, type_str): """Convert a string value into its appropriate Python data type based on the provided type string. 
Arg: diff --git a/berkeley-function-call-leaderboard/bfcl/types.py b/berkeley-function-call-leaderboard/bfcl/types.py new file mode 100644 index 000000000..691fd87b0 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/types.py @@ -0,0 +1,132 @@ +import json +import hashlib +from enum import Enum +from pathlib import Path +from typing import Any, List, Dict, Type + +from pydantic import BaseModel, model_validator +from huggingface_hub import hf_hub_download + +from bfcl.utils import CustomEnum + + +class ModelType(str, Enum): + OSS = 'oss' + PROPRIETARY = 'proprietary' + +class LeaderboardNonPythonCategory(str, CustomEnum): + JAVA = 'java' + JAVASCRIPT = 'javascript' + +class LeaderboardAstCategory(str, CustomEnum): + SIMPLE = 'simple' + RELEVANCE = 'relevance' + MULTIPLE_FUNCTION = 'multiple_function' + PARALLEL_FUNCTION = 'parallel_function' + PARALLEL_MULTIPLE_FUNCTION = 'parallel_multiple_function' + JAVA = LeaderboardNonPythonCategory.JAVA.value + JAVASCRIPT = LeaderboardNonPythonCategory.JAVASCRIPT.value + +class LeaderboardExecutableCategory(str, CustomEnum): + EXECUTABLE_SIMPLE = 'executable_simple' + EXECUTABLE_PARALLEL_FUNCTION = 'executable_parallel_function' + EXECUTABLE_MULTIPLE_FUNCTION = 'executable_multiple_function' + EXECUTABLE_PARALLEL_MULTIPLE_FUNCTION = 'executable_parallel_multiple_function' + REST = 'rest' + +LeaderboardPythonCategory: Type[CustomEnum] = ( + LeaderboardAstCategory + .add(LeaderboardExecutableCategory) + .subtract(LeaderboardNonPythonCategory) + .rename('LeaderboardPythonCategory') +) + +LeaderboardCategory: Type[CustomEnum] = ( + LeaderboardPythonCategory + .add(LeaderboardNonPythonCategory) + .rename('LeaderboardCategory') + .update(dict(SQL='sql', CHATABLE='chatable')) +) + +class LeaderboardCategoryGroup(str, Enum): + AST = 'ast' + EXECUTABLE = 'executable' + NON_PYTHON = 'non_python' + PYTHON = 'python' + ALL = 'all' + +CATEGORY_GROUP_MAPPING = { + LeaderboardCategoryGroup.AST: LeaderboardAstCategory, + LeaderboardCategoryGroup.EXECUTABLE: LeaderboardExecutableCategory, + LeaderboardCategoryGroup.NON_PYTHON: LeaderboardNonPythonCategory, + LeaderboardCategoryGroup.PYTHON: LeaderboardPythonCategory, + LeaderboardCategoryGroup.ALL: LeaderboardCategory +} + +class LeaderboardVersion(str, Enum): + V1 = 'v1' + + +class Leaderboard(BaseModel): + test_group: LeaderboardCategoryGroup | None = None + test_categories: List[LeaderboardCategory] | None = None # type: ignore + version: LeaderboardVersion = LeaderboardVersion.V1 + cache_dir: Path | str = '.cache' + + @model_validator(mode='before') + @classmethod + def check_either_field_provided(cls, values): + if values.get('test_group') is not None and values.get('test_categories') is not None: + raise ValueError("Provide either 'test_group' or 'test_categories', not both") + elif values.get('test_group') is None and values.get('test_categories') is None: + raise ValueError("Provide either 'test_group' or 'test_categories'") + return values + + def model_post_init(self, __context: Any) -> None: + if self.test_group: + self.test_categories = [cat for cat in CATEGORY_GROUP_MAPPING[self.test_group]] + self.cache_dir = Path.cwd() / self.cache_dir + + @property + def test_data_cache_dir(self) -> Path: + test_data_dir = self.cache_dir / f'gorilla_openfunctions_{self.version.value}_test_data' + test_data_dir.mkdir(exist_ok=True, parents=True) + return test_data_dir + + def load_test_data(self) -> Dict[LeaderboardCategory, List[Dict]]: # type: ignore + data = {} + for test_category, infile_path in 
self._get_test_data(): + data[test_category] = [] + # We add `id` and `test_category` to each dataset sample + # Save the dataset in the cache with the updated keys for user reference + outfile_path = self.test_data_cache_dir / self.get_file_name(test_category) + if outfile_path.exists(): + with open(outfile_path, 'r') as file: + for line in file: + data[test_category].append(json.loads(line)) + else: + with open(infile_path, 'r') as infile, open(outfile_path, 'w') as outfile: + for line in infile: + item = json.loads(line) + item['test_category'] = test_category.value + item['id'] = self._generate_hash(json.dumps(item)) + data[test_category].append(item) + outfile.write(json.dumps(item) + '\n') + return data + + def get_file_name(self, test_category: LeaderboardCategory) -> str: # type: ignore + return f'gorilla_openfunctions_{self.version.value}_test_{test_category.value}.json' + + def _get_test_data(self): + for test_category in self.test_categories: + file_path = hf_hub_download( + repo_id='gorilla-llm/Berkeley-Function-Calling-Leaderboard', + filename=self.get_file_name(test_category), + repo_type='dataset', + cache_dir=self.cache_dir + ) + yield test_category, file_path + + def _generate_hash(self, input_str) -> str: + hash_object = hashlib.sha256(input_str.encode('utf-8')) + return hash_object.hexdigest() diff --git a/berkeley-function-call-leaderboard/bfcl/utils.py b/berkeley-function-call-leaderboard/bfcl/utils.py new file mode 100644 index 000000000..f073f3a2b --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/utils.py @@ -0,0 +1,28 @@ +from enum import Enum + + +class CustomEnum(Enum): + @classmethod + def add(cls, other): + combined_members = {member.name: member.value for member in cls} + combined_members.update({member.name: member.value for member in other}) + return __class__(cls.__name__, combined_members) + + @classmethod + def subtract(cls, other): + remaining_members = { + member.name: member.value + for member in cls if member.value not in other._value2member_map_ + } + return __class__(cls.__name__, remaining_members) + + @classmethod + def rename(cls, new_name): + members = {member.name: member.value for member in cls} + return __class__(new_name, members) + + @classmethod + def update(cls, new_members): + members = {member.name: member.value for member in cls} + members.update(new_members) + return __class__(cls.__name__, members) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/eval_checker/api_status_check_ground_truth_REST.json b/berkeley-function-call-leaderboard/data/api_status_check_ground_truth_REST.jsonl similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/api_status_check_ground_truth_REST.json rename to berkeley-function-call-leaderboard/data/api_status_check_ground_truth_REST.jsonl diff --git a/berkeley-function-call-leaderboard/eval_checker/api_status_check_ground_truth_executable.json b/berkeley-function-call-leaderboard/data/api_status_check_ground_truth_executable.jsonl similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/api_status_check_ground_truth_executable.json rename to berkeley-function-call-leaderboard/data/api_status_check_ground_truth_executable.jsonl diff --git a/berkeley-function-call-leaderboard/eval_checker/rest-eval-response_v5.jsonl b/berkeley-function-call-leaderboard/data/rest-eval-response_v5.jsonl similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/rest-eval-response_v5.jsonl rename to 
berkeley-function-call-leaderboard/data/rest-eval-response_v5.jsonl diff --git a/berkeley-function-call-leaderboard/eval_checker/checker.py b/berkeley-function-call-leaderboard/eval_checker/checker.py deleted file mode 100644 index 7a64bc3bf..000000000 --- a/berkeley-function-call-leaderboard/eval_checker/checker.py +++ /dev/null @@ -1,948 +0,0 @@ -from js_type_converter import js_type_converter -from java_type_converter import java_type_converter -from model_handler.constant import ( - UNDERSCORE_TO_DOT, - JAVA_TYPE_CONVERSION, - JS_TYPE_CONVERSION, -) -from eval_checker_constant import REAL_TIME_MATCH_ALLOWED_DIFFERENCE -from custom_exception import NoAPIKeyError -import re -import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. -import time -import json - -PYTHON_TYPE_MAPPING = { - "string": str, - "integer": int, - "float": float, - "boolean": bool, - "array": list, - "tuple": list, - "dict": dict, - "any": str, -} - -# This is the list of types that we need to recursively check its values -PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"] - - -NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"] - - -EVAL_GROUND_TRUTH_PATH = ( - "./rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution -) -with open(EVAL_GROUND_TRUTH_PATH, "r") as f: - EVAL_GROUND_TRUTH = f.readlines() - - -#### Helper functions for AST #### -def find_description(func_descriptions, name): - # If func_descriptions is a list, this is the multiple or multiple_parallel case - if type(func_descriptions) == list: - for func_description in func_descriptions: - if func_description["name"] in name: - return func_description - return None - else: - # This is the parallel case, there is no need to loop through the list, as there is only one function - return func_descriptions - - -def get_possible_answer_type(possible_answer: list): - for answer in possible_answer: - if answer != "": # Optional parameter - return type(answer) - return None - - -def convert_func_name(function_name, model_name: str): - model_name_escaped = model_name.replace("_", "/") - if "." in function_name: - if model_name_escaped in UNDERSCORE_TO_DOT: - # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name. - # This happens for OpenAI, Mistral, and Google models - return re.sub(r"\.", "_", function_name) - return function_name - - -def type_checker( - param: str, - value, - possible_answer: list, - expected_type_description: str, - expected_type_converted, - nested_type_converted, -): - # NOTE: This type checker only supports nested type checking for one level deep. - # We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex. - - result = { - "valid": True, - "error": [], - "is_variable": False, - "error_type": "type_error:simple", - } - - is_variable = False - # check for the case where a variable is used instead of a actual value. - # use the type in possible_answer as the expected type - possible_answer_type = get_possible_answer_type(possible_answer) - # if possible_answer only contains optional parameters, we can't determine the type - if possible_answer_type != None: - # we are being precise here. 
- # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer - if possible_answer_type != expected_type_converted: - is_variable = True - - # value is the same type as in function description - if type(value) == expected_type_converted: - # We don't need to do recursive check for simple types - if nested_type_converted == None: - result["is_variable"] = is_variable - return result - else: - for possible_answer_item in possible_answer: - flag = True # Each parameter should match to at least one possible answer type. - # Here, we assume that each item should be the same type. We could also relax it. - if type(possible_answer_item) == list: - for value_item in value: - checker_result = type_checker( - param, - value_item, - possible_answer_item, - str(nested_type_converted), - nested_type_converted, - None, - ) - if not checker_result["valid"]: - flag = False - break - - if flag: - return {"valid": True, "error": [], "is_variable": is_variable} - - result["valid"] = False - result["error"] = [ - f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}." - ] - result["error_type"] = "type_error:nested" - - # value is not as expected, check for the case where a variable is used instead of a actual value - # use the type in possible_answer as the expected type - possible_answer_type = get_possible_answer_type(possible_answer) - # if possible_answer only contains optional parameters, we can't determine the type - if possible_answer_type != None: - # we are being precise here. - # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer - if type(value) == possible_answer_type: - result["is_variable"] = True - return result - - result["valid"] = False - result["error"].append( - f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}." - ) - result["error_type"] = "type_error:simple" - return result - - -def standardize_string(input_string: str): - # This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase - # It will also convert all the single quotes to double quotes - # This is used to compare the model output with the possible answers - # We don't want to punish model for answer like April 1, 2024 vs April 1,2024, vs April 1 2024 - regex_string = r"[ \,\.\/\-\_\*\^]" - return re.sub(regex_string, "", input_string).lower().replace("'", '"') - - -def string_checker(param: str, model_output: str, possible_answer: list): - standardize_possible_answer = [] - standardize_model_output = standardize_string(model_output) - for i in range(len(possible_answer)): - if type(possible_answer[i]) == str: - standardize_possible_answer.append(standardize_string(possible_answer[i])) - - if standardize_model_output not in standardize_possible_answer: - return { - "valid": False, - "error": [ - f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive." 
- ], - "error_type": "value_error:string", - } - - return {"valid": True, "error": []} - - -def list_checker(param: str, model_output: list, possible_answer: list): - # Convert the tuple to a list - - standardize_model_output = list(model_output) - - # If the element in the list is a string, we need to standardize it - for i in range(len(standardize_model_output)): - if type(standardize_model_output[i]) == str: - standardize_model_output[i] = standardize_string(model_output[i]) - - standardize_possible_answer = [] - # We also need to standardize the possible answers - for i in range(len(possible_answer)): - standardize_possible_answer.append([]) - for j in range(len(possible_answer[i])): - if type(possible_answer[i][j]) == str: - standardize_possible_answer[i].append( - standardize_string(possible_answer[i][j]) - ) - else: - standardize_possible_answer[i].append(possible_answer[i][j]) - - if standardize_model_output not in standardize_possible_answer: - return { - "valid": False, - "error": [ - f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}." - ], - "error_type": "value_error:list/tuple", - } - - return {"valid": True, "error": []} - - -def dict_checker(param: str, model_output: dict, possible_answers: list): - # This function works for simple dictionaries, as well as dictionaries with nested dictionaries - - result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} - for i in range(len(possible_answers)): - - if possible_answers[i] == "": - continue - - result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} - - flag = True - - possible_answer = possible_answers[i] - # possible_anwer is a single dictionary - if len(model_output.keys()) != len(possible_answer.keys()): - result["valid"] = False - result["error"].append("Wrong number of parameters for dictionary.") - result["error_type"] = "value_error:dict_items" - flag = False - continue - - for key, value in model_output.items(): - if key not in possible_answer: - result["valid"] = False - result["error"].append(f"Unexpected parameter: '{key}'.") - result["error_type"] = "value_error:dict_key" - flag = False - break - - expected_values = possible_answer[key] - if isinstance(expected_values, dict): - result = dict_checker(param, value, [expected_values]) - if not result["valid"]: - flag = False - break - else: - standardize_value = value - # If the value is a string, we need to standardize it - if type(value) == str: - standardize_value = standardize_string(value) - # We also need to standardize the possible answers - standardize_possible_answer = [] - for i in range(len(possible_answer[key])): - if type(possible_answer[key][i]) == str: - standardize_possible_answer.append( - standardize_string(possible_answer[key][i]) - ) - else: - standardize_possible_answer.append(possible_answer[key][i]) - - if standardize_value not in standardize_possible_answer: - result["valid"] = False - result["error"].append( - f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}." 
- ) - result["error_type"] = "value_error:dict_value" - flag = False - break - if flag: - return {"valid": True, "error": []} - - return result - - -def list_dict_checker(param: str, model_output: list, possible_answers: list): - # This function takes in a list of dictionaries and checks if each dictionary is valid - # The order of the dictionaries in the list must match the order of the possible answers - - result = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"} - - for answer_index in range(len(possible_answers)): - flag = True # True means so far, all dictionaries are valid - - # Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers - if len(model_output) != len(possible_answers[answer_index]): - result["valid"] = False - result["error"] = ["Wrong number of dictionaries in the list."] - result["error_type"] = "value_error:list_dict_count" - flag = False - continue - - for dict_index in range(len(model_output)): - result = dict_checker( - param, - model_output[dict_index], - [possible_answers[answer_index][dict_index]], - ) - if not result["valid"]: - flag = False - break - if flag: - return {"valid": True, "error": []} - - return result - - -def simple_function_checker( - func_description: dict, - model_output: dict, - possible_answer: dict, - language: str, - model_name: str, -): - possible_answer = list(possible_answer.values())[0] - # Extract function name and parameters details - func_name = func_description["name"] - param_details = func_description["parameters"]["properties"] - required_params = func_description["parameters"]["required"] - - # Initialize a result dictionary - result = { - "valid": True, - "error": [], - "error_type": "simple_function_checker:unclear", - } - - func_name = convert_func_name(func_name, model_name) - - # Check if function name matches - if func_name not in model_output: - result["valid"] = False - result["error"].append( - f"Function name {repr(func_name)} not found in model output." 
- ) - result["error_type"] = "simple_function_checker:wrong_func_name" - return result - - model_params = model_output[func_name] - - # Check for required parameters in model output - for param in required_params: - if param not in model_params: - result["valid"] = False - result["error"].append(f"Missing required parameter: {repr(param)}.") - result["error_type"] = "simple_function_checker:missing_required" - return result - - # Validate types and values for each parameter in model output - for param, value in model_params.items(): - if param not in param_details or param not in possible_answer: - result["valid"] = False - result["error"].append(f"Unexpected parameter: {repr(param)}.") - result["error_type"] = "simple_function_checker:unexpected_param" - return result - - full_param_details = param_details[param] - expected_type_description = full_param_details["type"] # This is a string - is_variable = False - nested_type_converted = None - - if language == "Java": - expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description] - - if expected_type_description in JAVA_TYPE_CONVERSION: - if expected_type_description in NESTED_CONVERSION_TYPE_LIST: - nested_type = param_details[param]["items"]["type"] - nested_type_converted = JAVA_TYPE_CONVERSION[nested_type] - value = java_type_converter( - value, expected_type_description, nested_type - ) - else: - value = java_type_converter(value, expected_type_description) - - elif language == "JavaScript": - expected_type_converted = JS_TYPE_CONVERSION[expected_type_description] - - if expected_type_description in JS_TYPE_CONVERSION: - if expected_type_description in NESTED_CONVERSION_TYPE_LIST: - nested_type = param_details[param]["items"]["type"] - nested_type_converted = JS_TYPE_CONVERSION[nested_type] - value = js_type_converter( - value, expected_type_description, nested_type - ) - else: - value = js_type_converter(value, expected_type_description) - - elif language == "Python": - expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description] - if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST: - nested_type = param_details[param]["items"]["type"] - nested_type_converted = PYTHON_TYPE_MAPPING[nested_type] - - # We convert all tuple value to list when the expected type is tuple. - # The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load(). - # This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future. - if expected_type_description == "tuple" and type(value) == tuple: - value = list(value) - - # Allow python auto conversion from int to float - if ( - language == "Python" - and expected_type_description == "float" - and type(value) == int - ): - value = float(value) - - # Type checking - # In fact, we only check for Python here. - # Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct. - type_check_result = type_checker( - param, - value, - possible_answer[param], - expected_type_description, - expected_type_converted, - nested_type_converted, - ) - is_variable = type_check_result["is_variable"] - if not type_check_result["valid"]: - return type_check_result - - # It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable. - # We can just treat the variable as a string and use the normal flow. 
- if not is_variable: - # Special handle for dictionaries - if expected_type_converted == dict: - result = dict_checker(param, value, possible_answer[param]) - if not result["valid"]: - return result - continue - - # Special handle for list of dictionaries - elif expected_type_converted == list and nested_type_converted == dict: - result = list_dict_checker(param, value, possible_answer[param]) - if not result["valid"]: - return result - continue - - # Special handle for strings - elif expected_type_converted == str: - # We don't check for case sensitivity for string, as long as it's not a variable - result = string_checker(param, value, possible_answer[param]) - if not result["valid"]: - return result - continue - - elif expected_type_converted == list: - result = list_checker(param, value, possible_answer[param]) - if not result["valid"]: - return result - continue - - # Check if the value is within the possible answers - if value not in possible_answer[param]: - result["valid"] = False - result["error"].append( - f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}." - ) - result["error_type"] = "value_error:others" - return result - - # Check for optional parameters not provided but allowed - for param in possible_answer: - if param not in model_params and "" not in possible_answer[param]: - result["valid"] = False - result["error"].append( - f"Optional parameter {repr(param)} not provided and not marked as optional." - ) - result["error_type"] = "simple_function_checker:missing_optional" - return result - - return result - - -def parallel_function_checker_enforce_order( - func_descriptions: list, - model_output: list, - possible_answers: dict, - language: str, - model_name: str, -): - if len(model_output) != len(possible_answers): - return { - "valid": False, - "error": ["Wrong number of functions."], - "error_type": "parallel_function_checker_enforce_order:wrong_count", - } - - func_name_list = list(possible_answers.keys()) - possible_answers_list = [] - - for key, value in possible_answers.items(): - possible_answers_list.append({key: value}) - - for i in range(len(possible_answers_list)): - func_description = find_description(func_descriptions, func_name_list[i]) - if func_description is None: - return { - "valid": False, - "error": [ - f"Function doc description not found for function name: {repr(func_name_list[i])}." 
- ], - "error_type": "parallel_function_checker_enforce_order:cannot_find_description", - } - result = simple_function_checker( - func_description, - model_output[i], - possible_answers_list[i], - language, - model_name, - ) - if not result["valid"]: - return result - - return {"valid": True, "error": []} - - -def parallel_function_checker_no_order( - func_descriptions: list, - model_output: list, - possible_answers: dict, - language: str, - model_name: str, -): - if len(model_output) != len(possible_answers): - return { - "valid": False, - "error": ["Wrong number of functions."], - "error_type": "parallel_function_checker_no_order:wrong_count", - } - - func_name_list = list(possible_answers.keys()) - possible_answers_list = [] - - for key, value in possible_answers.items(): - possible_answers_list.append({key: value}) - - matched_indices = [] - - # We go throught the possible answers one by one, and eliminate the model output that matches the possible answer - # It must be this way because we need ground truth to fetch the correct function description - for i in range(len(possible_answers_list)): - func_description = find_description(func_descriptions, func_name_list[i]) - - # This should not happen. As possible_answers is the ground truth, and it should have the correct function name. - if func_description is None: - return { - "valid": False, - "error": [ - f"Function doc description not found for function name: {repr(func_name_list[i])}." - ], - "error_type": "parallel_function_checker_no_order:cannot_find_description", - } - - all_errors = [] - - for index in range(len(model_output)): - if index in matched_indices: - continue - - result = simple_function_checker( - func_description, - model_output[index], - possible_answers_list[i], - language, - model_name, - ) - - if result["valid"]: - matched_indices.append(index) - break - else: - all_errors.append( - { - f"Model Result Index {index}": { - "sub_error": result["error"], - "sub_error_type": result["error_type"], - "model_output_item": model_output[index], - "possible_answer_item": possible_answers_list[i], - } - } - ) - - if not result["valid"]: - considered_indices = [ - i for i in range(len(model_output)) if i not in matched_indices - ] - all_errors.insert( - 0, - f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", - ) - return { - "valid": False, - "error": all_errors, - "error_type": "parallel_function_checker_no_order:cannot_find_match", - } - - return {"valid": True, "error": []} - - -def patten_matcher(exec_output, expected_result, function_call, is_sanity_check): - result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} - - if type(exec_output) != type(expected_result): - return { - "valid": False, - "error": [ - f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}." - ], - "error_type": "executable_checker:wrong_result_type", - "model_executed_output": exec_output, - } - if type(exec_output) == dict: - # We loose the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one. - # This happens when the key is a timestamp or a random number. - if is_sanity_check: - if len(exec_output) != len(expected_result): - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. 
Expected length: {len(expected_result)}, but got: {len(exec_output)}." - ], - "error_type": "executable_checker:wrong_result_type:dict_length", - "model_executed_output": exec_output, - } - else: - return result - - for key, value in expected_result.items(): - if key not in exec_output: - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output." - ], - "error_type": "executable_checker:wrong_result_type:dict_key_not_found", - "model_executed_output": exec_output, - } - for key, value in exec_output.items(): - if key not in expected_result: - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output." - ], - "error_type": "executable_checker:wrong_result_type:dict_extra_key", - "model_executed_output": exec_output, - } - if type(exec_output) == list: - if len(exec_output) != len(expected_result): - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." - ], - "error_type": "executable_checker:wrong_result_type:list_length", - "model_executed_output": exec_output, - } - return result - - -#### Helper functions for Exec #### -def executable_checker_simple( - function_call: str, - expected_result, - expected_result_type: str, - is_sanity_check=False, -): - result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} - - exec_dict = {} - - try: - exec( - "from executable_python_function import *" + "\nresult=" + function_call, - exec_dict, - ) - exec_output = exec_dict["result"] - except NoAPIKeyError as e: - raise e - except Exception as e: - result["valid"] = False - result["error"].append( - f"Error in execution: {repr(function_call)}. Error: {str(e)}" - ) - result["error_type"] = "executable_checker:execution_error" - return result - - # We need to special handle the case where the execution result is a tuple and convert it to a list - # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json - if isinstance(exec_output, tuple): - exec_output = list(exec_output) - - if expected_result_type == "exact_match": - if exec_output != expected_result: - result["valid"] = False - result["error"].append( - f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}." - ) - result["error_type"] = "executable_checker:wrong_result" - result["model_executed_output"] = exec_output - return result - - elif expected_result_type == "real_time_match": - # Allow for 5% difference - if (type(expected_result) == float or type(expected_result) == int) and ( - type(exec_output) == float or type(exec_output) == int - ): - if not ( - expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE) - <= exec_output - <= expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE) - ): - result["valid"] = False - result["error"].append( - f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed." 
- ) - result["error_type"] = "executable_checker:wrong_result_real_time" - result["model_executed_output"] = exec_output - return result - else: - result["valid"] = False - result["error"].append( - f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria." - ) - result["error_type"] = "executable_checker:wrong_result_real_time" - result["model_executed_output"] = exec_output - return result - - else: - # structural match - pattern_match_result = patten_matcher( - exec_output, expected_result, function_call, is_sanity_check - ) - if not pattern_match_result["valid"]: - return pattern_match_result - - return result - - -def executable_checker_parallel_no_order( - decoded_result: list, expected_exec_result: list, expected_exec_result_type: list -): - - if len(decoded_result) != len(expected_exec_result): - return { - "valid": False, - "error": [ - f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}." - ], - "error_type": "value_error:exec_result_count", - } - - matched_indices = [] - for i in range(len(expected_exec_result)): - all_errors = [] - for index in range(len(decoded_result)): - if index in matched_indices: - continue - - result = executable_checker_simple( - decoded_result[index], - expected_exec_result[i], - expected_exec_result_type[i], - False, - ) - - if result["valid"]: - matched_indices.append(index) - break - else: - all_errors.append( - { - f"Model Result Index {index}": { - "sub_error": result["error"], - "sub_error_type": result["error_type"], - "model_executed_output": ( - result["model_executed_output"] - if "model_executed_output" in result - else None - ), - } - } - ) - - if not result["valid"]: - considered_indices = [ - i for i in range(len(decoded_result)) if i not in matched_indices - ] - all_errors.insert( - 0, - f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", - ) - return { - "valid": False, - "error": all_errors, - "error_type": "executable_checker:cannot_find_match", - } - - return {"valid": True, "error": [], "error_type": "executable_checker:unclear"} - - -#### Main function #### -def executable_checker_rest(func_call, idx): - if "https://geocode.maps.co" in func_call: - time.sleep(2) - if "requests_get" in func_call: - func_call = func_call.replace("requests_get", "requests.get") - try: - response = eval(func_call) - except Exception as e: - return { - "valid": False, - "error": [f"Execution failed. 
{str(e)}"], - "error_type": "executable_checker_rest:execution_error", - } - - try: - if response.status_code == 200: - - eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx]) - try: - if isinstance(eval_GT_json, dict): - if isinstance(response.json(), dict): - if set(eval_GT_json.keys()) == set(response.json().keys()): - return {"valid": True, "error": [], "error_type": ""} - return { - "valid": False, - "error": ["Key inconsistency"], - "error_type": "executable_checker_rest:wrong_key", - } - return { - "valid": False, - "error": [ - f"Expected dictionary, but got {type(response.json())}" - ], - "error_type": "executable_checker_rest:wrong_type", - } - - elif isinstance(eval_GT_json, list): - if isinstance(response.json(), list): - if len(eval_GT_json) != len(response.json()): - return { - "valid": False, - "error": [f"Response list length inconsistency."], - "error_type": "value_error:exec_result_rest_count", - } - - else: - for i in range(len(eval_GT_json)): - if set(eval_GT_json[i].keys()) != set( - response.json()[i].keys() - ): - return { - "valid": False, - "error": [f"Key inconsistency"], - "error_type": "executable_checker_rest:wrong_key", - } - - return {"valid": True, "error": []} - else: - return { - "valid": False, - "error": [ - f"Expected list, but got {type(response.json())}" - ], - "error_type": "executable_checker_rest:wrong_type", - } - return { - "valid": False, - "error": [ - f"Expected dict or list, but got {type(response.json())}" - ], - "error_type": "executable_checker_rest:wrong_type", - } - except Exception as e: - return { - "valid": False, - "error": [ - f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}" - ], - "error_type": "executable_checker_rest:response_format_error", - } - else: - return { - "valid": False, - "error": [ - f"Execution result status code is not 200, got {response.status_code}" - ], - "error_type": "executable_checker_rest:wrong_status_code", - } - except Exception as e: - return { - "valid": False, - "error": [f"Cannot get status code of the response. 
Error: {str(e)}"], - "error_type": "executable_checker_rest:cannot_get_status_code", - } - - -def ast_checker( - func_description, model_output, possible_answer, language, test_category, model_name -): - if "multiple" in test_category or "parallel" in test_category: - # Some formatting issues that needs to be handled - if test_category == "parallel_function": - func_description = [func_description] - - return parallel_function_checker_no_order( - func_description, model_output, possible_answer, language, model_name - ) - - else: - if len(model_output) != 1: - return { - "valid": False, - "error": ["Wrong number of functions."], - "error_type": "simple_function_checker:wrong_count", - } - model_output = model_output[0] - return simple_function_checker( - func_description, model_output, possible_answer, language, model_name - ) - - -def exec_checker(decoded_result: list, func_description: dict, test_category: str): - if "multiple" in test_category or "parallel" in test_category: - return executable_checker_parallel_no_order( - decoded_result, - func_description["execution_result"], - func_description["execution_result_type"], - ) - - else: - if len(decoded_result) != 1: - return { - "valid": False, - "error": ["Wrong number of functions."], - "error_type": "simple_exec_checker:wrong_count", - } - return executable_checker_simple( - decoded_result[0], - func_description["execution_result"][0], - func_description["execution_result_type"][0], - False, - ) diff --git a/berkeley-function-call-leaderboard/eval_checker/custom_exception.py b/berkeley-function-call-leaderboard/eval_checker/custom_exception.py deleted file mode 100644 index e30fe81c5..000000000 --- a/berkeley-function-call-leaderboard/eval_checker/custom_exception.py +++ /dev/null @@ -1,10 +0,0 @@ -class NoAPIKeyError(Exception): - def __init__(self): - self.message = "Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." 
- super().__init__(self.message) - - -class BadAPIStatusError(Exception): - def __init__(self, message): - self.message = message - super().__init__(self.message) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/eval_checker/eval_checker_constant.py b/berkeley-function-call-leaderboard/eval_checker/eval_checker_constant.py deleted file mode 100644 index fe11bcead..000000000 --- a/berkeley-function-call-leaderboard/eval_checker/eval_checker_constant.py +++ /dev/null @@ -1,18 +0,0 @@ -REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 - -FILENAME_INDEX_MAPPING = { - "executable_parallel_function": (0, 49), - "parallel_multiple_function": (50, 249), - "executable_simple": (250, 349), - "rest": (350, 419), - "sql": (420, 519), - "parallel_function": (520, 719), - "chatable": (720, 919), - "java": (920, 1019), - "javascript": (1020, 1069), - "executable_multiple_function": (1070, 1119), - "simple": (1120, 1519), - "relevance": (1520, 1759), - "executable_parallel_multiple_function": (1760, 1799), - "multiple_function": (1800, 1999), -} diff --git a/berkeley-function-call-leaderboard/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/eval_checker/eval_runner.py deleted file mode 100644 index ec0b557c1..000000000 --- a/berkeley-function-call-leaderboard/eval_checker/eval_runner.py +++ /dev/null @@ -1,518 +0,0 @@ -import sys - -sys.path.append("../") - -from checker import ast_checker, exec_checker, executable_checker_rest -from eval_runner_helper import * -from tqdm import tqdm -import argparse - - -# NOTE: This file should be run in the `eval_checker` directory - - -def single_executable_file_runner( - handler, model_result, prompt, model_name, test_category -): - assert len(model_result) == len(prompt) - - result = [] - correct_count = 0 - for i in tqdm(range(len(model_result)), desc="Running tests"): - raw_result = model_result[i]["result"] - try: - decoded_result = handler.decode_execute(raw_result) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Failed to decode executable. {str(e)}"], - "error_type": "executable_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": raw_result, - } - ) - continue - - if "rest" in test_category: - # REST is always single-functioned. Therefore we take the first one and pass it to the REST checker. - if not is_rest_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "executable_decoder:rest_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - checker_result = executable_checker_rest(decoded_result[0], i) - - else: - if not is_executable_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." 
- ], - "error_type": "executable_decoder:wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - prompt_item = prompt[i] - checker_result = exec_checker(decoded_result, prompt_item, test_category) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = raw_result - temp["model_result_decoded"] = decoded_result - if "model_executed_output" in checker_result: - temp["model_executed_output"] = checker_result["model_executed_output"] - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_relevance_file_runner(handler, model_result, model_name, test_category): - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - success = False - decoded_result = None - - try: - decoded_result = handler.decode_ast(model_result_item, language="Python") - success = False - if is_empty_output(decoded_result): - success = True - - except Exception as e: - success = True - - if success: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = success - temp["error"] = [ - f"Valid syntax. Successfully decode AST when it should not." - ] - temp["error_type"] = "relevance_error:decoder_success" - temp["model_result"] = model_result_item - temp["decoded_result"] = decoded_result - - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_ast_file_runner( - handler, model_result, prompt, possible_answer, language, test_category, model_name -): - assert ( - len(model_result) == len(prompt) == len(possible_answer) - ), "The length of the model result does not match the length of the prompt or possible answer. Please check the input files for completeness." - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - prompt_item = prompt[i]["function"] - possible_answer_item = possible_answer[i] - - try: - model_result_item_raw = model_result_item - model_result_item = handler.decode_ast(model_result_item, language) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Invalid syntax. Failed to decode AST. 
{str(e)}"], - "error_type": "ast_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": model_result_item_raw, - "possible_answer": possible_answer_item, - } - ) - continue - - decoder_output_valid = is_function_calling_format_output(model_result_item) - if not decoder_output_valid: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "ast_decoder:decoder_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(model_result_item_raw), - "model_result_decoded": str(model_result_item), - "possible_answer": possible_answer_item, - } - ) - continue - - checker_result = ast_checker( - prompt_item, - model_result_item, - possible_answer_item, - language, - test_category, - model_name, - ) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = model_result_item_raw - temp["model_result_decoded"] = model_result_item - temp["possible_answer"] = possible_answer_item - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -#### Main runner function #### -def runner(model_names, test_categories, api_sanity_check): - - # A flag to indicate if the API has been tested. - # We should always test the API with ground truth first before running the executable tests. - # Sometimes the API may not be working as expected and we want to catch that before running the evaluation to ensure the results are accurate. - API_TESTED = False - - # Before running the executable evaluation, we need to get the expected output from the ground truth. - # So we need a list of all the test categories that we have ran the ground truth evaluation on. - # We only get the expected output once for each test category. - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] - - # Get a list of all entries in the folder - entries = os.scandir(INPUT_PATH) - - # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] - - # Traverse each subdirectory - for subdir in subdirs: - - model_name = subdir.split(INPUT_PATH)[1] - if model_names is not None and model_name not in model_names: - continue - - model_name_escaped = model_name.replace("_", "/") - - files = [ - f - for f in os.listdir(subdir) - if os.path.isfile(os.path.join(subdir, f)) and not f.startswith(".") - ] - # Check if there is only one file and that file is 'result.json' - # If so, this is an OSS model result file and we need to special process it first - if len(files) == 1 and files[0] == "result.json": - result_json_file_path = os.path.join(subdir, "result.json") - oss_file_formatter(result_json_file_path, subdir) - print( - f"Detected OSS model: {model_name}. result.json has been split into individual test category files." 
- ) - - # Pattern to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - - print(f"🦍 Model: {model_name}") - - # Find and process all JSON files in the subdirectory - for model_result_json in glob.glob(json_files_pattern): - - if os.path.basename(model_result_json) == "result.json": - continue - - test_category = extract_after_test(model_result_json) - if test_categories is not None and test_category not in test_categories: - continue - - handler = get_handler(model_name_escaped) - - # We don't evaluate chatable and SQL models in our current leaderboard - if is_chatable(test_category) or is_sql(test_category): - continue - - language = "Python" - if is_java(test_category): - language = "Java" - if is_js(test_category): - language = "JavaScript" - - print(f"🔍 Running test: {test_category}") - - model_result = load_file(model_result_json) - record_cost_latency(LEADERBOARD_TABLE, model_name, model_result) - - if is_relevance(test_category): - accuracy, total_count = single_relevance_file_runner( - handler, model_result, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - continue - - # Find the corresponding test file - prompt_file = find_file_with_suffix(PROMPT_PATH, test_category) - prompt = load_file(prompt_file) - - if is_executable(test_category): - # We only test the API with ground truth once - if not API_TESTED and api_sanity_check: - print("---- Sanity checking API status ----") - api_status_sanity_check_rest() - api_status_sanity_check_executable() - print("---- Sanity check Passed 💯 ----") - API_TESTED = True - - if ( - test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - and not is_rest(test_category) - ): - print( - f"---- Getting real-time execution result from ground truth for {test_category} ----" - ) - get_executable_expected_output(prompt_file) - print( - f"---- Ground truth real-time execution result obtained for {test_category} 🌟 ----" - ) - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN.append(test_category) - # Need to re-load the prompt file after getting the expected output, as the prompt file has been updated - prompt = load_file(prompt_file) - - accuracy, total_count = single_executable_file_runner( - handler, model_result, prompt, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - continue - - # Find the corresponding possible answer file - possible_answer_file = find_file_with_suffix( - POSSIBLE_ANSWER_PATH, test_category - ) - possible_answer = load_file(possible_answer_file) - accuracy, total_count = single_ast_file_runner( - handler, - model_result, - prompt, - possible_answer, - language, - test_category, - model_name, - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - # This function reads all the score files from local folder and updates the leaderboard table. - # This is helpful when you only want to run the evaluation for a subset of models and test categories. 
- update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) - # Write the leaderboard table to a file - generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH) - - # Clean up the executable expected output files - # They should be re-generated the next time the evaluation is run - clean_up_executable_expected_output( - PROMPT_PATH, EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - ) - - -ARG_PARSE_MAPPING = { - "ast": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - ], - "executable": [ - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "all": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "non-python": [ - "java", - "javascript", - ], - "python": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], -} - - -INPUT_PATH = "../result/" -PROMPT_PATH = "../data/" -POSSIBLE_ANSWER_PATH = "../data/possible_answer/" -OUTPUT_PATH = "../score/" - -# A dictionary to store the results -# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count -LEADERBOARD_TABLE = {} - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Process two lists of strings.") - - # Add arguments for two lists of strings - parser.add_argument( - "--model", nargs="+", type=str, help="A list of model names to evaluate" - ) - parser.add_argument( - "--test-category", - nargs="+", - type=str, - help="A list of test categories to run the evaluation on", - ) - parser.add_argument( - "-s", - "--skip-api-sanity-check", - action="store_false", - default=True, # Default value is True, meaning the sanity check is performed unless the flag is specified - help="Skip the REST API status sanity check before running the evaluation. By default, the sanity check is performed.", - ) - - args = parser.parse_args() - - api_sanity_check = args.skip_api_sanity_check - test_categories = None - if args.test_category is not None: - test_categories = [] - for test_category in args.test_category: - if test_category in ARG_PARSE_MAPPING: - test_categories.extend(ARG_PARSE_MAPPING[test_category]) - else: - test_categories.append(test_category) - - model_names = args.model - if args.model is not None: - model_names = [] - for model_name in args.model: - # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. - # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). - # We patch it here to avoid confusing the user. 
- model_names.append(model_name.replace("/", "_")) - - runner(model_names, test_categories, api_sanity_check) diff --git a/berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py deleted file mode 100644 index a97e2ca55..000000000 --- a/berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py +++ /dev/null @@ -1,998 +0,0 @@ -import glob -import json -import os -import statistics -import subprocess - -import numpy as np -from custom_exception import BadAPIStatusError -from model_handler.handler_map import handler_map -from tqdm import tqdm -from eval_checker_constant import FILENAME_INDEX_MAPPING - -REST_API_GROUND_TRUTH_FILE_PATH = "api_status_check_ground_truth_REST.json" -EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = "api_status_check_ground_truth_executable.json" - -COLUMNS = [ - "Rank", - "Overall Acc", - "Model", - "Model Link", - "Organization", - "License", - "AST Summary", - "Exec Summary", - "Simple Function AST", - "Python Simple Function AST", - "Java Simple Function AST", - "JavaScript Simple Function AST", - "Multiple Functions AST", - "Parallel Functions AST", - "Parallel Multiple AST", - "Simple Function Exec", - "Python Simple Function Exec", - "REST Simple Function Exec", - "Multiple Functions Exec", - "Parallel Functions Exec", - "Parallel Multiple Exec", - "Relevance Detection", - "Cost ($ Per 1k Function Calls)", - "Latency Mean (s)", - "Latency Standard Deviation (s)", - "Latency 95th Percentile (s)", -] - -MODEL_METADATA_MAPPING = { - "gpt-4o-2024-05-13-FC": [ - "GPT-4o-2024-05-13 (FC)", - "https://openai.com/index/hello-gpt-4o/", - "OpenAI", - "Proprietary", - ], - "gpt-4o-2024-05-13": [ - "GPT-4o-2024-05-13 (Prompt)", - "https://openai.com/index/hello-gpt-4o/", - "OpenAI", - "Proprietary", - ], - "gpt-4-1106-preview-FC": [ - "GPT-4-1106-Preview (FC)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-1106-preview": [ - "GPT-4-1106-Preview (Prompt)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-0125-preview-FC": [ - "GPT-4-0125-Preview (FC)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-0125-preview": [ - "GPT-4-0125-Preview (Prompt)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-turbo-2024-04-09-FC": [ - "GPT-4-turbo-2024-04-09 (FC)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-turbo-2024-04-09": [ - "GPT-4-turbo-2024-04-09 (Prompt)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gorilla-openfunctions-v2": [ - "Gorilla-OpenFunctions-v2 (FC)", - "https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html", - "Gorilla LLM", - "Apache 2.0", - ], - "claude-3-opus-20240229-FC": [ - "Claude-3-Opus-20240229 (FC tools-2024-04-04)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-opus-20240229": [ - "Claude-3-Opus-20240229 (Prompt)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "mistral-medium-2312": [ - "Mistral-Medium-2312 (Prompt)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-small-2402": [ - "Mistral-Small-2402 (Prompt)", - 
"https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-large-2402": [ - "Mistral-Large-2402 (Prompt)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "claude-3-sonnet-20240229-FC": [ - "Claude-3-Sonnet-20240229 (FC tools-2024-04-04)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-sonnet-20240229": [ - "Claude-3-Sonnet-20240229 (Prompt)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-haiku-20240307-FC": [ - "Claude-3-Haiku-20240307 (FC tools-2024-04-04)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-haiku-20240307": [ - "Claude-3-Haiku-20240307 (Prompt)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-5-sonnet-20240620-FC": [ - "Claude-3.5-Sonnet-20240620 (FC)", - "https://www.anthropic.com/news/claude-3-5-sonnet", - "Anthropic", - "Proprietary", - ], - "claude-3-5-sonnet-20240620": [ - "Claude-3.5-Sonnet-20240620 (Prompt)", - "https://www.anthropic.com/news/claude-3-5-sonnet", - "Anthropic", - "Proprietary", - ], - "gpt-3.5-turbo-0125-FC": [ - "GPT-3.5-Turbo-0125 (FC)", - "https://platform.openai.com/docs/models/gpt-3-5-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-3.5-turbo-0125": [ - "GPT-3.5-Turbo-0125 (Prompting)", - "https://platform.openai.com/docs/models/gpt-3-5-turbo", - "OpenAI", - "Proprietary", - ], - "meetkai/functionary-small-v2.2-FC": [ - "Functionary-Small-v2.2 (FC)", - "https://huggingface.co/meetkai/functionary-small-v2.2", - "MeetKai", - "MIT", - ], - "meetkai/functionary-medium-v2.2-FC": [ - "Functionary-Medium-v2.2 (FC)", - "https://huggingface.co/meetkai/functionary-medium-v2.2", - "MeetKai", - "MIT", - ], - "meetkai/functionary-small-v2.4-FC": [ - "Functionary-Small-v2.4 (FC)", - "https://huggingface.co/meetkai/functionary-small-v2.4", - "MeetKai", - "MIT", - ], - "meetkai/functionary-medium-v2.4-FC": [ - "Functionary-Medium-v2.4 (FC)", - "https://huggingface.co/meetkai/functionary-medium-v2.4", - "MeetKai", - "MIT", - ], - "claude-2.1": [ - "Claude-2.1 (Prompt)", - "https://www.anthropic.com/news/claude-2-1", - "Anthropic", - "Proprietary", - ], - "mistral-tiny-2312": [ - "Mistral-tiny-2312 (Prompt)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "claude-instant-1.2": [ - "Claude-instant-1.2 (Prompt)", - "https://www.anthropic.com/news/releasing-claude-instant-1-2", - "Anthropic", - "Proprietary", - ], - "mistral-small-2402-FC-Auto": [ - "Mistral-small-2402 (FC Auto)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-large-2402-FC-Any": [ - "Mistral-large-2402 (FC Any)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-small-2402-FC-Any": [ - "Mistral-small-2402 (FC Any)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-large-2402-FC-Auto": [ - "Mistral-large-2402 (FC Auto)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "Nexusflow-Raven-v2": [ - "Nexusflow-Raven-v2 (FC)", - "https://huggingface.co/Nexusflow/NexusRaven-V2-13B", - "Nexusflow", - "Apache 2.0", - ], - "firefunction-v1-FC": [ - "FireFunction-v1 (FC)", - "https://huggingface.co/fireworks-ai/firefunction-v1", - "Fireworks", - "Apache 2.0", - ], - 
"firefunction-v2-FC": [ - "FireFunction-v2 (FC)", - "https://huggingface.co/fireworks-ai/firefunction-v2", - "Fireworks", - "Apache 2.0", - ], - "gemini-1.5-pro-preview-0514": [ - "Gemini-1.5-Pro-Preview-0514 (FC)", - "https://deepmind.google/technologies/gemini/pro/", - "Google", - "Proprietary", - ], - "gemini-1.5-flash-preview-0514": [ - "Gemini-1.5-Flash-Preview-0514 (FC)", - "https://deepmind.google/technologies/gemini/flash/", - "Google", - "Proprietary", - ], - "gemini-1.5-pro-preview-0409": [ - "Gemini-1.5-Pro-Preview-0409 (FC)", - "https://deepmind.google/technologies/gemini/#introduction", - "Google", - "Proprietary", - ], - "gemini-1.0-pro": [ - "Gemini-1.0-Pro-001 (FC)", - "https://deepmind.google/technologies/gemini/#introduction", - "Google", - "Proprietary", - ], - "gpt-4-0613-FC": [ - "GPT-4-0613 (FC)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-0613": [ - "GPT-4-0613 (Prompt)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "deepseek-ai/deepseek-coder-6.7b-instruct": [ - "Deepseek-v1.5 (Prompt)", - "https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5", - "Deepseek", - "Deepseek License", - ], - "google/gemma-7b-it": [ - "Gemma-7b-it (Prompt)", - "https://blog.google/technology/developers/gemma-open-models/", - "Google", - "gemma-terms-of-use", - ], - "glaiveai/glaive-function-calling-v1": [ - "Glaive-v1 (FC)", - "https://huggingface.co/glaiveai/glaive-function-calling-v1", - "Glaive", - "cc-by-sa-4.0", - ], - "databricks-dbrx-instruct": [ - "DBRX-Instruct (Prompt)", - "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm", - "Databricks", - "Databricks Open Model", - ], - "NousResearch/Hermes-2-Pro-Mistral-7B": [ - "Hermes-2-Pro-Mistral-7B (FC)", - "https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B", - "NousResearch", - "apache-2.0", - ], - "meta-llama/Meta-Llama-3-8B-Instruct": [ - "Meta-Llama-3-8B-Instruct (Prompt)", - "https://llama.meta.com/llama3", - "Meta", - "Meta Llama 3 Community", - ], - "meta-llama/Meta-Llama-3-70B-Instruct": [ - "Meta-Llama-3-70B-Instruct (Prompt)", - "https://llama.meta.com/llama3", - "Meta", - "Meta Llama 3 Community", - ], - "command-r-plus-FC": [ - "Command-R-Plus (FC) (Original)", - "https://txt.cohere.com/command-r-plus-microsoft-azure", - "Cohere For AI", - "cc-by-nc-4.0", - ], - "command-r-plus": [ - "Command-R-Plus (Prompt) (Original)", - "https://txt.cohere.com/command-r-plus-microsoft-azure", - "Cohere For AI", - "cc-by-nc-4.0", - ], - "command-r-plus-FC-optimized": [ - "Command-R-Plus (FC) (Optimized)", - "https://txt.cohere.com/command-r-plus-microsoft-azure", - "Cohere For AI", - "cc-by-nc-4.0", - ], - "command-r-plus-optimized": [ - "Command-R-Plus (Prompt) (Optimized)", - "https://txt.cohere.com/command-r-plus-microsoft-azure", - "Cohere For AI", - "cc-by-nc-4.0", - ], - "snowflake/arctic": [ - "Snowflake/snowflake-arctic-instruct (Prompt)", - "https://huggingface.co/Snowflake/snowflake-arctic-instruct", - "Snowflake", - "apache-2.0", - ], - "nvidia/nemotron-4-340b-instruct": [ - "Nemotron-4-340b-instruct (Prompt)", - "https://huggingface.co/nvidia/nemotron-4-340b-instruct", - "NVIDIA", - "nvidia-open-model-license" - ] -} - -INPUT_PRICE_PER_MILLION_TOKEN = { - "claude-3-opus-20240229-FC": 15, - "claude-3-opus-20240229": 15, - "claude-3-sonnet-20240229-FC": 3, - "claude-3-sonnet-20240229": 3, - "claude-3-haiku-20240307-FC": 0.25, - "claude-3-haiku-20240307": 0.25, 
- "claude-3-5-sonnet-20240620-FC": 3, - "claude-3-5-sonnet-20240620": 3, - "claude-2.1": 8, - "claude-instant-1.2": 0.8, - "mistral-large-2402-FC-Any": 4, - "mistral-large-2402-FC-Auto": 4, - "mistral-medium-2312": 2.7, - "mistral-small-2402-FC-Any": 1, - "mistral-small-2402-FC-Auto": 1, - "mistral-small-2402": 1, - "mistral-tiny-2312": 0.25, - "gpt-4o-2024-05-13-FC": 5, - "gpt-4o-2024-05-13": 5, - "gpt-4-1106-preview-FC": 10, - "gpt-4-1106-preview": 10, - "gpt-4-0125-preview": 10, - "gpt-4-0125-preview-FC": 10, - "gpt-4-turbo-2024-04-09-FC": 10, - "gpt-4-turbo-2024-04-09": 10, - "gpt-4-0613": 30, - "gpt-4-0613-FC": 30, - "gpt-3.5-turbo-0125": 0.5, - "gpt-3.5-turbo-0125-FC": 0.5, - "gemini-1.0-pro": 0.5, - "gemini-1.5-pro-preview-0409": 3.5, - "gemini-1.5-pro-preview-0514": 3.5, - "gemini-1.5-flash-preview-0514": 0.35, - "databricks-dbrx-instruct": 2.25, - "command-r-plus-FC": 3, - "command-r-plus": 3, - "command-r-plus-FC-optimized": 3, - "command-r-plus-optimized": 3, -} - -OUTPUT_PRICE_PER_MILLION_TOKEN = { - "claude-3-opus-20240229-FC": 75, - "claude-3-opus-20240229": 75, - "claude-3-sonnet-20240229-FC": 15, - "claude-3-sonnet-20240229": 15, - "claude-3-5-sonnet-20240620-FC": 15, - "claude-3-5-sonnet-20240620": 15, - "claude-3-haiku-20240307-FC": 1.25, - "claude-3-haiku-20240307": 1.25, - "claude-2.1": 24, - "claude-instant-1.2": 2.4, - "mistral-large-2402-FC-Any": 12, - "mistral-large-2402-FC-Auto": 12, - "mistral-small-2402": 3, - "mistral-medium-2312": 8.1, - "mistral-small-2402-FC-Any": 3, - "mistral-small-2402-FC-Auto": 3, - "mistral-tiny-2312": 0.25, - "gpt-4o-2024-05-13-FC": 15, - "gpt-4o-2024-05-13": 15, - "gpt-4-turbo-2024-04-09-FC": 30, - "gpt-4-turbo-2024-04-09": 30, - "gpt-4-1106-preview": 30, - "gpt-4-1106-preview-FC": 30, - "gpt-4-0125-preview-FC": 30, - "gpt-4-0125-preview": 30, - "gpt-4-0613": 60, - "gpt-4-0613-FC": 60, - "gpt-3.5-turbo-0125": 1.5, - "gpt-3.5-turbo-0125-FC": 1.5, - "gemini-1.0-pro": 1.5, - "gemini-1.5-pro-preview-0409": 10.50, - "gemini-1.5-pro-preview-0514": 10.50, - "gemini-1.5-flash-preview-0514": 0.53, - "databricks-dbrx-instruct": 6.75, - "command-r-plus-FC": 15, - "command-r-plus": 15, - "command-r-plus-FC-optimized": 15, - "command-r-plus-optimized": 15, -} - -# The latency of the open-source models are hardcoded here. -# Because we do batching when generating the data, so the latency is not accurate from the result data. -# This is the latency for the whole batch of data, when using 8 V100 GPUs. 
-OSS_LATENCY = { - "deepseek-ai/deepseek-coder-6.7b-instruct": 909, - "google/gemma-7b-it": 95, - "NousResearch/Hermes-2-Pro-Mistral-7B": 135, - "meta-llama/Meta-Llama-3-8B-Instruct": 73, - "meta-llama/Meta-Llama-3-70B-Instruct": 307, - "gorilla-openfunctions-v2": 83, -} - - -NO_COST_MODELS = [ - "Nexusflow-Raven-v2", - "firefunction-v1-FC", - "firefunction-v2-FC", - "meetkai/functionary-medium-v2.4-FC", - "meetkai/functionary-small-v2.2-FC", - "meetkai/functionary-small-v2.4-FC", - "snowflake/arctic", - "nvidia/nemotron-4-340b-instruct", -] - -# Price got from AZure, 22.032 per hour for 8 V100, Pay As You Go Total Price -# Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/ -V100_x8_PRICE_PER_HOUR = 22.032 - - -def extract_after_test(input_string): - parts = input_string.split("_test_")[1].split("_result")[0].split(".json")[0] - return parts - - -def find_file_with_suffix(folder_path, suffix): - json_files_pattern = os.path.join(folder_path, "*.json") - for json_file in glob.glob(json_files_pattern): - if extract_after_test(json_file) == suffix: - return json_file - - -def is_executable(test_category): - return "executable" in test_category or "rest" in test_category - - -def is_rest(test_category): - return "rest" in test_category - - -def is_relevance(test_category): - return "relevance" in test_category - - -def is_chatable(test_category): - return "chatable" in test_category - - -def is_java(test_category): - return "java" in test_category - - -def is_js(test_category): - return "javascript" in test_category - - -def is_sql(test_category): - return "sql" in test_category - - -def load_file(file_path): - result = [] - with open(file_path) as f: - file = f.readlines() - for line in file: - result.append(json.loads(line)) - return result - - -def get_handler(model_name): - return handler_map[model_name](model_name) - - -def write_list_of_dicts_to_file(filename, data, subdir=None): - if subdir: - # Ensure the subdirectory exists - os.makedirs(subdir, exist_ok=True) - - # Construct the full path to the file - filename = os.path.join(subdir, filename) - - # Write the list of dictionaries to the file in JSON format - with open(filename, "w") as f: - for i, entry in enumerate(data): - json_str = json.dumps(entry) - f.write(json_str) - if i < len(data) - 1: - f.write("\n") - - -def is_function_calling_format_output(decoded_output): - # Ensure the output is a list of dictionaries - if type(decoded_output) == list: - for item in decoded_output: - if type(item) != dict: - return False - return True - return False - - -def is_executable_format_output(decoded_output): - # Ensure the output is a list of strings (one or more strings) - if type(decoded_output) == list: - if len(decoded_output) == 0: - return False - for item in decoded_output: - if type(item) != str: - return False - return True - return False - - -def is_rest_format_output(decoded_output): - # Ensure the output is a list of one string - if type(decoded_output) == list: - if len(decoded_output) == 1 and type(decoded_output[0]) == str: - return True - return False - - -def is_empty_output(decoded_output): - # This function is a patch to the ast decoder for relevance detection - # Sometimes the ast decoder will parse successfully, but the input doens't really have a function call - # [], [{}], and anything that is not in function calling format is considered empty (and thus should be marked as correct) - if not is_function_calling_format_output(decoded_output): - return True - if len(decoded_output) == 0: - 
return True - if len(decoded_output) == 1 and len(decoded_output[0]) == 0: - return True - - -def api_status_sanity_check_rest(): - - # We only need to import the executable_checker_rest in this function. So a local import is used. - from checker import executable_checker_rest - - ground_truth_dummy = load_file(REST_API_GROUND_TRUTH_FILE_PATH) - - # Use the ground truth data to make sure the API is working correctly - command = f"cd .. ; python apply_function_credential_config.py --input-file ./eval_checker/{REST_API_GROUND_TRUTH_FILE_PATH};" - try: - subprocess.run(command, shell=True, capture_output=True, text=True, check=True) - except subprocess.CalledProcessError as e: - write_list_of_dicts_to_file(REST_API_GROUND_TRUTH_FILE_PATH, ground_truth_dummy) - raise RuntimeError(e.stderr) from e - - ground_truth_replaced = load_file(REST_API_GROUND_TRUTH_FILE_PATH) - write_list_of_dicts_to_file(REST_API_GROUND_TRUTH_FILE_PATH, ground_truth_dummy) - - correct_count = 0 - errors = [] - for idx, data in tqdm( - enumerate(ground_truth_replaced), - total=len(ground_truth_replaced), - desc="API Status Test (REST)", - ): - status = executable_checker_rest(data["ground_truth"], idx) - if status["valid"]: - correct_count += 1 - else: - errors.append((data, status)) - - if correct_count != len(ground_truth_replaced): - [print("Data:", data, "\nError:", status["error"]) for data, status in errors] - error_msg = f"API Status Test Failed for REST Section. {len(ground_truth_replaced) - correct_count} out of {len(ground_truth_replaced)} API behaviors are not as expected. Be careful with executable test category results; they may be inaccurate." - raise BadAPIStatusError(error_msg) - - -def api_status_sanity_check_executable(): - from checker import executable_checker_simple - - ground_truth = load_file(EXECTUABLE_API_GROUND_TRUTH_FILE_PATH) - correct_count = 0 - errors = [] - for data in tqdm( - ground_truth, total=len(ground_truth), desc="API Status Test (Non-REST)" - ): - status = executable_checker_simple( - data["ground_truth"][0], - data["execution_result"][0], - data["execution_result_type"][0], - True, - ) - if status["valid"]: - correct_count += 1 - else: - errors.append((data, status)) - - if correct_count != len(ground_truth): - [print("Data:", data, "\nError:", status["error"]) for data, status in errors] - error_msg = f"API Status Test Failed for Executable Section. {len(ground_truth) - correct_count} out of {len(ground_truth)} API behaviors are not as expected. Be careful with executable test category results; they may be inaccurate." - raise BadAPIStatusError(error_msg) - - -def get_executable_expected_output(prompt_file_path): - # Before we run the evaluation, we need to add the "execution_result" field to the prompt file, using the ground truth data. 
- prompt_content = load_file(prompt_file_path) - exec_dict = {} - for item in tqdm(prompt_content, desc="Getting Executable Expected Output"): - execution_result = [] - ground_truth = item["ground_truth"] - for i in range(len(ground_truth)): - exec( - "from executable_python_function import *" - + "\nresult=" - + ground_truth[i], - exec_dict, - ) - execution_result.append(exec_dict["result"]) - item["execution_result"] = execution_result - - write_list_of_dicts_to_file(prompt_file_path, prompt_content) - - -def clean_up_executable_expected_output(prompt_path, categories): - for category in categories: - prompt_file = find_file_with_suffix(prompt_path, category) - prompt_content = load_file(prompt_file) - for item in prompt_content: - del item["execution_result"] - write_list_of_dicts_to_file(prompt_file, prompt_content) - - -def calculate_weighted_accuracy(accuracy_dict_list): - total_count = 0 - total_accuracy = 0 - for accuracy_dict in accuracy_dict_list: - total_count += accuracy_dict["total_count"] - total_accuracy += accuracy_dict["accuracy"] * accuracy_dict["total_count"] - - if total_count == 0: - return {"accuracy": 0, "total_count": 0} - - return {"accuracy": total_accuracy / total_count, "total_count": total_count} - - -def calculate_unweighted_accuracy(accuracy_dict_list): - total_accuracy = 0 - for accuracy_dict in accuracy_dict_list: - total_accuracy += accuracy_dict["accuracy"] - - if len(accuracy_dict_list) == 0: - return {"accuracy": 0, "total_count": 0} - - return {"accuracy": total_accuracy / len(accuracy_dict_list), "total_count": 0} - - -def record_result(leaderboard_table, model_name, test_category, accuracy, total_count): - if model_name not in leaderboard_table: - leaderboard_table[model_name] = {} - leaderboard_table[model_name][test_category] = { - "accuracy": accuracy, - "total_count": total_count, - } - - -def record_cost_latency(leaderboard_table, model_name, model_output_data): - if model_name not in leaderboard_table: - leaderboard_table[model_name] = {} - leaderboard_table[model_name]["cost"] = {"input_data": [], "output_data": []} - leaderboard_table[model_name]["latency"] = {"data": []} - - input_token = [] - output_token = [] - latency = [] - for data in model_output_data: - if "latency" in data: - latency.append(data["latency"]) - if data["latency"] > 60: - print("*" * 100) - print( - f"❗️Warning: Latency for one of {model_name} response is {data['latency']}." 
- ) - print("*" * 100) - if "input_token_count" in data: - if data["input_token_count"] != 0: - input_token.append(data["input_token_count"]) - if "output_token_count" in data: - if data["output_token_count"] != 0: - output_token.append(data["output_token_count"]) - - leaderboard_table[model_name]["cost"]["input_data"].extend(input_token) - leaderboard_table[model_name]["cost"]["output_data"].extend(output_token) - leaderboard_table[model_name]["latency"]["data"].extend(latency) - - -def get_metric(model_name, cost_data, latency_data): - - cost, mean_latency, std_latency, percentile_95_latency = "N/A", "N/A", "N/A", "N/A" - - if ( - model_name in INPUT_PRICE_PER_MILLION_TOKEN - and len(cost_data["input_data"]) > 0 - and len(cost_data["output_data"]) > 0 - ): - - mean_input_token = statistics.mean(cost_data["input_data"]) - mean_output_token = statistics.mean(cost_data["output_data"]) - cost = ( - mean_input_token * INPUT_PRICE_PER_MILLION_TOKEN[model_name] - + mean_output_token * OUTPUT_PRICE_PER_MILLION_TOKEN[model_name] - ) / 1000 - cost = round(cost, 2) - - if model_name in OSS_LATENCY: - mean_latency, std_latency, percentile_95_latency = ( - OSS_LATENCY[model_name] / 1700, - "N/A", - "N/A", - ) - mean_latency = round(mean_latency, 2) - cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600 - cost = round(cost, 2) - - elif len(latency_data["data"]) != 0: - mean_latency = statistics.mean(latency_data["data"]) - std_latency = statistics.stdev(latency_data["data"]) - percentile_95_latency = np.percentile(latency_data["data"], 95) - mean_latency = round(mean_latency, 2) - std_latency = round(std_latency, 2) - percentile_95_latency = round(percentile_95_latency, 2) - - if model_name not in INPUT_PRICE_PER_MILLION_TOKEN: - cost = sum(latency_data["data"]) * V100_x8_PRICE_PER_HOUR / 3600 - cost = round(cost, 2) - - if model_name in NO_COST_MODELS: - cost = "N/A" - - return cost, mean_latency, std_latency, percentile_95_latency - - -def generate_leaderboard_csv(leaderboard_table, output_path): - data = [] - for model_name, value in leaderboard_table.items(): - model_name_escaped = model_name.replace("_", "/") - - python_simple_ast = value.get("simple", {"accuracy": 0, "total_count": 0}) - python_multiple_ast = value.get( - "multiple_function", {"accuracy": 0, "total_count": 0} - ) - python_parallel_ast = value.get( - "parallel_function", {"accuracy": 0, "total_count": 0} - ) - python_parallel_multiple_ast = value.get( - "parallel_multiple_function", {"accuracy": 0, "total_count": 0} - ) - python_simple_exec = value.get( - "executable_simple", {"accuracy": 0, "total_count": 0} - ) - python_multiple_exec = value.get( - "executable_multiple_function", {"accuracy": 0, "total_count": 0} - ) - python_parallel_exec = value.get( - "executable_parallel_function", {"accuracy": 0, "total_count": 0} - ) - python_parallel_multiple_exec = value.get( - "executable_parallel_multiple_function", {"accuracy": 0, "total_count": 0} - ) - java_simple_ast = value.get("java", {"accuracy": 0, "total_count": 0}) - javascript_simple_ast = value.get( - "javascript", {"accuracy": 0, "total_count": 0} - ) - rest_simple_exec = value.get("rest", {"accuracy": 0, "total_count": 0}) - relevance = value.get("relevance", {"accuracy": 0, "total_count": 0}) - - cost_data = value.get("cost", {"input_data": [], "output_data": []}) - latency_data = value.get("latency", {"data": []}) - - simple_ast = calculate_weighted_accuracy( - [python_simple_ast, java_simple_ast, javascript_simple_ast] - ) - multiple_ast = python_multiple_ast - 
parallel_ast = python_parallel_ast - parallel_multiple_ast = python_parallel_multiple_ast - simple_exec = calculate_weighted_accuracy( - [python_simple_exec, rest_simple_exec] - ) - multiple_exec = python_multiple_exec - parallel_exec = python_parallel_exec - parallel_multiple_exec = python_parallel_multiple_exec - - summary_ast = calculate_unweighted_accuracy( - [simple_ast, multiple_ast, parallel_ast, parallel_multiple_ast] - ) - summary_exec = calculate_unweighted_accuracy( - [simple_exec, multiple_exec, parallel_exec, parallel_multiple_exec] - ) - overall_accuracy = calculate_weighted_accuracy( - [ - simple_ast, - multiple_ast, - parallel_ast, - parallel_multiple_ast, - simple_exec, - multiple_exec, - parallel_exec, - parallel_multiple_exec, - relevance, - ] - ) - - cost, latency_mean, latency_std, percentile_95_latency = get_metric( - model_name_escaped, cost_data, latency_data - ) - - if overall_accuracy["total_count"] != 1700: - print("-" * 100) - print( - f"❗️Warning: Total count for {model_name} is {overall_accuracy['total_count']}" - ) - - data.append( - [ - "N/A", - overall_accuracy["accuracy"], - MODEL_METADATA_MAPPING[model_name_escaped][0], - MODEL_METADATA_MAPPING[model_name_escaped][1], - MODEL_METADATA_MAPPING[model_name_escaped][2], - MODEL_METADATA_MAPPING[model_name_escaped][3], - summary_ast["accuracy"], - summary_exec["accuracy"], - simple_ast["accuracy"], - python_simple_ast["accuracy"], - java_simple_ast["accuracy"], - javascript_simple_ast["accuracy"], - multiple_ast["accuracy"], - parallel_ast["accuracy"], - parallel_multiple_ast["accuracy"], - simple_exec["accuracy"], - python_simple_exec["accuracy"], - rest_simple_exec["accuracy"], - multiple_exec["accuracy"], - parallel_exec["accuracy"], - parallel_multiple_exec["accuracy"], - relevance["accuracy"], - cost, - latency_mean, - latency_std, - percentile_95_latency, - ] - ) - - data.sort(key=lambda x: x[1], reverse=True) - for i in range(len(data)): - data[i][0] = str(i + 1) - data[i][1] = "{:.2f}%".format(data[i][1] * 100) - for j in range(6, len(data[i]) - 4): - data[i][j] = "{:.2f}%".format(data[i][j] * 100) - for j in range(len(data[i]) - 4, len(data[i])): - data[i][j] = str(data[i][j]) - - data.insert(0, COLUMNS) - - filepath = os.path.join(output_path, "data.csv") - with open(filepath, "w") as f: - for i, row in enumerate(data): - if i < len(data) - 1: - f.write(",".join(row) + "\n") - else: - f.write(",".join(row)) - - -def update_leaderboard_table_with_score_file(leaderboard_table, score_path): - - entries = os.scandir(score_path) - - # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] - - # Traverse each subdirectory - for subdir in subdirs: - # Pattern to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - model_name = subdir.split(score_path)[1] - # Find and process all JSON files in the subdirectory - for model_score_json in glob.glob(json_files_pattern): - metadata = load_file(model_score_json)[0] - accuracy, total_count = metadata["accuracy"], metadata["total_count"] - test_category = model_score_json.split("_score.json")[0].split("/")[-1] - if model_name not in leaderboard_table: - leaderboard_table[model_name] = {} - if test_category not in leaderboard_table[model_name]: - leaderboard_table[model_name][test_category] = { - "accuracy": accuracy, - "total_count": total_count, - } - - -def oss_file_formatter(input_file_path, output_dir): - data = load_file(input_file_path) - assert len(data) == 2000, "OSS result.json 
file should have 2000 entries." - - for key, value in FILENAME_INDEX_MAPPING.items(): - start, end = value - output_file = os.path.join( - output_dir, f"gorilla_openfunctions_v1_test_{key}_result.json" - ) - with open(output_file, "w") as f: - original_idx = 0 - for i in range(start, end + 1): - new_json = {"id": original_idx, "result": data[i]["text"]} - f.write(json.dumps(new_json) + "\n") - original_idx += 1 - - -def collapse_json_objects(file_path): - with open(file_path, "r") as file: - content = file.read() - - objects = [] - depth = 0 - obj_start = 0 - for i, char in enumerate(content): - if char == "{": - if depth == 0: - obj_start = i - depth += 1 - elif char == "}": - depth -= 1 - if depth == 0: - obj = content[obj_start : i + 1] - objects.append(obj) - - with open(file_path, "w") as out_file: - for obj in objects: - json_obj = json.loads(obj) - compact_json = json.dumps(json_obj, separators=(",", ":")) - out_file.write(compact_json + "\n") diff --git a/berkeley-function-call-leaderboard/eval_data_compilation.py b/berkeley-function-call-leaderboard/eval_data_compilation.py deleted file mode 100644 index 4338faac2..000000000 --- a/berkeley-function-call-leaderboard/eval_data_compilation.py +++ /dev/null @@ -1,37 +0,0 @@ -import json - -data = [] -""" - Compile evaluation data into a single file -""" - -test_files = [ - "executable_parallel_function", - "parallel_multiple_function", - "executable_simple", - "rest", - "sql", - "parallel_function", - "chatable", - "java", - "javascript", - "executable_multiple_function", - "simple", - "relevance", - "executable_parallel_multiple_function", - "multiple_function", -] - -for test_name in test_files: - with open(f"./data/gorilla_openfunctions_v1_test_{test_name}.json", "r") as file: - for line in file: - item = json.loads(line) - item["question_type"] = test_name - data.append(item) - -with open("./eval_data_total.json", "w") as file: - for item in data: - file.write(json.dumps(item)) - file.write("\n") - -print("Data successfully compiled into eval_data_total.json 🦍") diff --git a/berkeley-function-call-leaderboard/function_credential_config.json b/berkeley-function-call-leaderboard/function_credential_config.json deleted file mode 100644 index 9d36e9bbd..000000000 --- a/berkeley-function-call-leaderboard/function_credential_config.json +++ /dev/null @@ -1 +0,0 @@ -[{"RAPID-API-KEY" : ""},{"EXCHANGERATE-API-KEY" : ""},{"OMDB-API-KEY" : ""}, {"GEOCODE-API-KEY": ""}] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/model_handler/arctic_handler.py b/berkeley-function-call-leaderboard/model_handler/arctic_handler.py deleted file mode 100644 index fdfd9d219..000000000 --- a/berkeley-function-call-leaderboard/model_handler/arctic_handler.py +++ /dev/null @@ -1,41 +0,0 @@ -from model_handler.nvidia_handler import NvidiaHandler -from model_handler.utils import ast_parse - -class ArcticHandler(NvidiaHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - def decode_ast(self, result, language="Python"): - result = result.replace("\n", "") - if not result.startswith("["): - result = "[ " + result - if not result.endswith("]"): - result = result + " ]" - if result.startswith("['"): - result = result.replace("['", "[") - result = result.replace("', '", ", ") - result = result.replace("','", ", ") - if result.endswith("']"): - result = result.replace("']", "]") - decode_output = ast_parse(result, language) - return 
decode_output - - def decode_execute(self, result, language="Python"): - result = result.replace("\n", "") - if not result.startswith("["): - result = "[ " + result - if not result.endswith("]"): - result = result + " ]" - if result.startswith("['"): - result = result.replace("['", "[") - result = result.replace("', '", ", ") - result = result.replace("','", ", ") - if result.endswith("']"): - result = result.replace("']", "]") - decode_output = ast_parse(result, language) - execution_list = [] - for function_call in decode_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py b/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py deleted file mode 100644 index be108408b..000000000 --- a/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py +++ /dev/null @@ -1,88 +0,0 @@ -import json -import os -import time - -from anthropic import Anthropic -from anthropic.types import TextBlock, ToolUseBlock -from model_handler.claude_prompt_handler import ClaudePromptingHandler -from model_handler.constant import GORILLA_TO_OPENAPI -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - ast_parse, - augment_prompt_by_languge, - convert_to_function_call, - convert_to_tool, - language_specific_pre_processing, -) - - -class ClaudeFCHandler(BaseHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Anthropic_Prompt - - self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) - - def inference(self, prompt, functions, test_category): - if "FC" not in self.model_name: - handler = ClaudePromptingHandler(self.model_name, self.temperature, self.top_p, self.max_tokens) - return handler.inference(prompt, functions, test_category) - else: - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, True) - if type(functions) is not list: - functions = [functions] - claude_tool = convert_to_tool( - functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True - ) - message = [{"role": "user", "content": prompt}] - start_time = time.time() - - response = self.client.messages.create( - model=self.model_name.strip("-FC"), - max_tokens=self.max_tokens, - tools=claude_tool, - messages=message, - ) - latency = time.time() - start_time - text_outputs = [] - tool_call_outputs = [] - for content in response.content: - if isinstance(content, TextBlock): - text_outputs.append(content.text) - elif isinstance(content, ToolUseBlock): - tool_call_outputs.append({content.name: json.dumps(content.input)}) - result = tool_call_outputs if tool_call_outputs else text_outputs[0] - return result, {"input_tokens": response.usage.input_tokens, "output_tokens": response.usage.output_tokens, "latency": latency} - - def decode_ast(self,result,language="Python"): - if "FC" not in self.model_name: - decoded_output = ast_parse(result,language) - else: - decoded_output = [] - for invoked_function in result: - name = list(invoked_function.keys())[0] - params = json.loads(invoked_function[name]) - if language == "Python": - pass - else: - # all values of the json are casted to string for java and 
javascript - for key in params: - params[key] = str(params[key]) - decoded_output.append({name: params}) - return decoded_output - - def decode_execute(self,result): - if "FC" not in self.model_name: - decoded_output = ast_parse(result) - execution_list = [] - for function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list - else: - function_call = convert_to_function_call(result) - return function_call diff --git a/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py b/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py deleted file mode 100644 index bedeb03a8..000000000 --- a/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py +++ /dev/null @@ -1,46 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import convert_to_function_call, ast_parse -import re - - -class DeepseekHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - formatted_prompt = """ - You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n - ### Instruction:\n - You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed-\n - {function}\n - Here is the question: {prompt}\n - Your job is to solve the above question using ONLY and strictly ONE line of python code given the above functions. 
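Several handlers in this diff (Arctic above, the Claude prompting path, and the open-source handlers further down) end `decode_execute` with the same conversion from a decoded `[{function_name: {param: value}}]` list into executable call strings. A small sketch of that shared step, with invented function names:

```python
# Sketch of the shared decode_execute conversion used by the handlers in
# this diff: [{name: {params}}] -> ["name(param=value, ...)"].
# The function and parameter names below are invented for the example.

def to_call_strings(decoded_output):
    execution_list = []
    for function_call in decoded_output:
        for name, params in function_call.items():
            args = ",".join(f"{k}={repr(v)}" for k, v in params.items())
            execution_list.append(f"{name}({args})")
    return execution_list

decoded = [
    {"calculate_triangle_area": {"base": 10, "height": 5}},
    {"get_weather": {"city": "Berkeley", "unit": "celsius"}},
]
print(to_call_strings(decoded))
# ["calculate_triangle_area(base=10,height=5)",
#  "get_weather(city='Berkeley',unit='celsius')"]
```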
If you think no function should be invoked return "[]".\n - If you think one or more function should be invoked, return the function call in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)] wrapped in python code" - ### Response:\n - """ - return formatted_prompt.format(function=function, prompt=prompt) - - def inference( - self, question_file, test_category, num_gpus, fromat_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, fromat_prompt_func - ) - - def decode_ast(self, result, language="Python"): - function_call = result.split("```")[1] - matches = re.findall(r"\[[^\]]*\]", function_call) - decoded_output = ast_parse(matches[0], language) - return decoded_output - - def decode_execute(self, result): - function_call = result.split("```")[1] - matches = re.findall(r"\[[^\]]*\]", function_call) - decoded_output = ast_parse(matches[0]) - execution_list = [] - for function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/functionary_handler.py b/berkeley-function-call-leaderboard/model_handler/functionary_handler.py deleted file mode 100644 index 2213e758d..000000000 --- a/berkeley-function-call-leaderboard/model_handler/functionary_handler.py +++ /dev/null @@ -1,26 +0,0 @@ -from model_handler.gpt_handler import OpenAIHandler -from model_handler.model_style import ModelStyle -import os, json -from openai import OpenAI - -# For setup instructions, please refer to https://github.com/MeetKai/functionary for setup details. -class FunctionaryHandler(OpenAIHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - self.temperature = temperature - self.top_p = top_p - self.max_tokens = max_tokens - self.model_name = model_name - self.model_style = ModelStyle.OpenAI - - self.client = OpenAI(base_url="http://localhost:8000/v1", api_key="functionary") - - def write(self, result, file_to_open): - model_name = self.model_name - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists("./result/" + model_name.replace("/", "_")): - os.mkdir("./result/" + model_name.replace("/", "_")) - with open( - "./result/" + model_name.replace("/", "_") + "/" + file_to_open, "a+" - ) as f: - f.write(json.dumps(result) + "\n") diff --git a/berkeley-function-call-leaderboard/model_handler/gemma_handler.py b/berkeley-function-call-leaderboard/model_handler/gemma_handler.py deleted file mode 100644 index fdb1f55d9..000000000 --- a/berkeley-function-call-leaderboard/model_handler/gemma_handler.py +++ /dev/null @@ -1,55 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import ast_parse -import re - - -class GemmaHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - formatted_prompt = """ - user\n - You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed-\n - {function}\n - Here is the questions you need to answer:\n - {prompt}\n - Your job is to solve the above question using ONLY and strictly ONE line of python code given the above functions. 
If you think no function should be invoked return "[]".\n - If you think one or more function should be invoked, return the function call in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)] wrapped in python code" - \n - model\n - """ - return formatted_prompt.format(function=function, prompt=prompt) - - def inference( - self, question_file, test_category, num_gpus, fromat_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, fromat_prompt_func - ) - - def decode_ast(self, result, language="Python"): - pattern = r"\[(.*)\]" - - # Searching for the pattern in the input text - match = re.search(pattern, result, re.DOTALL) - raw_input = match.group(1) - func = "[" + raw_input + "]" - decoded_output = ast_parse(func, language=language) - return decoded_output - - def decode_execute(self, result): - pattern = r"\[(.*)\]" - - # Searching for the pattern in the input text - match = re.search(pattern, result, re.DOTALL) - raw_input = match.group(1) - func = "[" + raw_input + "]" - decoded_output = ast_parse(func) - execution_list = [] - for function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/glaive_handler.py b/berkeley-function-call-leaderboard/model_handler/glaive_handler.py deleted file mode 100644 index b5cdc6f7c..000000000 --- a/berkeley-function-call-leaderboard/model_handler/glaive_handler.py +++ /dev/null @@ -1,45 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import convert_to_function_call -import json - - -class GlaiveHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - formatted_prompt = """ - SYSTEM: You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed- - {function}\n - USER: {prompt}\n - """ - return formatted_prompt.format(function=function, prompt=prompt) - - def inference( - self, question_file, test_category, num_gpus, fromat_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, fromat_prompt_func - ) - - def decode_ast(self, result, language="Python"): - function_call = result.split("")[-1] - function_call = function_call.replace("'", "") - decoded_function = json.loads(function_call) - for key, value in decoded_function["arguments"].items(): - if language == "Python": - pass - else: - # all values of the json are casted to string for java and javascript - decoded_function["arguments"][key] = str( - decoded_function["arguments"][key] - ) - decoded_result = [{decoded_function["name"]: decoded_function["arguments"]}] - return decoded_result - - def decode_execute(self, result): - function_call = result.split("")[-1] - function_call = function_call.replace("'", "") - decoded_function = json.loads(function_call) - decoded_result = [{decoded_function["name"]: decoded_function["arguments"]}] - return convert_to_function_call(decoded_result) diff --git a/berkeley-function-call-leaderboard/model_handler/handler.py b/berkeley-function-call-leaderboard/model_handler/handler.py deleted file mode 100644 index dcad5eeda..000000000 --- 
a/berkeley-function-call-leaderboard/model_handler/handler.py +++ /dev/null @@ -1,50 +0,0 @@ -from model_handler.model_style import ModelStyle -import json, os - - -class BaseHandler: - model_name: str - model_style: ModelStyle - - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - self.model_name = model_name - self.temperature = temperature - self.top_p = top_p - self.max_tokens = max_tokens - - def inference(self, prompt, functions, test_category): - # This method is used to retrive model response for each model. - pass - - def decode_ast(self, result, language="Python"): - # This method takes raw model output and convert it to standard AST checker input. - pass - - def decode_execute(self, result): - # This method takes raw model output and convert it to standard execute checker input. - pass - - def write(self, result, file_to_open): - # This method is used to write the result to the file. - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists("./result/" + self.model_name): - os.mkdir("./result/" + self.model_name) - with open( - "./result/" - + self.model_name - + "/" - + file_to_open.replace(".json", "_result.json"), - "a+", - ) as f: - f.write(json.dumps(result) + "\n") - - def load_result(self, test_category): - # This method is used to load the result from the file. - result_list = [] - with open( - f"./result/{self.model_name}/gorilla_openfunctions_v1_test_{test_category}_result.json" - ) as f: - for line in f: - result_list.append(json.loads(line)) - return result_list diff --git a/berkeley-function-call-leaderboard/model_handler/handler_map.py b/berkeley-function-call-leaderboard/model_handler/handler_map.py deleted file mode 100644 index bc72c105b..000000000 --- a/berkeley-function-call-leaderboard/model_handler/handler_map.py +++ /dev/null @@ -1,79 +0,0 @@ -from model_handler.arctic_handler import ArcticHandler -from model_handler.claude_fc_handler import ClaudeFCHandler -from model_handler.claude_prompt_handler import ClaudePromptingHandler -from model_handler.cohere_handler import CohereHandler -from model_handler.databricks_handler import DatabricksHandler -from model_handler.deepseek_handler import DeepseekHandler -from model_handler.firework_ai_handler import FireworkAIHandler -from model_handler.functionary_handler import FunctionaryHandler -from model_handler.gemini_handler import GeminiHandler -from model_handler.gemma_handler import GemmaHandler -from model_handler.glaive_handler import GlaiveHandler -from model_handler.gorilla_handler import GorillaHandler -from model_handler.gpt_handler import OpenAIHandler -from model_handler.hermes_handler import HermesHandler -from model_handler.llama_handler import LlamaHandler -from model_handler.mistral_handler import MistralHandler -from model_handler.nexus_handler import NexusHandler -from model_handler.oss_handler import OSSHandler -from model_handler.nvidia_handler import NvidiaHandler - -handler_map = { - "gorilla-openfunctions-v0": GorillaHandler, - "gorilla-openfunctions-v2": GorillaHandler, - "gpt-4o-2024-05-13": OpenAIHandler, - "gpt-4o-2024-05-13-FC": OpenAIHandler, - "gpt-4-turbo-2024-04-09-FC": OpenAIHandler, - "gpt-4-turbo-2024-04-09": OpenAIHandler, - "gpt-4-1106-preview-FC": OpenAIHandler, - "gpt-4-1106-preview": OpenAIHandler, - "gpt-4-0125-preview-FC": OpenAIHandler, - "gpt-4-0125-preview": OpenAIHandler, - "gpt-4-0613-FC": OpenAIHandler, - "gpt-4-0613": OpenAIHandler, - "gpt-3.5-turbo-0125-FC": OpenAIHandler, - "gpt-3.5-turbo-0125": 
OpenAIHandler, - "claude-2.1": ClaudePromptingHandler, - "claude-instant-1.2": ClaudePromptingHandler, - "claude-3-opus-20240229": ClaudePromptingHandler, - "claude-3-opus-20240229-FC": ClaudeFCHandler, - "claude-3-sonnet-20240229": ClaudePromptingHandler, - "claude-3-sonnet-20240229-FC": ClaudeFCHandler, - "claude-3-haiku-20240307": ClaudePromptingHandler, - "claude-3-haiku-20240307-FC": ClaudeFCHandler, - "claude-3-5-sonnet-20240620": ClaudePromptingHandler, - "claude-3-5-sonnet-20240620-FC": ClaudeFCHandler, - "mistral-large-2402": MistralHandler, - "mistral-large-2402-FC-Any": MistralHandler, - "mistral-large-2402-FC-Auto": MistralHandler, - "mistral-medium-2312": MistralHandler, - "mistral-small-2402": MistralHandler, - "mistral-small-2402-FC-Any": MistralHandler, - "mistral-small-2402-FC-Auto": MistralHandler, - "mistral-tiny-2312": MistralHandler, - "firefunction-v1-FC": FireworkAIHandler, - "firefunction-v2-FC": FireworkAIHandler, - "Nexusflow-Raven-v2": NexusHandler, - "gemini-1.0-pro": GeminiHandler, - "gemini-1.5-pro-preview-0409": GeminiHandler, - "gemini-1.5-pro-preview-0514": GeminiHandler, - "gemini-1.5-flash-preview-0514": GeminiHandler, - "gemma": OSSHandler, - "google/gemma-7b-it": GemmaHandler, - "glaiveai/glaive-function-calling-v1": GlaiveHandler, - "deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler, - "meetkai/functionary-small-v2.2-FC": FunctionaryHandler, - "meetkai/functionary-medium-v2.2-FC": FunctionaryHandler, - "meetkai/functionary-small-v2.4-FC": FunctionaryHandler, - "meetkai/functionary-medium-v2.4-FC": FunctionaryHandler, - "databricks-dbrx-instruct": DatabricksHandler, - "NousResearch/Hermes-2-Pro-Mistral-7B": HermesHandler, - "meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler, - "meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler, - "command-r-plus-FC": CohereHandler, - "command-r-plus": CohereHandler, - "command-r-plus-FC-optimized": CohereHandler, - "command-r-plus-optimized": CohereHandler, - "snowflake/arctic": ArcticHandler, - "nvidia/nemotron-4-340b-instruct": NvidiaHandler, -} diff --git a/berkeley-function-call-leaderboard/model_handler/hermes_handler.py b/berkeley-function-call-leaderboard/model_handler/hermes_handler.py deleted file mode 100644 index 4d59555cd..000000000 --- a/berkeley-function-call-leaderboard/model_handler/hermes_handler.py +++ /dev/null @@ -1,92 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import convert_to_tool -from model_handler.constant import GORILLA_TO_OPENAPI -from model_handler.model_style import ModelStyle -import json - - -class HermesHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - # Hermes use Langchain to OpenAI conversion. It does not use tool call but function call. - function = convert_to_tool( - function, GORILLA_TO_OPENAPI, ModelStyle.OSSMODEL, test_category, True - ) - pydantic_format = """{"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}""" - tool_call_format = """{"arguments": , "name": }""" - formatted_prompt = """ -<|im_start|>system -You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. 
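The `handler_map` above is a plain name-to-class registry; `get_handler` in the eval checker and `build_handler` in the runner simply look the model name up and instantiate the class. A minimal sketch of the same dispatch pattern, with a stand-in class instead of the real handlers:

```python
# Sketch of the registry dispatch behind handler_map / get_handler /
# build_handler above. DummyHandler and the registry keys are stand-ins,
# not real leaderboard entries.

class DummyHandler:
    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000):
        self.model_name = model_name
        self.temperature = temperature

    def inference(self, prompt, functions, test_category):
        return "[]", {"input_tokens": 0, "output_tokens": 0, "latency": 0.0}

registry = {
    "example-model-prompt": DummyHandler,
    "example-model-FC": DummyHandler,
}

def build_handler(model_name, temperature, top_p, max_tokens):
    # Same shape as the real build_handler: look the class up, instantiate it.
    return registry[model_name](model_name, temperature, top_p, max_tokens)

handler = build_handler("example-model-FC", 0.7, 1, 1200)
print(type(handler).__name__, handler.model_name)
```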
Don't make assumptions about what values to plug into functions. Here are the available tools: {function} Use the following pydantic model json schema for each tool call you will make: {pydantic_format} For each function call return a json object with function name and arguments within XML tags as follows: - -{tool_call_format} - -<|im_end|> -<|im_start|>user -{prompt} -<|im_end|> - """ - return formatted_prompt.format( - function=function, - pydantic_format=pydantic_format, - tool_call_format=tool_call_format, - prompt=prompt, - ) - - def inference( - self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, format_prompt_func - ) - - def decode_ast(self, result, language="Python"): - lines = result.split("\n") - flag = False - func_call = [] - for line in lines: - if "" == line: - flag = True - elif "" == line: - flag = False - else: - if flag: - line = line.replace("'", '"') - tool_result = json.loads(line) - if language == "Python": - pass - else: - # all values of the json are casted to string for java and javascript - for key in tool_result["arguments"]: - tool_result["arguments"][key] = str( - tool_result["arguments"][key] - ) - func_call.append({tool_result["name"]: tool_result["arguments"]}) - flag = False - return func_call - - def decode_execute(self, result): - lines = result.split("\n") - flag = False - function_call_list = [] - for line in lines: - if "" == line: - flag = True - elif "" == line: - flag = False - else: - if flag: - line = line.replace("'", '"') - tool_result = json.loads(line) - function_call_list.append( - {tool_result["name"]: tool_result["arguments"]} - ) - flag = False - execution_list = [] - for function_call in function_call_list: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k,v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/llama_handler.py b/berkeley-function-call-leaderboard/model_handler/llama_handler.py deleted file mode 100644 index 7b1e3fd5c..000000000 --- a/berkeley-function-call-leaderboard/model_handler/llama_handler.py +++ /dev/null @@ -1,48 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import ast_parse -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, -) - - -class LlamaHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - conversations = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>{SYSTEM_PROMPT_FOR_CHAT_MODEL}<|eot_id|><|start_header_id|>user<|end_header_id|>{USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, functions=str(function))}<|eot_id|><|start_header_id|>assistant<|end_header_id|>""" - return conversations - - def inference( - self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, format_prompt_func - ) - - def decode_ast(self, result, language="Python"): - func = result - func = func.replace("\n", "") # remove new line characters - if not func.startswith("["): - func = "[" + func - if not func.endswith("]"): - func = func + "]" - decoded_output = ast_parse(func, language) - return decoded_output - - def decode_execute(self, result): 
- func = result - func = func.replace("\n", "") # remove new line characters - if not func.startswith("["): - func = "[" + func - if not func.endswith("]"): - func = func + "]" - decode_output = ast_parse(func) - execution_list = [] - for function_call in decode_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/model_style.py b/berkeley-function-call-leaderboard/model_handler/model_style.py deleted file mode 100644 index 81b8e30f1..000000000 --- a/berkeley-function-call-leaderboard/model_handler/model_style.py +++ /dev/null @@ -1,14 +0,0 @@ -from enum import Enum - - -class ModelStyle(Enum): - Gorilla = "gorilla" - OpenAI = "gpt" - Anthropic_FC = "claude" - Anthropic_Prompt = "claude" - Mistral = "mistral" - Google = "google" - FIREWORK_AI = "firework_ai" - NEXUS = "nexus" - OSSMODEL = "ossmodel" - COHERE = "cohere" diff --git a/berkeley-function-call-leaderboard/model_handler/oss_handler.py b/berkeley-function-call-leaderboard/model_handler/oss_handler.py deleted file mode 100644 index 206107878..000000000 --- a/berkeley-function-call-leaderboard/model_handler/oss_handler.py +++ /dev/null @@ -1,152 +0,0 @@ -import json -import os - -import ray -import shortuuid -import torch -from eval_checker.eval_checker_constant import FILENAME_INDEX_MAPPING -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - ast_parse, - augment_prompt_by_languge, - language_specific_pre_processing, -) - -class OSSHandler(BaseHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.OSSMODEL - self._init_model() - - def _init_model(self): - ray.init(ignore_reinit_error=True, num_cpus=8) - - def _format_prompt(prompt, function, test_category): - SYSTEM_PROMPT = """ - You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed- - """ - functions = "" - if isinstance(function, list): - for idx, func in enumerate(function): - functions += "\n" + str(func) - else: - functions += "\n" + str(function) - return f"SYSTEM: {SYSTEM_PROMPT}\n{functions}\nUSER: {prompt}\nASSISTANT: " - - @ray.remote(num_gpus=1) - @torch.inference_mode() - def _batch_generate( - question_jsons, - test_category, - model_path, - temperature, - max_tokens, - top_p, - format_prompt_func, - index, - ): - from vllm import LLM, SamplingParams - - prompts = [] - ans_jsons = [] - for line in question_jsons: - for key, value in FILENAME_INDEX_MAPPING.items(): - start, end = value - if index >= start and index < end: - test_category = key - break - ques_json = line - prompt = augment_prompt_by_languge(ques_json["question"], test_category) - functions = language_specific_pre_processing( - ques_json["function"], test_category, False - ) - prompts.append(format_prompt_func(prompt, functions, test_category)) - ans_id = shortuuid.uuid() - ans_jsons.append( - { - "answer_id": ans_id, - "question": ques_json["question"], - } - ) - - print("start generating: ", len(prompts)) - sampling_params = SamplingParams( - temperature=temperature, max_tokens=max_tokens, top_p=top_p - ) - llm = LLM(model=model_path, dtype="float16", trust_remote_code=True) - outputs = llm.generate(prompts, sampling_params) - final_ans_jsons 
= [] - for output, ans_json in zip(outputs, ans_jsons): - text = output.outputs[0].text - ans_json["text"] = text - final_ans_jsons.append(ans_json) - return final_ans_jsons - - def inference( - self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt - ): - - ques_jsons = [] - with open(question_file, "r") as ques_file: - for line in ques_file: - ques_jsons.append(json.loads(line)) - - chunk_size = len(ques_jsons) // num_gpus - ans_handles = [] - for i in range(0, len(ques_jsons), chunk_size): - ans_handles.append( - self._batch_generate.remote( - ques_jsons[i : i + chunk_size], - test_category, - self.model_name, - self.temperature, - self.max_tokens, - self.top_p, - format_prompt_func, - i, - ) - ) - ans_jsons = [] - for ans_handle in ans_handles: - ans_jsons.extend(ray.get(ans_handle)) - - return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0} - - def decode_ast(self, result, language="Python"): - func = result - if " " == func[0]: - func = func[1:] - if not func.startswith("["): - func = "[" + func - if not func.endswith("]"): - func = func + "]" - decode_output = ast_parse(func, language) - return decode_output - - def decode_execute(self, result): - return result - - def write(self, result, file_to_open): - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists("./result/" + self.model_name.replace("/", "_")): - os.mkdir("./result/" + self.model_name.replace("/", "_")) - with open( - "./result/" + self.model_name.replace("/", "_") + "/" + file_to_open, "a+" - ) as f: - f.write(json.dumps(result) + "\n") - - def load_result(self, test_category): - eval_data = [] - with open("./eval_data_total.json") as f: - for line in f: - eval_data.append(json.loads(line)) - result_list = [] - idx = 0 - with open(f"./result/{self.model_name}/result.json") as f: - for line in f: - if eval_data[idx]["test_category"] == test_category: - result_list.append(json.loads(line)) - idx += 1 - return result_list diff --git a/berkeley-function-call-leaderboard/openfunctions_evaluation.py b/berkeley-function-call-leaderboard/openfunctions_evaluation.py deleted file mode 100644 index cbf0f7f84..000000000 --- a/berkeley-function-call-leaderboard/openfunctions_evaluation.py +++ /dev/null @@ -1,110 +0,0 @@ -import argparse, json, os -from tqdm import tqdm -from model_handler.handler_map import handler_map -from model_handler.model_style import ModelStyle -from model_handler.constant import USE_COHERE_OPTIMIZATION - - -def get_args(): - parser = argparse.ArgumentParser() - # Refer to model_choice for supported models. - parser.add_argument("--model", type=str, default="gorilla-openfunctions-v2") - # Refer to test_categories for supported categories. - parser.add_argument("--test-category", type=str, default="all") - - # Parameters for the model that you want to test. 
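`OSSHandler.inference` above splits the parsed questions into `num_gpus` slices and hands each slice to a Ray/vLLM worker. A dependency-free sketch of just the chunking arithmetic, which floors the chunk size and therefore yields a small trailing slice whenever the question count is not divisible by `num_gpus` (and would hit a zero-step `range` if there were fewer questions than GPUs):

```python
# Sketch of the slice-per-GPU split in OSSHandler.inference above, with
# plain lists standing in for the Ray remote calls.

def chunk_for_gpus(items, num_gpus):
    chunk_size = len(items) // num_gpus        # floor division, as above
    if chunk_size == 0:
        # Fewer items than GPUs: fall back to one chunk so range() gets a
        # non-zero step. (The original assumes len(items) >= num_gpus.)
        return [items]
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

questions = list(range(10))          # stand-in for the parsed question JSONs
chunks = chunk_for_gpus(questions, num_gpus=3)
print([len(c) for c in chunks])      # [3, 3, 3, 1] -> remainder becomes a 4th slice
```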
- parser.add_argument("--temperature", type=float, default=0.7) - parser.add_argument("--top-p", type=float, default=1) - parser.add_argument("--max-tokens", type=int, default=1200) - parser.add_argument("--num-gpus", default=1, type=int) - parser.add_argument("--timeout", default=60, type=int) - - args = parser.parse_args() - return args - - -test_categories = { - "executable_simple": "gorilla_openfunctions_v1_test_executable_simple.json", - "executable_parallel_function": "gorilla_openfunctions_v1_test_executable_parallel_function.json", - "executable_multiple_function": "gorilla_openfunctions_v1_test_executable_multiple_function.json", - "executable_parallel_multiple_function": "gorilla_openfunctions_v1_test_executable_parallel_multiple_function.json", - "simple": "gorilla_openfunctions_v1_test_simple.json", - "relevance": "gorilla_openfunctions_v1_test_relevance.json", - "parallel_function": "gorilla_openfunctions_v1_test_parallel_function.json", - "multiple_function": "gorilla_openfunctions_v1_test_multiple_function.json", - "parallel_multiple_function": "gorilla_openfunctions_v1_test_parallel_multiple_function.json", - "java": "gorilla_openfunctions_v1_test_java.json", - "javascript": "gorilla_openfunctions_v1_test_javascript.json", - "rest": "gorilla_openfunctions_v1_test_rest.json", - "sql": "gorilla_openfunctions_v1_test_sql.json", -} - - -def build_handler(model_name, temperature, top_p, max_tokens): - handler = handler_map[model_name](model_name, temperature, top_p, max_tokens) - return handler - - -def load_file(test_category): - if test_category == "all": - test_cate, files_to_open = list(test_categories.keys()), list( - test_categories.values() - ) - else: - test_cate, files_to_open = [test_category], [test_categories[test_category]] - return test_cate, files_to_open - - -if __name__ == "__main__": - args = get_args() - if USE_COHERE_OPTIMIZATION and "command-r-plus" in args.model: - args.model = args.model + "-optimized" - handler = build_handler(args.model, args.temperature, args.top_p, args.max_tokens) - if handler.model_style == ModelStyle.OSSMODEL: - result = handler.inference( - question_file="eval_data_total.json", - test_category=args.test_category, - num_gpus=args.num_gpus, - ) - for res in result[0]: - handler.write(res, "result.json") - else: - test_cate, files_to_open = load_file(args.test_category) - for test_category, file_to_open in zip(test_cate, files_to_open): - print("Generating: " + file_to_open) - test_cases = [] - with open("./data/" + file_to_open) as f: - for line in f: - test_cases.append(json.loads(line)) - num_existing_result = 0 # if the result file already exists, skip the test cases that have been tested. 
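`load_file` in the runner above expands `--test-category all` into every (category, file) pair in `test_categories`, and otherwise returns the single matching pair. A tiny sketch of that resolution, shown with a trimmed-down mapping rather than the full one:

```python
# Sketch of the --test-category resolution in openfunctions_evaluation.py
# above, using a trimmed mapping for brevity.

TEST_CATEGORIES = {
    "simple": "gorilla_openfunctions_v1_test_simple.json",
    "rest": "gorilla_openfunctions_v1_test_rest.json",
}

def resolve_categories(test_category):
    if test_category == "all":
        return list(TEST_CATEGORIES.keys()), list(TEST_CATEGORIES.values())
    return [test_category], [TEST_CATEGORIES[test_category]]

print(resolve_categories("all"))
print(resolve_categories("rest"))
```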
- if os.path.exists( - "./result/" - + args.model.replace("/", "_") - + "/" - + file_to_open.replace(".json", "_result.json") - ): - with open( - "./result/" - + args.model.replace("/", "_") - + "/" - + file_to_open.replace(".json", "_result.json") - ) as f: - for line in f: - num_existing_result += 1 - for index, test_case in enumerate(tqdm(test_cases)): - if index < num_existing_result: - continue - user_question, functions = test_case["question"], test_case["function"] - if type(functions) is dict or type(functions) is str: - functions = [functions] - result, metadata = handler.inference( - user_question, functions, test_category - ) - result_to_write = { - "idx": index, - "result": result, - "input_token_count": metadata["input_tokens"], - "output_token_count": metadata["output_tokens"], - "latency": metadata["latency"], - } - handler.write(result_to_write, file_to_open) diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml new file mode 100644 index 000000000..2354df783 --- /dev/null +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -0,0 +1,48 @@ +[build-system] +requires = ["setuptools>=40.8.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "bfcl" +version = "0.1.0" +description = "Berkeley Function Calling Leaderboard (BFCL)" +authors = [ + {name="Shishir Patil", email="sgp@berkeley.edu"} +] +readme = "README.md" +requires-python = ">=3.9" +license = { "text" = "Apache 2.0" } +dependencies = [ + "requests==2.32.3", + "tqdm==4.66.4", + "numpy==1.26.4", + "pandas", + "huggingface_hub", + "pydantic>=2.8.2", + "python-dotenv>=1.0.1", + "tree_sitter==0.21.3", + "tree-sitter-java==0.21.0", + "tree-sitter-javascript==0.21.4", + "openai==1.35.13", +] + +[tool.setuptools.packages.find] +include = ["bfcl*"] + +[project.scripts] +bfcl = "bfcl.cli:main" + +[project.urls] +Repository = "https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard" + +[project.optional-dependencies] +oss_eval = ["vllm==0.5.0"] +proprietary_eval = [ + "mistralai==0.4.2", + "anthropic==0.31.1", + "cohere==5.5.8", +] +all = [ + "bfcl[oss_eval]", + "bfcl[proprietary_eval]", +] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/requirements.txt b/berkeley-function-call-leaderboard/requirements.txt deleted file mode 100644 index 974024adf..000000000 --- a/berkeley-function-call-leaderboard/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -requests -tqdm -tree_sitter~=0.21.0 -torch -ray -shortuuid -mistralai -anthropic~=0.29.0 -openai -numpy -cohere~=5.2.5 diff --git a/berkeley-function-call-leaderboard/setup.py b/berkeley-function-call-leaderboard/setup.py new file mode 100644 index 000000000..e81bcd1c6 --- /dev/null +++ b/berkeley-function-call-leaderboard/setup.py @@ -0,0 +1,4 @@ +import setuptools + +# This is to make sure that the package supports editable installs +setuptools.setup() \ No newline at end of file
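Circling back to the deleted runner for one behaviour worth spelling out: it resumes interrupted runs by counting the lines already written to the model's `_result.json` file and skipping that many test cases. A minimal sketch of that resume pattern; the file name and the `run_one` callback below are placeholders, not part of the original code:

```python
import json
import os

# Sketch of the resume-on-restart behaviour in openfunctions_evaluation.py
# above: count existing result lines, then skip that many test cases.
# "demo_result.json" stands in for ./result/<model>/<file>_result.json.

def count_existing_results(result_path):
    if not os.path.exists(result_path):
        return 0
    with open(result_path) as f:
        return sum(1 for _ in f)

def run_remaining(test_cases, result_path, run_one):
    done = count_existing_results(result_path)
    with open(result_path, "a+") as f:
        for index, case in enumerate(test_cases):
            if index < done:          # already answered in a previous run
                continue
            result = run_one(case)    # stand-in for handler.inference(...)
            f.write(json.dumps({"idx": index, "result": result}) + "\n")

# Example: four cases, re-running is safe because finished ones are skipped.
cases = [{"q": i} for i in range(4)]
run_remaining(cases, "demo_result.json", run_one=lambda c: "[]")
```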