From 5ed29e2d0b0a9c9a995725db3d53921104060e72 Mon Sep 17 00:00:00 2001
From: "Huanzhi (Hans) Mao"
Date: Sat, 21 Dec 2024 05:47:18 -0800
Subject: [PATCH] [BFCL] Skip Executable Categories When API Keys Missing (#848)

This PR improves the behavior of the generation and evaluation pipeline. When
executable categories are involved and API keys are not provided in the `.env`
file, instead of throwing an error, the affected categories will now be
skipped. This enhancement provides a smoother experience for first-time users.

1. What will happen to the overall score? What would be the difference between
   the score on the BFCL official leaderboard vs. the score without Executable?

   If the API key is not provided, that category will not be evaluated and will
   be treated as 0 by default in the overall score calculation, which means the
   overall score (and the one on the leaderboard) will be hurt if the API keys
   are not supplied. PR #849 should make things more clear.

2. What percentage of the entries are executable?

   310 in total, out of 4751 entries.

---
 .../CHANGELOG.md                     |  1 +
 .../SUPPORTED_MODELS.md              |  3 +-
 .../bfcl/__main__.py                 |  4 +-
 .../bfcl/_llm_response_generation.py | 73 ++++++++++++-------
 .../bfcl/eval_checker/eval_runner.py | 52 ++++++++-----
 .../bfcl/utils.py                    | 32 +++++++-
 6 files changed, 116 insertions(+), 49 deletions(-)

diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md
index 47696e688..195ec3777 100644
--- a/berkeley-function-call-leaderboard/CHANGELOG.md
+++ b/berkeley-function-call-leaderboard/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.
 
+- [Dec 21, 2024] [#848](https://github.com/ShishirPatil/gorilla/pull/848): Improve the behavior of the generation and evaluation pipeline. When executable categories are involved and API keys are not provided in the `.env` file, instead of throwing an error, the affected categories will now be skipped. This enhancement provides a smoother experience for first-time users.
 - [Dec 21, 2024] [#847](https://github.com/ShishirPatil/gorilla/pull/847): Add new model `watt-ai/watt-tool-8B` and `watt-ai/watt-tool-70B` to the leaderboard.
 - [Dec 20, 2024] [#842](https://github.com/ShishirPatil/gorilla/pull/842): Add the following new models to the leaderboard:
   - `Qwen/Qwen2.5-0.5B-Instruct`

diff --git a/berkeley-function-call-leaderboard/SUPPORTED_MODELS.md b/berkeley-function-call-leaderboard/SUPPORTED_MODELS.md
index 8ae712b3e..bf3a7c927 100644
--- a/berkeley-function-call-leaderboard/SUPPORTED_MODELS.md
+++ b/berkeley-function-call-leaderboard/SUPPORTED_MODELS.md
@@ -89,8 +89,7 @@ Below is a comprehensive table of models supported for running leaderboard evalu
 |openbmb/MiniCPM3-4B 💻| Prompt|
 |THUDM/glm-4-9b-chat 💻| Function Calling|
 |Team-ACE/ToolACE-8B 💻| Function Calling|
-|watt-ai/watt-tool-8B 💻| Function Calling|
-|watt-ai/watt-tool-70B 💻| Function Calling|
+|watt-ai/watt-tool-{8B,70B} 💻| Function Calling|
 
 ---
 
diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py
index fce7ef586..d4fb64f30 100644
--- a/berkeley-function-call-leaderboard/bfcl/__main__.py
+++ b/berkeley-function-call-leaderboard/bfcl/__main__.py
@@ -88,7 +88,7 @@ def models():
 def generate(
     model: List[str] = typer.Option(
         ["gorilla-openfunctions-v2"],
-        help="A list of model names to evaluate. Use commas to separate multiple models.",
+        help="A list of model names to generate LLM responses for. Use commas to separate multiple models.",
         callback=handle_multiple_input
     ),
     test_category: List[str] = typer.Option(
@@ -216,7 +216,7 @@ def evaluate(
         callback=handle_multiple_input
     ),
     test_category: List[str] = typer.Option(
-        None,
+        ["all"],
         help="A list of test categories to run the evaluation on.",
         callback=handle_multiple_input
     ),
diff --git a/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py b/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py
index 360ac62f7..e4f73ea5d 100644
--- a/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py
+++ b/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py
@@ -12,7 +12,6 @@
     PROJECT_ROOT,
     PROMPT_PATH,
     RESULT_PATH,
-    TEST_COLLECTION_MAPPING,
     TEST_FILE_MAPPING,
     TEST_IDS_TO_GENERATE_PATH,
 )
@@ -20,8 +19,10 @@
 from bfcl.model_handler.handler_map import HANDLER_MAP
 from bfcl.model_handler.model_style import ModelStyle
 from bfcl.utils import (
+    check_api_key_supplied,
     is_executable,
     is_multi_turn,
+    parse_test_category_argument,
     sort_key,
 )
 from tqdm import tqdm
@@ -58,24 +59,10 @@ def build_handler(model_name, temperature):
     return handler
 
 
-def parse_test_category_argument(test_category_args):
-    test_name_total = set()
-    test_filename_total = set()
-
-    for test_category in test_category_args:
-        if test_category in TEST_COLLECTION_MAPPING:
-            for test_name in TEST_COLLECTION_MAPPING[test_category]:
-                test_name_total.add(test_name)
-                test_filename_total.add(TEST_FILE_MAPPING[test_name])
-        else:
-            test_name_total.add(test_category)
-            test_filename_total.add(TEST_FILE_MAPPING[test_category])
-
-    return sorted(list(test_name_total)), sorted(list(test_filename_total))
-
-
 def get_involved_test_entries(test_category_args, run_ids):
     all_test_file_paths, all_test_categories, all_test_entries_involved = [], [], []
+    api_key_supplied = check_api_key_supplied()
+    skipped_categories = []
     if run_ids:
         with open(TEST_IDS_TO_GENERATE_PATH) as f:
             test_ids_to_generate = json.load(f)
         for category, test_ids in test_ids_to_generate.items():
             if len(test_ids) == 0:
                 continue
             test_file_path = TEST_FILE_MAPPING[category]
             all_test_entries_involved.extend(
-                [entry for entry in load_file(PROMPT_PATH / test_file_path) if entry["id"] in test_ids]
+                [
+                    entry
+                    for entry in load_file(PROMPT_PATH / test_file_path)
+                    if entry["id"] in test_ids
+                ]
             )
-            all_test_categories.append(category)
-            all_test_file_paths.append(test_file_path)
+            # Skip executable test categories if the API keys are not provided in the .env file
+            if is_executable(category) and not api_key_supplied:
+                skipped_categories.append(category)
+            else:
+                all_test_categories.append(category)
+                all_test_file_paths.append(test_file_path)
     else:
-        all_test_categories, all_test_file_paths = parse_test_category_argument(test_category_args)
-        for test_category, file_to_open in zip(all_test_categories, all_test_file_paths):
-            all_test_entries_involved.extend(load_file(PROMPT_PATH / file_to_open))
+        all_test_file_paths, all_test_categories = parse_test_category_argument(test_category_args)
+        # Make a copy here since we are removing list elements inside the for loop
+        for test_category, file_to_open in zip(
+            all_test_categories[:], all_test_file_paths[:]
+        ):
+            if is_executable(test_category) and not api_key_supplied:
+                all_test_categories.remove(test_category)
+                all_test_file_paths.remove(file_to_open)
+                skipped_categories.append(test_category)
+            else:
+                all_test_entries_involved.extend(load_file(PROMPT_PATH / file_to_open))
 
-    return all_test_file_paths, all_test_categories, all_test_entries_involved
+    return (
+        all_test_file_paths,
+        all_test_categories,
+        all_test_entries_involved,
+        skipped_categories,
+    )
 
 
 def collect_test_cases(
@@ -263,9 +271,12 @@ def main(args):
     if type(args.test_category) is not list:
         args.test_category = [args.test_category]
 
-    all_test_file_paths, all_test_categories, all_test_entries_involved = (
-        get_involved_test_entries(args.test_category, args.run_ids)
-    )
+    (
+        all_test_file_paths,
+        all_test_categories,
+        all_test_entries_involved,
+        skipped_categories,
+    ) = get_involved_test_entries(args.test_category, args.run_ids)
 
     print(f"Generating results for {args.model}")
     if args.run_ids:
@@ -273,7 +284,15 @@
     else:
         print(f"Running full test cases for categories: {all_test_categories}.")
 
+    if len(skipped_categories) > 0:
+        print("----------")
+        print(
+            f"❗️ Note: The following executable test category entries will be skipped because they require API keys to be provided in the .env file: {skipped_categories}.\n Please refer to the README.md 'API Keys for Executable Test Categories' section for details.\n The model responses for the other categories will still be generated."
+        )
+        print("----------")
+
     # Apply function credential config if any of the test categories are executable
+    # We can know for sure that no executable categories will be included if the API keys are not supplied.
     if any([is_executable(category) for category in all_test_categories]):
         apply_function_credential_config(input_path=PROMPT_PATH)
 
diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py
index 8c2bed57d..d328a8a5a 100644
--- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py
+++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py
@@ -481,7 +481,7 @@ def runner(model_names, test_categories, api_sanity_check, result_dir, score_dir
             # Find and process all JSON files in the subdirectory
             for model_result_json in subdir.glob("*.json"):
                 test_category = extract_test_category(model_result_json)
-                if test_categories is not None and test_category not in test_categories:
+                if test_category not in test_categories:
                     continue
 
                 handler = get_handler(model_name_escaped)
@@ -614,15 +614,8 @@ def runner(model_names, test_categories, api_sanity_check, result_dir, score_dir
         API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=False
     )
 
-    print(
-        f"🏁 Evaluation completed. See {score_dir / 'data_overall.csv'} for overall evaluation results on BFCL V3."
-    )
-    print(
-        f"See {score_dir / 'data_live.csv'}, {score_dir / 'data_non_live.csv'} and {score_dir / 'data_multi_turn.csv'} for detailed evaluation results on each sub-section categories respectively."
-    )
-
-
-def main(model, test_category, api_sanity_check, result_dir, score_dir):
+def main(model, test_categories, api_sanity_check, result_dir, score_dir):
     if result_dir is None:
         result_dir = RESULT_PATH
     else:
         result_dir = (PROJECT_ROOT / result_dir).resolve()
@@ -633,14 +626,23 @@ def main(model, test_category, api_sanity_check, result_dir, score_dir):
     else:
         result_dir = (PROJECT_ROOT / score_dir).resolve()
 
-    test_categories = None
-    if test_category is not None:
-        test_categories = []
-        for category in test_category:
-            if category in TEST_COLLECTION_MAPPING:
-                test_categories.extend(TEST_COLLECTION_MAPPING[category])
+    if type(test_categories) is not list:
+        test_categories = [test_categories]
+
+    _, all_test_categories = parse_test_category_argument(test_categories)
+
+    api_key_supplied = check_api_key_supplied()
+    skipped_categories = []
+
+    for test_category in all_test_categories[:]:
+        # Skip executable test category evaluation if the API keys are not provided in the .env file
+        if is_executable(test_category) and not api_key_supplied:
+            # We can still run the REST category, since its API keys are baked into the model responses. So as long as the model responses have been generated, we can evaluate them.
+            if is_rest(test_category):
+                continue
             else:
-                test_categories.append(category)
+                all_test_categories.remove(test_category)
+                skipped_categories.append(test_category)
 
     model_names = None
     if model is not None:
@@ -651,7 +653,22 @@
             # We patch it here to avoid confusing the user.
             model_names.append(model_name.replace("/", "_"))
 
-    runner(model_names, test_categories, api_sanity_check, result_dir, score_dir)
+    # Run the evaluation for all the categories involved.
+    runner(model_names, all_test_categories, api_sanity_check, result_dir, score_dir)
+
+    if len(skipped_categories) > 0:
+        print("----------")
+        print(
+            f"❗️ Note: The following executable test categories are not evaluated because they require API keys to be provided in the .env file: {skipped_categories}.\n Please refer to the README.md 'API Keys for Executable Test Categories' section for details.\n The model responses for the other categories are still evaluated."
+        )
+        print("----------")
+
+    print(
+        f"🏁 Evaluation completed. See {score_dir / 'data_overall.csv'} for overall evaluation results on BFCL V3."
+    )
+    print(
+        f"See {score_dir / 'data_live.csv'}, {score_dir / 'data_non_live.csv'} and {score_dir / 'data_multi_turn.csv'} for detailed evaluation results on each sub-section category, respectively."
+    )
 
 
 def get_handler(model_name):
@@ -671,6 +688,7 @@
         "--test-category",
         nargs="+",
         type=str,
+        default="all",
         help="A list of test categories to run the evaluation on",
     )
     parser.add_argument(
diff --git a/berkeley-function-call-leaderboard/bfcl/utils.py b/berkeley-function-call-leaderboard/bfcl/utils.py
index c85c518ef..a1fb4ccb4 100644
--- a/berkeley-function-call-leaderboard/bfcl/utils.py
+++ b/berkeley-function-call-leaderboard/bfcl/utils.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 from typing import Union
 
-from bfcl.constant import VERSION_PREFIX
+from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING, VERSION_PREFIX
 
 
 def extract_test_category(input_string: Union[str, Path]) -> str:
@@ -178,3 +178,33 @@ def is_empty_output(decoded_output):
     if len(decoded_output) == 1 and len(decoded_output[0]) == 0:
         return True
     return False
+
+
+def check_api_key_supplied() -> bool:
+    """
+    Check whether the four API keys needed for the executable categories are provided.
+    If not, those categories will be skipped.
+    """
+    ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY")
+    for var in ENV_VARS:
+        if os.getenv(var) == "":
+            return False
+    return True
+
+
+def parse_test_category_argument(test_category_args):
+    test_name_total = set()
+    test_filename_total = set()
+
+    for test_category in test_category_args:
+        if test_category in TEST_COLLECTION_MAPPING:
+            for test_name in TEST_COLLECTION_MAPPING[test_category]:
+                test_name_total.add(test_name)
+                test_filename_total.add(TEST_FILE_MAPPING[test_name])
+        elif test_category in TEST_FILE_MAPPING:
+            test_name_total.add(test_category)
+            test_filename_total.add(TEST_FILE_MAPPING[test_category])
+        else:
+            # Invalid test category name
+            raise Exception(f"Invalid test category name provided: {test_category}")
+
+    return sorted(list(test_filename_total)), sorted(list(test_name_total))
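
Note for reviewers: the short sketch below strings together the helpers introduced in this patch to show the skip decision in isolation. It is an illustrative example, not part of the diff; it mirrors the generation-side logic, and the use of the "all" collection and the assumption that the variables from the `.env` file have already been loaded into the environment are choices made only for the demo.

# Illustrative sketch (not part of this patch): how the new helpers combine to
# decide which categories are skipped when API keys are missing.
from bfcl.utils import (
    check_api_key_supplied,
    is_executable,
    parse_test_category_argument,
)

# "all" expands to every test category via TEST_COLLECTION_MAPPING.
test_file_paths, test_categories = parse_test_category_argument(["all"])

# Assumes the .env variables (GEOCODE_API_KEY, RAPID_API_KEY, OMDB_API_KEY,
# EXCHANGERATE_API_KEY) have already been loaded into the process environment.
api_key_supplied = check_api_key_supplied()
skipped = [c for c in test_categories if is_executable(c) and not api_key_supplied]
kept = [c for c in test_categories if c not in skipped]

print(f"Skipped (need API keys in .env): {skipped}")
print(f"Will be generated/evaluated: {kept}")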