diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md index 47696e688..195ec3777 100644 --- a/berkeley-function-call-leaderboard/CHANGELOG.md +++ b/berkeley-function-call-leaderboard/CHANGELOG.md @@ -2,6 +2,7 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file. +- [Dec 21, 2024] [#848](https://github.com/ShishirPatil/gorilla/pull/848): Improves the behavior of the generation and evaluation pipeline. When executable categories are involved and API keys are not provided in the `.env` file, instead of throwing an error, the affected categories will now be skipped. This enhancement provides a smoother experience for first-time users. - [Dec 21, 2024] [#847](https://github.com/ShishirPatil/gorilla/pull/847): Add new model `watt-ai/watt-tool-8B` and `watt-ai/watt-tool-70B` to the leaderboard. - [Dec 20, 2024] [#842](https://github.com/ShishirPatil/gorilla/pull/842): Add the following new models to the leaderboard: - `Qwen/Qwen2.5-0.5B-Instruct` diff --git a/berkeley-function-call-leaderboard/SUPPORTED_MODELS.md b/berkeley-function-call-leaderboard/SUPPORTED_MODELS.md index 8ae712b3e..bf3a7c927 100644 --- a/berkeley-function-call-leaderboard/SUPPORTED_MODELS.md +++ b/berkeley-function-call-leaderboard/SUPPORTED_MODELS.md @@ -89,8 +89,7 @@ Below is a comprehensive table of models supported for running leaderboard evalu |openbmb/MiniCPM3-4B 💻| Prompt| |THUDM/glm-4-9b-chat 💻| Function Calling| |Team-ACE/ToolACE-8B 💻| Function Calling| -|watt-ai/watt-tool-8B 💻| Function Calling| -|watt-ai/watt-tool-70B 💻| Function Calling| +|watt-ai/watt-tool-{8B,70B} 💻| Function Calling| --- diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index fce7ef586..d4fb64f30 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -88,7 +88,7 @@ def models(): def generate( model: List[str] = typer.Option( ["gorilla-openfunctions-v2"], - help="A list of model names to evaluate. Use commas to separate multiple models.", + help="A list of model names to generate LLM responses for.
Use commas to separate multiple models.", callback=handle_multiple_input ), test_category: List[str] = typer.Option( @@ -216,7 +216,7 @@ def evaluate( callback=handle_multiple_input ), test_category: List[str] = typer.Option( - None, + ["all"], help="A list of test categories to run the evaluation on.", callback=handle_multiple_input ), diff --git a/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py b/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py index 360ac62f7..e4f73ea5d 100644 --- a/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py +++ b/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py @@ -12,7 +12,6 @@ PROJECT_ROOT, PROMPT_PATH, RESULT_PATH, - TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING, TEST_IDS_TO_GENERATE_PATH, ) @@ -20,8 +19,10 @@ from bfcl.model_handler.handler_map import HANDLER_MAP from bfcl.model_handler.model_style import ModelStyle from bfcl.utils import ( + check_api_key_supplied, is_executable, is_multi_turn, + parse_test_category_argument, sort_key, ) from tqdm import tqdm @@ -58,24 +59,10 @@ def build_handler(model_name, temperature): return handler -def parse_test_category_argument(test_category_args): - test_name_total = set() - test_filename_total = set() - - for test_category in test_category_args: - if test_category in TEST_COLLECTION_MAPPING: - for test_name in TEST_COLLECTION_MAPPING[test_category]: - test_name_total.add(test_name) - test_filename_total.add(TEST_FILE_MAPPING[test_name]) - else: - test_name_total.add(test_category) - test_filename_total.add(TEST_FILE_MAPPING[test_category]) - - return sorted(list(test_name_total)), sorted(list(test_filename_total)) - - def get_involved_test_entries(test_category_args, run_ids): all_test_file_paths, all_test_categories, all_test_entries_involved = [], [], [] + api_key_supplied = check_api_key_supplied() + skipped_categories = [] if run_ids: with open(TEST_IDS_TO_GENERATE_PATH) as f: test_ids_to_generate = json.load(f) @@ -84,17 +71,38 @@ def get_involved_test_entries(test_category_args, run_ids): continue test_file_path = TEST_FILE_MAPPING[category] all_test_entries_involved.extend( - [entry for entry in load_file(PROMPT_PATH / test_file_path) if entry["id"] in test_ids] + [ + entry + for entry in load_file(PROMPT_PATH / test_file_path) + if entry["id"] in test_ids + ] ) - all_test_categories.append(category) - all_test_file_paths.append(test_file_path) + # Skip executable test category if api key is not provided in the .env file + if is_executable(category) and not api_key_supplied: + skipped_categories.append(category) + else: + all_test_categories.append(category) + all_test_file_paths.append(test_file_path) else: - all_test_categories, all_test_file_paths = parse_test_category_argument(test_category_args) - for test_category, file_to_open in zip(all_test_categories, all_test_file_paths): - all_test_entries_involved.extend(load_file(PROMPT_PATH / file_to_open)) + all_test_file_paths, all_test_categories = parse_test_category_argument(test_category_args) + # Make a copy here since we are removing list elemenets inside the for loop + for test_category, file_to_open in zip( + all_test_categories[:], all_test_file_paths[:] + ): + if is_executable(test_category) and not api_key_supplied: + all_test_categories.remove(test_category) + all_test_file_paths.remove(file_to_open) + skipped_categories.append(test_category) + else: + all_test_entries_involved.extend(load_file(PROMPT_PATH / file_to_open)) - return all_test_file_paths, all_test_categories, 
all_test_entries_involved + return ( + all_test_file_paths, + all_test_categories, + all_test_entries_involved, + skipped_categories, + ) def collect_test_cases( @@ -263,9 +271,12 @@ def main(args): if type(args.test_category) is not list: args.test_category = [args.test_category] - all_test_file_paths, all_test_categories, all_test_entries_involved = ( - get_involved_test_entries(args.test_category, args.run_ids) - ) + ( + all_test_file_paths, + all_test_categories, + all_test_entries_involved, + skipped_categories, + ) = get_involved_test_entries(args.test_category, args.run_ids) print(f"Generating results for {args.model}") if args.run_ids: @@ -273,7 +284,15 @@ def main(args): else: print(f"Running full test cases for categories: {all_test_categories}.") + if len(skipped_categories) > 0: + print("----------") + print( + f"❗️ Note: The following executable test category entries will be skipped because they require API Keys to be provided in the .env file: {skipped_categories}.\n Please refer to the README.md 'API Keys for Executable Test Categories' section for details.\n The model response for other categories will still be generated." + ) + print("----------") + # Apply function credential config if any of the test categories are executable + # We can know for sure that any executable categories will not be included if the API Keys are not supplied. if any([is_executable(category) for category in all_test_categories]): apply_function_credential_config(input_path=PROMPT_PATH) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py index 8c2bed57d..d328a8a5a 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -481,7 +481,7 @@ def runner(model_names, test_categories, api_sanity_check, result_dir, score_dir # Find and process all JSON files in the subdirectory for model_result_json in subdir.glob("*.json"): test_category = extract_test_category(model_result_json) - if test_categories is not None and test_category not in test_categories: + if test_category not in test_categories: continue handler = get_handler(model_name_escaped) @@ -614,15 +614,8 @@ def runner(model_names, test_categories, api_sanity_check, result_dir, score_dir API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=False ) - print( - f"🏁 Evaluation completed. See {score_dir / 'data_overall.csv'} for overall evaluation results on BFCL V3." - ) - print( - f"See {score_dir / 'data_live.csv'}, {score_dir / 'data_non_live.csv'} and {score_dir / 'data_multi_turn.csv'} for detailed evaluation results on each sub-section categories respectively." 
- ) - -def main(model, test_category, api_sanity_check, result_dir, score_dir): +def main(model, test_categories, api_sanity_check, result_dir, score_dir): if result_dir is None: result_dir = RESULT_PATH else: @@ -633,14 +626,23 @@ def main(model, test_category, api_sanity_check, result_dir, score_dir): else: result_dir = (PROJECT_ROOT / score_dir).resolve() - test_categories = None - if test_category is not None: - test_categories = [] - for category in test_category: - if category in TEST_COLLECTION_MAPPING: - test_categories.extend(TEST_COLLECTION_MAPPING[category]) + if type(test_categories) is not list: + test_categories = [test_categories] + + _, all_test_categories = parse_test_category_argument(test_categories) + + api_key_supplied = check_api_key_supplied() + skipped_categories = [] + + for test_category in all_test_categories[:]: + # Skip executable test category evaluation if API keys are not provided in the .env file + if is_executable(test_category) and not api_key_supplied: + # We can still run the REST category, since the API keys are baked into the model response. So as long as the model response is generated, we can evaluate. + if is_rest(test_category): + continue else: - test_categories.append(category) + all_test_categories.remove(test_category) + skipped_categories.append(test_category) model_names = None if model is not None: @@ -651,7 +653,22 @@ # We patch it here to avoid confusing the user. model_names.append(model_name.replace("/", "_")) - runner(model_names, test_categories, api_sanity_check, result_dir, score_dir) + # Driver function to run the evaluation for all categories involved. + runner(model_names, all_test_categories, api_sanity_check, result_dir, score_dir) + + if len(skipped_categories) > 0: + print("----------") + print( + f"❗️ Note: The following executable test categories were not evaluated because they require API Keys to be provided in the .env file: {skipped_categories}.\n Please refer to the README.md 'API Keys for Executable Test Categories' section for details.\n Model responses for the other categories are still evaluated." + ) + print("----------") + + print( + f"🏁 Evaluation completed. See {score_dir / 'data_overall.csv'} for overall evaluation results on BFCL V3." + ) + print( + f"See {score_dir / 'data_live.csv'}, {score_dir / 'data_non_live.csv'} and {score_dir / 'data_multi_turn.csv'} for detailed evaluation results on the live, non-live and multi-turn sub-sections respectively." + ) def get_handler(model_name): @@ -671,6 +688,7 @@ def get_handler(model_name): "--test-category", nargs="+", type=str, + default="all", help="A list of test categories to run the evaluation on", ) parser.add_argument( diff --git a/berkeley-function-call-leaderboard/bfcl/utils.py b/berkeley-function-call-leaderboard/bfcl/utils.py index c85c518ef..a1fb4ccb4 100644 --- a/berkeley-function-call-leaderboard/bfcl/utils.py +++ b/berkeley-function-call-leaderboard/bfcl/utils.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Union -from bfcl.constant import VERSION_PREFIX +from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING, VERSION_PREFIX def extract_test_category(input_string: Union[str, Path]) -> str: @@ -178,3 +178,33 @@ def is_empty_output(decoded_output): if len(decoded_output) == 1 and len(decoded_output[0]) == 0: return True return False + + +def check_api_key_supplied() -> bool: + """ + This function checks if the four API Keys needed for the executable categories are provided.
If not, those categories will be skipped. + """ + ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY") + for var in ENV_VARS: + if os.getenv(var) == "": + return False + return True + + +def parse_test_category_argument(test_category_args): + test_name_total = set() + test_filename_total = set() + + for test_category in test_category_args: + if test_category in TEST_COLLECTION_MAPPING: + for test_name in TEST_COLLECTION_MAPPING[test_category]: + test_name_total.add(test_name) + test_filename_total.add(TEST_FILE_MAPPING[test_name]) + elif test_category in TEST_FILE_MAPPING: + test_name_total.add(test_category) + test_filename_total.add(TEST_FILE_MAPPING[test_category]) + else: + # Invalid test category name + raise Exception(f"Invalid test category name provided: {test_category}") + + return sorted(list(test_filename_total)), sorted(list(test_name_total))
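Reviewer note on the new check_api_key_supplied() helper in bfcl/utils.py: os.getenv() returns None when a variable is not set at all (for example, when no .env file has been loaded) and "" when the variable is present but left blank, so the == "" comparison above only catches the second case. Below is a minimal sketch of a stricter variant that treats both cases as "not supplied"; it is a suggestion for discussion rather than part of this patch, and it reuses the same four key names as the ENV_VARS tuple introduced above.

import os

# The same four keys the executable test categories require, mirroring the patch.
ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY")


def check_api_key_supplied() -> bool:
    # os.getenv(var) is None when the variable is unset and "" when it is set but
    # left blank; both cases mean the key was not supplied, so the executable
    # categories should be skipped.
    return all(os.getenv(var) for var in ENV_VARS)

Either variant plugs into get_involved_test_entries() and eval_runner.main() unchanged: whenever it returns False, the executable categories (other than REST during evaluation) are skipped and reported via skipped_categories.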