[BFCL] Skip Executable Categories When API Keys Missing #848

Merged: 6 commits, Dec 21, 2024
Changes from all commits
1 change: 1 addition & 0 deletions berkeley-function-call-leaderboard/CHANGELOG.md
@@ -2,6 +2,7 @@

All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.

- [Dec 21, 2024] [#848](https://github.com/ShishirPatil/gorilla/pull/848): Improves the behavior of the generation and evaluation pipelines. When executable categories are involved and API keys are not provided in the `.env` file, the affected categories are now skipped instead of throwing an error. This provides a smoother experience for first-time users.
- [Dec 21, 2024] [#847](https://github.com/ShishirPatil/gorilla/pull/847): Add new model `watt-ai/watt-tool-8B` and `watt-ai/watt-tool-70B` to the leaderboard.
- [Dec 20, 2024] [#842](https://github.com/ShishirPatil/gorilla/pull/842): Add the following new models to the leaderboard:
- `Qwen/Qwen2.5-0.5B-Instruct`
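For context on the changelog entry above: the executable test categories read their credentials from a `.env` file at the repository root. A minimal sketch of the four keys involved is shown below; the key names match the `check_api_key_supplied` helper this PR adds to `bfcl/utils.py`, while the values are placeholders (see the README.md 'API Keys for Executable Test Categories' section for the real setup):

    GEOCODE_API_KEY=your-geocode-key
    RAPID_API_KEY=your-rapidapi-key
    OMDB_API_KEY=your-omdb-key
    EXCHANGERATE_API_KEY=your-exchangerate-key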
3 changes: 1 addition & 2 deletions berkeley-function-call-leaderboard/SUPPORTED_MODELS.md
@@ -89,8 +89,7 @@ Below is a comprehensive table of models supported for running leaderboard evalu
|openbmb/MiniCPM3-4B 💻| Prompt|
|THUDM/glm-4-9b-chat 💻| Function Calling|
|Team-ACE/ToolACE-8B 💻| Function Calling|
|watt-ai/watt-tool-8B 💻| Function Calling|
|watt-ai/watt-tool-70B 💻| Function Calling|
|watt-ai/watt-tool-{8B,70B} 💻| Function Calling|

---

4 changes: 2 additions & 2 deletions berkeley-function-call-leaderboard/bfcl/__main__.py
@@ -88,7 +88,7 @@ def models():
def generate(
model: List[str] = typer.Option(
["gorilla-openfunctions-v2"],
help="A list of model names to evaluate. Use commas to separate multiple models.",
help="A list of model names to generate the llm response. Use commas to separate multiple models.",
callback=handle_multiple_input
),
test_category: List[str] = typer.Option(
@@ -216,7 +216,7 @@ def evaluate(
callback=handle_multiple_input
),
test_category: List[str] = typer.Option(
None,
["all"],
help="A list of test categories to run the evaluation on.",
callback=handle_multiple_input
),
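The `["all"]` default above works together with the `handle_multiple_input` callback, which lets users pass several categories in one flag. A rough sketch of what such a callback does (an assumption about `handle_multiple_input`'s behavior, not its verbatim source):

    def handle_multiple_input(values):
        # Typer collects repeated options into a list; each element may itself
        # be comma-separated, e.g. ["simple,parallel"] -> ["simple", "parallel"].
        return [piece for value in values for piece in value.split(",") if piece]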
73 changes: 46 additions & 27 deletions berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py
@@ -12,16 +12,17 @@
PROJECT_ROOT,
PROMPT_PATH,
RESULT_PATH,
TEST_COLLECTION_MAPPING,
TEST_FILE_MAPPING,
TEST_IDS_TO_GENERATE_PATH,
)
from bfcl.eval_checker.eval_runner_helper import load_file
from bfcl.model_handler.handler_map import HANDLER_MAP
from bfcl.model_handler.model_style import ModelStyle
from bfcl.utils import (
check_api_key_supplied,
is_executable,
is_multi_turn,
parse_test_category_argument,
sort_key,
)
from tqdm import tqdm
@@ -58,24 +58,10 @@ def build_handler(model_name, temperature):
return handler


def parse_test_category_argument(test_category_args):
test_name_total = set()
test_filename_total = set()

for test_category in test_category_args:
if test_category in TEST_COLLECTION_MAPPING:
for test_name in TEST_COLLECTION_MAPPING[test_category]:
test_name_total.add(test_name)
test_filename_total.add(TEST_FILE_MAPPING[test_name])
else:
test_name_total.add(test_category)
test_filename_total.add(TEST_FILE_MAPPING[test_category])

return sorted(list(test_name_total)), sorted(list(test_filename_total))


def get_involved_test_entries(test_category_args, run_ids):
all_test_file_paths, all_test_categories, all_test_entries_involved = [], [], []
api_key_supplied = check_api_key_supplied()
skipped_categories = []
if run_ids:
with open(TEST_IDS_TO_GENERATE_PATH) as f:
test_ids_to_generate = json.load(f)
@@ -84,17 +71,38 @@ def get_involved_test_entries(test_category_args, run_ids):
continue
test_file_path = TEST_FILE_MAPPING[category]
all_test_entries_involved.extend(
[entry for entry in load_file(PROMPT_PATH / test_file_path) if entry["id"] in test_ids]
[
entry
for entry in load_file(PROMPT_PATH / test_file_path)
if entry["id"] in test_ids
]
)
all_test_categories.append(category)
all_test_file_paths.append(test_file_path)
# Skip executable test category if api key is not provided in the .env file
if is_executable(category) and not api_key_supplied:
skipped_categories.append(category)
else:
all_test_categories.append(category)
all_test_file_paths.append(test_file_path)

else:
all_test_categories, all_test_file_paths = parse_test_category_argument(test_category_args)
for test_category, file_to_open in zip(all_test_categories, all_test_file_paths):
all_test_entries_involved.extend(load_file(PROMPT_PATH / file_to_open))
all_test_file_paths, all_test_categories = parse_test_category_argument(test_category_args)
# Make a copy here since we are removing list elements inside the for loop
for test_category, file_to_open in zip(
all_test_categories[:], all_test_file_paths[:]
):
if is_executable(test_category) and not api_key_supplied:
all_test_categories.remove(test_category)
all_test_file_paths.remove(file_to_open)
skipped_categories.append(test_category)
else:
all_test_entries_involved.extend(load_file(PROMPT_PATH / file_to_open))

return all_test_file_paths, all_test_categories, all_test_entries_involved
return (
all_test_file_paths,
all_test_categories,
all_test_entries_involved,
skipped_categories,
)


def collect_test_cases(
@@ -263,17 +271,28 @@ def main(args):
if type(args.test_category) is not list:
args.test_category = [args.test_category]

all_test_file_paths, all_test_categories, all_test_entries_involved = (
get_involved_test_entries(args.test_category, args.run_ids)
)
(
all_test_file_paths,
all_test_categories,
all_test_entries_involved,
skipped_categories,
) = get_involved_test_entries(args.test_category, args.run_ids)

print(f"Generating results for {args.model}")
if args.run_ids:
print("Running specific test cases. Ignoring `--test-category` argument.")
else:
print(f"Running full test cases for categories: {all_test_categories}.")

if len(skipped_categories) > 0:
print("----------")
print(
f"❗️ Note: The following executable test category entries will be skipped because they require API Keys to be provided in the .env file: {skipped_categories}.\n Please refer to the README.md 'API Keys for Executable Test Categories' section for details.\n The model response for other categories will still be generated."
)
print("----------")

# Apply function credential config if any of the test categories are executable
# We can know for sure that any executable categories will not be included if the API Keys are not supplied.
if any([is_executable(category) for category in all_test_categories]):
apply_function_credential_config(input_path=PROMPT_PATH)

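A side note on the `all_test_categories[:]` slice in `get_involved_test_entries` above: removing elements from a list while iterating over it makes Python skip neighbors, so the loop walks a snapshot copy instead. A minimal standalone illustration (the category names here are hypothetical):

    categories = ["exec_simple", "exec_parallel", "simple"]
    for category in categories[:]:  # iterate over a snapshot copy
        if category.startswith("exec"):
            categories.remove(category)  # safely mutate the original list
    print(categories)  # -> ['simple']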
52 changes: 35 additions & 17 deletions berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py
@@ -481,7 +481,7 @@ def runner(model_names, test_categories, api_sanity_check, result_dir, score_dir
# Find and process all JSON files in the subdirectory
for model_result_json in subdir.glob("*.json"):
test_category = extract_test_category(model_result_json)
if test_categories is not None and test_category not in test_categories:
if test_category not in test_categories:
continue

handler = get_handler(model_name_escaped)
@@ -614,15 +614,8 @@ def runner(model_names, test_categories, api_sanity_check, result_dir, score_dir
API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=False
)

print(
f"🏁 Evaluation completed. See {score_dir / 'data_overall.csv'} for overall evaluation results on BFCL V3."
)
print(
f"See {score_dir / 'data_live.csv'}, {score_dir / 'data_non_live.csv'} and {score_dir / 'data_multi_turn.csv'} for detailed evaluation results on each sub-section categories respectively."
)


def main(model, test_category, api_sanity_check, result_dir, score_dir):
def main(model, test_categories, api_sanity_check, result_dir, score_dir):
if result_dir is None:
result_dir = RESULT_PATH
else:
@@ -633,14 +626,23 @@ def main(model, test_category, api_sanity_check, result_dir, score_dir):
else:
result_dir = (PROJECT_ROOT / score_dir).resolve()

test_categories = None
if test_category is not None:
test_categories = []
for category in test_category:
if category in TEST_COLLECTION_MAPPING:
test_categories.extend(TEST_COLLECTION_MAPPING[category])
if type(test_categories) is not list:
test_categories = [test_categories]

_, all_test_categories = parse_test_category_argument(test_categories)

api_key_supplied = check_api_key_supplied()
skipped_categories = []

for test_category in all_test_categories[:]:
# Skip executable test category evaluation if api key is not provided in the .env file
if is_executable(test_category) and not api_key_supplied:
# We can still evaluate the REST category, since its API keys are baked into the stored model responses. So as long as the model responses have been generated, we can evaluate.
if is_rest(test_category):
continue
else:
test_categories.append(category)
all_test_categories.remove(test_category)
skipped_categories.append(test_category)

model_names = None
if model is not None:
@@ -651,7 +653,22 @@
# We patch it here to avoid confusing the user.
model_names.append(model_name.replace("/", "_"))

runner(model_names, test_categories, api_sanity_check, result_dir, score_dir)
# Driver function to run the evaluation for all categories involved.
runner(model_names, all_test_categories, api_sanity_check, result_dir, score_dir)

if len(skipped_categories) > 0:
print("----------")
print(
f"❗️ Note: The following executable test category are not evaluated because they require API Keys to be provided in the .env file: {skipped_categories}.\n Please refer to the README.md 'API Keys for Executable Test Categories' section for details.\n The model response for other categories are evaluated."
)
print("----------")

print(
f"🏁 Evaluation completed. See {score_dir / 'data_overall.csv'} for overall evaluation results on BFCL V3."
)
print(
f"See {score_dir / 'data_live.csv'}, {score_dir / 'data_non_live.csv'} and {score_dir / 'data_multi_turn.csv'} for detailed evaluation results on each sub-section categories respectively."
)


def get_handler(model_name):
@@ -671,6 +688,7 @@
"--test-category",
nargs="+",
type=str,
default="all",
help="A list of test categories to run the evaluation on",
)
parser.add_argument(
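The skip loop in `main` above encodes a small decision table, with REST as the special case: its API keys are baked into the stored model responses, so it can still be scored without keys in `.env`. A hedged summary sketch of that logic (not the PR's literal code; `is_executable` and `is_rest` are the helpers from `bfcl.utils`):

    from bfcl.utils import is_executable, is_rest

    def should_evaluate(category: str, api_key_supplied: bool) -> bool:
        # Non-executable categories never need live API keys.
        if not is_executable(category):
            return True
        # Executable categories need keys, except REST, whose keys are
        # already baked into the stored model responses.
        return api_key_supplied or is_rest(category)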
32 changes: 31 additions & 1 deletion berkeley-function-call-leaderboard/bfcl/utils.py
@@ -4,7 +4,7 @@
from pathlib import Path
from typing import Union

from bfcl.constant import VERSION_PREFIX
from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING, VERSION_PREFIX


def extract_test_category(input_string: Union[str, Path]) -> str:
@@ -178,3 +178,33 @@ def is_empty_output(decoded_output):
if len(decoded_output) == 1 and len(decoded_output[0]) == 0:
return True
return False


def check_api_key_supplied() -> bool:
"""
This function checks whether the four API keys needed for the executable categories are provided. If not, those categories will be skipped.
"""
ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY")
for var in ENV_VARS:
if not os.getenv(var):  # treat unset (None) and empty values as missing
return False
return True


def parse_test_category_argument(test_category_args):
test_name_total = set()
test_filename_total = set()

for test_category in test_category_args:
if test_category in TEST_COLLECTION_MAPPING:
for test_name in TEST_COLLECTION_MAPPING[test_category]:
test_name_total.add(test_name)
test_filename_total.add(TEST_FILE_MAPPING[test_name])
elif test_category in TEST_FILE_MAPPING:
test_name_total.add(test_category)
test_filename_total.add(TEST_FILE_MAPPING[test_category])
else:
# Invalid test category name
raise Exception(f"Invalid test category name provided: {test_category}")

return sorted(list(test_filename_total)), sorted(list(test_name_total))
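Taken together, a short usage sketch of the two helpers now living in `bfcl/utils.py` (`"all"` is the collection name used as the CLI default above; the printed message is illustrative, not the pipeline's exact wording):

    from bfcl.utils import check_api_key_supplied, parse_test_category_argument

    file_paths, category_names = parse_test_category_argument(["all"])
    # file_paths     -> sorted prompt-file names (from TEST_FILE_MAPPING)
    # category_names -> sorted concrete category names

    if not check_api_key_supplied():
        print("Executable categories will be skipped; see the README.md "
              "'API Keys for Executable Test Categories' section.")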