[BFCL] Skip Executable Categories When API Keys Missing #848

Merged: 6 commits, Dec 21, 2024
Changes from all commits
1 change: 1 addition & 0 deletions berkeley-function-call-leaderboard/CHANGELOG.md
@@ -2,6 +2,7 @@

All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.

- [Dec 21, 2024] [#848](https://github.com/ShishirPatil/gorilla/pull/848): Improves the behavior of the generation and evaluation pipelines. When executable categories are involved and API keys are not provided in the `.env` file, the affected categories are now skipped instead of throwing an error. This provides a smoother experience for first-time users.
- [Dec 21, 2024] [#847](https://github.com/ShishirPatil/gorilla/pull/847): Add new model `watt-ai/watt-tool-8B` and `watt-ai/watt-tool-70B` to the leaderboard.
- [Dec 20, 2024] [#842](https://github.com/ShishirPatil/gorilla/pull/842): Add the following new models to the leaderboard:
- `Qwen/Qwen2.5-0.5B-Instruct`
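For context on the changelog entry above: the executable test categories read their credentials from a `.env` file at the repository root. A minimal sketch of the four keys involved is shown below; the key names match the `check_api_key_supplied` helper this PR adds to `bfcl/utils.py`, while the values are placeholders (see the README.md 'API Keys for Executable Test Categories' section for the real setup):

    GEOCODE_API_KEY=your-geocode-key
    RAPID_API_KEY=your-rapidapi-key
    OMDB_API_KEY=your-omdb-key
    EXCHANGERATE_API_KEY=your-exchangerate-key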
3 changes: 1 addition & 2 deletions berkeley-function-call-leaderboard/SUPPORTED_MODELS.md
@@ -89,8 +89,7 @@ Below is a comprehensive table of models supported for running leaderboard evalu
|openbmb/MiniCPM3-4B 💻| Prompt|
|THUDM/glm-4-9b-chat 💻| Function Calling|
|Team-ACE/ToolACE-8B 💻| Function Calling|
|watt-ai/watt-tool-8B 💻| Function Calling|
|watt-ai/watt-tool-70B 💻| Function Calling|
|watt-ai/watt-tool-{8B,70B} 💻| Function Calling|

---

4 changes: 2 additions & 2 deletions berkeley-function-call-leaderboard/bfcl/__main__.py
@@ -88,7 +88,7 @@ def models():
def generate(
model: List[str] = typer.Option(
["gorilla-openfunctions-v2"],
help="A list of model names to evaluate. Use commas to separate multiple models.",
help="A list of model names to generate the llm response. Use commas to separate multiple models.",
callback=handle_multiple_input
),
test_category: List[str] = typer.Option(
@@ -216,7 +216,7 @@ def evaluate(
callback=handle_multiple_input
),
test_category: List[str] = typer.Option(
None,
["all"],
help="A list of test categories to run the evaluation on.",
callback=handle_multiple_input
),
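The `["all"]` default above works together with the `handle_multiple_input` callback, which lets users pass several categories in one flag. A rough sketch of what such a callback does (an assumption about `handle_multiple_input`'s behavior, not its verbatim source):

    def handle_multiple_input(values):
        # Typer collects repeated options into a list; each element may itself
        # be comma-separated, e.g. ["simple,parallel"] -> ["simple", "parallel"].
        return [piece for value in values for piece in value.split(",") if piece]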
73 changes: 46 additions & 27 deletions berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py
@@ -12,16 +12,17 @@
PROJECT_ROOT,
PROMPT_PATH,
RESULT_PATH,
TEST_COLLECTION_MAPPING,
TEST_FILE_MAPPING,
TEST_IDS_TO_GENERATE_PATH,
)
from bfcl.eval_checker.eval_runner_helper import load_file
from bfcl.model_handler.handler_map import HANDLER_MAP
from bfcl.model_handler.model_style import ModelStyle
from bfcl.utils import (
check_api_key_supplied,
is_executable,
is_multi_turn,
parse_test_category_argument,
sort_key,
)
from tqdm import tqdm
@@ -58,24 +58,10 @@ def build_handler(model_name, temperature):
return handler


def parse_test_category_argument(test_category_args):
test_name_total = set()
test_filename_total = set()

for test_category in test_category_args:
if test_category in TEST_COLLECTION_MAPPING:
for test_name in TEST_COLLECTION_MAPPING[test_category]:
test_name_total.add(test_name)
test_filename_total.add(TEST_FILE_MAPPING[test_name])
else:
test_name_total.add(test_category)
test_filename_total.add(TEST_FILE_MAPPING[test_category])

return sorted(list(test_name_total)), sorted(list(test_filename_total))


def get_involved_test_entries(test_category_args, run_ids):
all_test_file_paths, all_test_categories, all_test_entries_involved = [], [], []
api_key_supplied = check_api_key_supplied()
skipped_categories = []
if run_ids:
with open(TEST_IDS_TO_GENERATE_PATH) as f:
test_ids_to_generate = json.load(f)
@@ -84,17 +71,38 @@ def get_involved_test_entries(test_category_args, run_ids):
continue
test_file_path = TEST_FILE_MAPPING[category]
all_test_entries_involved.extend(
[entry for entry in load_file(PROMPT_PATH / test_file_path) if entry["id"] in test_ids]
[
entry
for entry in load_file(PROMPT_PATH / test_file_path)
if entry["id"] in test_ids
]
)
all_test_categories.append(category)
all_test_file_paths.append(test_file_path)
# Skip executable test category if api key is not provided in the .env file
if is_executable(category) and not api_key_supplied:
skipped_categories.append(category)
else:
all_test_categories.append(category)
all_test_file_paths.append(test_file_path)

else:
all_test_categories, all_test_file_paths = parse_test_category_argument(test_category_args)
for test_category, file_to_open in zip(all_test_categories, all_test_file_paths):
all_test_entries_involved.extend(load_file(PROMPT_PATH / file_to_open))
all_test_file_paths, all_test_categories = parse_test_category_argument(test_category_args)
# Make a copy here since we are removing list elements inside the for loop
for test_category, file_to_open in zip(
all_test_categories[:], all_test_file_paths[:]
):
if is_executable(test_category) and not api_key_supplied:
all_test_categories.remove(test_category)
all_test_file_paths.remove(file_to_open)
skipped_categories.append(test_category)
else:
all_test_entries_involved.extend(load_file(PROMPT_PATH / file_to_open))

return all_test_file_paths, all_test_categories, all_test_entries_involved
return (
all_test_file_paths,
all_test_categories,
all_test_entries_involved,
skipped_categories,
)


def collect_test_cases(
@@ -263,17 +271,28 @@ def main(args):
if type(args.test_category) is not list:
args.test_category = [args.test_category]

all_test_file_paths, all_test_categories, all_test_entries_involved = (
get_involved_test_entries(args.test_category, args.run_ids)
)
(
all_test_file_paths,
all_test_categories,
all_test_entries_involved,
skipped_categories,
) = get_involved_test_entries(args.test_category, args.run_ids)

print(f"Generating results for {args.model}")
if args.run_ids:
print("Running specific test cases. Ignoring `--test-category` argument.")
else:
print(f"Running full test cases for categories: {all_test_categories}.")

if len(skipped_categories) > 0:
print("----------")
print(
f"❗️ Note: The following executable test category entries will be skipped because they require API Keys to be provided in the .env file: {skipped_categories}.\n Please refer to the README.md 'API Keys for Executable Test Categories' section for details.\n The model response for other categories will still be generated."
)
print("----------")

# Apply function credential config if any of the test categories are executable
# We can know for sure that any executable categories will not be included if the API Keys are not supplied.
if any([is_executable(category) for category in all_test_categories]):
apply_function_credential_config(input_path=PROMPT_PATH)

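A side note on the `all_test_categories[:]` slice in `get_involved_test_entries` above: removing elements from a list while iterating over it makes Python skip neighbors, so the loop walks a snapshot copy instead. A minimal standalone illustration (the category names here are hypothetical):

    categories = ["exec_simple", "exec_parallel", "simple"]
    for category in categories[:]:  # iterate over a snapshot copy
        if category.startswith("exec"):
            categories.remove(category)  # safely mutate the original list
    print(categories)  # -> ['simple']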
52 changes: 35 additions & 17 deletions berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py
@@ -481,7 +481,7 @@ def runner(model_names, test_categories, api_sanity_check, result_dir, score_dir
# Find and process all JSON files in the subdirectory
for model_result_json in subdir.glob("*.json"):
test_category = extract_test_category(model_result_json)
if test_categories is not None and test_category not in test_categories:
if test_category not in test_categories:
continue

handler = get_handler(model_name_escaped)
@@ -614,15 +614,8 @@ def runner(model_names, test_categories, api_sanity_check, result_dir, score_dir
API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=False
)

print(
f"🏁 Evaluation completed. See {score_dir / 'data_overall.csv'} for overall evaluation results on BFCL V3."
)
print(
f"See {score_dir / 'data_live.csv'}, {score_dir / 'data_non_live.csv'} and {score_dir / 'data_multi_turn.csv'} for detailed evaluation results on each sub-section categories respectively."
)


def main(model, test_category, api_sanity_check, result_dir, score_dir):
def main(model, test_categories, api_sanity_check, result_dir, score_dir):
if result_dir is None:
result_dir = RESULT_PATH
else:
@@ -633,14 +626,23 @@ def main(model, test_category, api_sanity_check, result_dir, score_dir):
else:
result_dir = (PROJECT_ROOT / score_dir).resolve()

test_categories = None
if test_category is not None:
test_categories = []
for category in test_category:
if category in TEST_COLLECTION_MAPPING:
test_categories.extend(TEST_COLLECTION_MAPPING[category])
if type(test_categories) is not list:
test_categories = [test_categories]

_, all_test_categories = parse_test_category_argument(test_categories)

api_key_supplied = check_api_key_supplied()
skipped_categories = []

for test_category in all_test_categories[:]:
# Skip executable test category evaluation if api key is not provided in the .env file
if is_executable(test_category) and not api_key_supplied:
# We can still evaluate the REST category, since its API keys are baked into the stored model responses. So as long as the model responses have been generated, we can evaluate.
if is_rest(test_category):
continue
else:
test_categories.append(category)
all_test_categories.remove(test_category)
skipped_categories.append(test_category)

model_names = None
if model is not None:
@@ -651,7 +653,22 @@
# We patch it here to avoid confusing the user.
model_names.append(model_name.replace("/", "_"))

runner(model_names, test_categories, api_sanity_check, result_dir, score_dir)
# Driver function to run the evaluation for all categories involved.
runner(model_names, all_test_categories, api_sanity_check, result_dir, score_dir)

if len(skipped_categories) > 0:
print("----------")
print(
f"❗️ Note: The following executable test category are not evaluated because they require API Keys to be provided in the .env file: {skipped_categories}.\n Please refer to the README.md 'API Keys for Executable Test Categories' section for details.\n The model response for other categories are evaluated."
)
print("----------")

print(
f"🏁 Evaluation completed. See {score_dir / 'data_overall.csv'} for overall evaluation results on BFCL V3."
)
print(
f"See {score_dir / 'data_live.csv'}, {score_dir / 'data_non_live.csv'} and {score_dir / 'data_multi_turn.csv'} for detailed evaluation results on each sub-section categories respectively."
)


def get_handler(model_name):
@@ -671,6 +688,7 @@
"--test-category",
nargs="+",
type=str,
default="all",
help="A list of test categories to run the evaluation on",
)
parser.add_argument(
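The skip loop in `main` above encodes a small decision table, with REST as the special case: its API keys are baked into the stored model responses, so it can still be scored without keys in `.env`. A hedged summary sketch of that logic (not the PR's literal code; `is_executable` and `is_rest` are the helpers from `bfcl.utils`):

    from bfcl.utils import is_executable, is_rest

    def should_evaluate(category: str, api_key_supplied: bool) -> bool:
        # Non-executable categories never need live API keys.
        if not is_executable(category):
            return True
        # Executable categories need keys, except REST, whose keys are
        # already baked into the stored model responses.
        return api_key_supplied or is_rest(category)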
32 changes: 31 additions & 1 deletion berkeley-function-call-leaderboard/bfcl/utils.py
@@ -4,7 +4,7 @@
from pathlib import Path
from typing import Union

from bfcl.constant import VERSION_PREFIX
from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING, VERSION_PREFIX


def extract_test_category(input_string: Union[str, Path]) -> str:
@@ -178,3 +178,33 @@ def is_empty_output(decoded_output):
if len(decoded_output) == 1 and len(decoded_output[0]) == 0:
return True
return False


def check_api_key_supplied() -> bool:
"""
This function checks whether the four API keys needed for the executable categories are provided. If not, those categories will be skipped.
"""
ENV_VARS = ("GEOCODE_API_KEY", "RAPID_API_KEY", "OMDB_API_KEY", "EXCHANGERATE_API_KEY")
for var in ENV_VARS:
if not os.getenv(var):  # treat unset (None) and empty values as missing
return False
return True


def parse_test_category_argument(test_category_args):
test_name_total = set()
test_filename_total = set()

for test_category in test_category_args:
if test_category in TEST_COLLECTION_MAPPING:
for test_name in TEST_COLLECTION_MAPPING[test_category]:
test_name_total.add(test_name)
test_filename_total.add(TEST_FILE_MAPPING[test_name])
elif test_category in TEST_FILE_MAPPING:
test_name_total.add(test_category)
test_filename_total.add(TEST_FILE_MAPPING[test_category])
else:
# Invalid test category name
raise Exception(f"Invalid test category name provided: {test_category}")

return sorted(list(test_filename_total)), sorted(list(test_name_total))
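Taken together, a short usage sketch of the two helpers now living in `bfcl/utils.py` (`"all"` is the collection name used as the CLI default above; the printed message is illustrative, not the pipeline's exact wording):

    from bfcl.utils import check_api_key_supplied, parse_test_category_argument

    file_paths, category_names = parse_test_category_argument(["all"])
    # file_paths     -> sorted prompt-file names (from TEST_FILE_MAPPING)
    # category_names -> sorted concrete category names

    if not check_api_key_supplied():
        print("Executable categories will be skipped; see the README.md "
              "'API Keys for Executable Test Categories' section.")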