diff --git a/data_live.csv b/data_live.csv index 311b78195..057d78acc 100644 --- a/data_live.csv +++ b/data_live.csv @@ -1,81 +1,90 @@ Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Python Parallel AST,Python Parallel Multiple AST,Irrelevance Detection,Relevance Detection -1,GPT-4o-2024-08-06 (Prompt),80.84%,74.02%,78.68%,72.46%,100.00%,75.00%,91.84%,52.94% -2,GPT-4-turbo-2024-04-09 (FC),79.56%,78.09%,81.01%,77.59%,81.25%,66.67%,81.97%,70.59% -3,GPT-4o-2024-08-06 (FC),79.29%,76.02%,76.36%,76.07%,81.25%,66.67%,84.47%,70.59% -4,Claude-3-Opus-20240229 (FC),79.24%,76.31%,79.84%,77.40%,18.75%,29.17%,83.79%,76.47% -5,ToolACE-8B (FC),78.37%,75.72%,72.48%,76.54%,81.25%,70.83%,82.43%,77.78% -6,Gemini-1.5-Flash-002 (FC),77.96%,70.91%,71.71%,70.47%,81.25%,75.00%,89.12%,58.82% -7,Claude-3.5-Sonnet-20241022 (FC),77.96%,79.50%,82.17%,81.10%,31.25%,12.50%,75.74%,70.59% -8,o1-mini-2024-09-12 (Prompt),77.73%,71.87%,72.87%,71.70%,75.00%,66.67%,87.07%,58.82% -9,Mistral-Medium-2312 (Prompt),77.20%,73.50%,74.03%,73.69%,81.25%,54.17%,83.11%,64.71% -10,GPT-4o-mini-2024-07-18 (Prompt),77.20%,77.42%,79.84%,76.73%,93.75%,70.83%,76.76%,82.35% -11,palmyra-x-004 (FC),77.16%,74.69%,75.19%,75.21%,50.00%,62.50%,81.07%,70.59% -12,Gemini-1.5-Pro-002 (Prompt),76.76%,78.61%,81.01%,77.97%,93.75%,70.83%,73.92%,76.47% -13,Gemini-1.5-Pro-001 (Prompt),76.49%,72.98%,75.58%,71.98%,93.75%,75.00%,82.31%,52.94% -14,Functionary-Medium-v3.1 (FC),76.45%,82.16%,81.78%,82.62%,68.75%,75.00%,67.80%,72.22% -15,Gemini-1.5-Pro-002 (FC),76.44%,76.31%,79.07%,75.50%,87.50%,75.00%,76.64%,76.47% -16,Gemini-1.5-Flash-001 (FC),75.51%,72.69%,72.09%,73.31%,62.50%,58.33%,80.16%,58.82% -17,Gemini-1.5-Pro-001 (FC),75.47%,70.69%,73.26%,70.18%,81.25%,58.33%,83.11%,58.82% -18,o1-preview-2024-09-12 (Prompt),75.29%,77.57%,82.17%,76.35%,81.25%,79.17%,71.54%,88.24% -19,Gemini-1.5-Flash-002 (Prompt),75.20%,74.76%,77.13%,74.26%,93.75%,58.33%,75.62%,88.24% -20,Qwen2.5-72B-Instruct (Prompt),75.03%,81.79%,84.11%,81.67%,62.50%,75.00%,64.29%,94.44% -21,GoGoAgent,74.84%,72.61%,74.81%,72.08%,81.25%,66.67%,77.89%,94.12% -22,DeepSeek-Coder-V2 (FC),73.43%,77.50%,80.62%,77.30%,50.00%,70.83%,67.01%,83.33% -23,xLAM-8x22b-r (FC),73.39%,80.46%,83.33%,80.15%,62.50%,75.00%,62.24%,88.89% -24,GPT-4o-mini-2024-07-18 (FC),73.24%,75.20%,75.19%,75.12%,87.50%,70.83%,70.07%,82.35% -25,Functionary-Small-v3.1 (FC),72.99%,77.35%,78.68%,77.49%,75.00%,58.33%,66.10%,83.33% -26,Mistral-small-2402 (FC),72.49%,68.91%,64.34%,72.17%,12.50%,12.50%,77.78%,82.35% -27,Claude-3.5-Sonnet-20241022 (Prompt),71.96%,80.90%,86.05%,80.44%,81.25%,45.83%,58.16%,76.47% -28,Hammer2.0-7b (FC),71.75%,77.28%,75.97%,77.59%,81.25%,75.00%,62.81%,94.44% -29,xLAM-8x7b-r (FC),71.08%,76.68%,72.48%,78.16%,62.50%,66.67%,62.02%,94.44% -30,claude-3.5-haiku-20241022 (Prompt),70.24%,75.20%,81.01%,73.98%,87.50%,58.33%,62.47%,77.78% -31,mistral-large-2407 (FC),69.73%,79.42%,85.66%,78.16%,68.75%,75.00%,54.76%,76.47% -32,MiniCPM3-4B-FC (FC),69.66%,65.06%,72.87%,63.63%,37.50%,62.50%,76.53%,77.78% -33,FireFunction-v1 (FC),69.56%,69.13%,68.99%,71.79%,0.00%,0.00%,69.73%,94.12% -34,xLAM-7b-r (FC),69.35%,73.35%,71.32%,74.45%,50.00%,62.50%,62.70%,94.44% -35,Open-Mixtral-8x22b (FC),68.71%,72.46%,75.19%,73.41%,6.25%,45.83%,62.70%,82.35% -36,GPT-3.5-Turbo-0125 (Prompt),68.62%,77.94%,78.29%,78.25%,75.00%,62.50%,53.85%,94.12% -37,Gemini-1.5-Flash-001 (Prompt),68.53%,75.87%,74.81%,75.78%,93.75%,79.17%,57.03%,82.35% -38,Command-R-Plus (Prompt) (Original),68.27%,76.09%,75.58%,76.26%,81.25%,70.83%,56.01%,82.35% -39,Gemini-1.0-Pro-002 (FC),68.00%,66.17%,73.26%,65.53%,37.50%,37.50%,70.63%,76.47% -40,Gemma-2-9b-it (Prompt),67.61%,73.72%,73.26%,74.26%,56.25%,66.67%,58.05%,77.78% -41,Qwen2.5-7B-Instruct (Prompt),66.95%,74.24%,74.81%,74.45%,62.50%,66.67%,55.44%,83.33% -42,Claude-3-Opus-20240229 (Prompt),66.80%,79.27%,84.11%,78.73%,75.00%,54.17%,47.39%,82.35% -43,GLM-4-9b-Chat (FC),66.50%,63.58%,71.32%,64.10%,0.00%,0.00%,70.98%,66.67% -44,FireFunction-v2 (FC),66.44%,75.20%,76.74%,75.50%,56.25%,58.33%,52.61%,88.24% -45,Gemma-2-27b-it (Prompt),66.15%,78.61%,83.33%,78.06%,68.75%,58.33%,46.60%,88.89% -46,Open-Mixtral-8x22b (Prompt),65.82%,74.46%,80.62%,72.84%,81.25%,75.00%,52.27%,82.35% -47,Open-Mistral-Nemo-2407 (FC),65.16%,69.73%,75.19%,68.28%,75.00%,70.83%,58.16%,64.71% -48,Meta-Llama-3-70B-Instruct (Prompt),65.04%,78.83%,81.01%,78.54%,75.00%,70.83%,43.31%,94.44% -49,Hammer2.0-1.5b (FC),64.86%,69.43%,74.03%,68.47%,56.25%,70.83%,57.48%,83.33% -50,Hermes-2-Pro-Llama-3-8B (FC),64.59%,65.95%,69.77%,65.53%,56.25%,50.00%,62.93%,44.44% -51,Claude-3-Haiku-20240307 (Prompt),64.22%,74.17%,77.13%,74.17%,56.25%,54.17%,48.87%,70.59% -52,GPT-4-turbo-2024-04-09 (Prompt),63.56%,84.68%,86.05%,84.24%,100.00%,79.17%,30.50%,100.00% -53,GPT-3.5-Turbo-0125 (FC),62.98%,77.50%,77.91%,78.35%,50.00%,54.17%,40.14%,94.12% -54,Llama-3.1-70B-Instruct (Prompt),62.02%,76.17%,77.52%,75.97%,87.50%,62.50%,39.68%,94.44% -55,Llama-3.1-8B-Instruct (Prompt),60.68%,71.95%,73.26%,72.36%,56.25%,50.00%,43.20%,72.22% -56,DBRX-Instruct (Prompt),60.58%,73.65%,77.52%,73.31%,75.00%,45.83%,39.91%,94.12% -57,Open-Mixtral-8x7b (Prompt),60.53%,64.03%,60.85%,65.05%,68.75%,50.00%,54.65%,88.24% -58,Qwen2.5-1.5B-Instruct (Prompt),60.46%,60.25%,68.60%,58.50%,56.25%,50.00%,60.43%,77.78% -59,Claude-3-Haiku-20240307 (FC),59.51%,75.80%,79.07%,77.87%,0.00%,0.00%,33.79%,100.00% -60,Granite-20b-FunctionCalling (FC),59.22%,57.66%,67.44%,55.56%,43.75%,54.17%,61.00%,88.89% -61,Command-R-Plus (FC) (Original),58.89%,62.69%,68.60%,61.82%,50.00%,45.83%,52.27%,100.00% -62,Mistral-Small-2402 (Prompt),58.18%,56.70%,34.50%,64.20%,0.00%,4.17%,60.43%,58.82% -63,Hermes-2-Pro-Mistral-7B (FC),57.49%,61.07%,67.44%,60.11%,50.00%,41.67%,51.81%,66.67% -64,Llama-3.2-3B-Instruct (Prompt),55.53%,63.29%,63.18%,64.39%,18.75%,45.83%,43.08%,83.33% -65,Nexusflow-Raven-v2 (FC),54.22%,39.45%,41.47%,38.75%,56.25%,37.50%,76.76%,58.82% -66,MiniCPM3-4B (Prompt),54.20%,36.64%,45.35%,34.19%,43.75%,45.83%,81.07%,55.56% -67,xLAM-7b-fc-r (FC),54.02%,60.47%,78.29%,57.36%,31.25%,25.00%,43.65%,77.78% -68,Hammer2.0-0.5b (FC),53.22%,45.82%,51.94%,44.25%,56.25%,41.67%,64.17%,72.22% -69,mistral-large-2407 (Prompt),52.62%,82.83%,86.05%,81.96%,93.75%,79.17%,5.44%,100.00% -70,Qwen2-7B-Instruct (Prompt),50.56%,60.47%,56.20%,61.73%,37.50%,66.67%,34.69%,83.33% -71,Gemini-1.0-Pro-002 (Prompt),48.80%,46.85%,48.06%,46.53%,62.50%,37.50%,51.13%,82.35% -72,Open-Mistral-Nemo-2407 (Prompt),48.67%,74.17%,77.13%,73.31%,87.50%,70.83%,8.73%,94.12% -73,Meta-Llama-3-8B-Instruct (Prompt),47.76%,60.47%,59.30%,61.73%,37.50%,33.33%,27.66%,77.78% -74,Llama-3.1-70B-Instruct (FC),45.27%,51.96%,51.94%,52.90%,31.25%,25.00%,33.90%,100.00% -75,Gemma-2-2b-it (Prompt),43.40%,19.47%,26.74%,18.42%,0.00%,0.00%,80.16%,38.89% -76,DeepSeek-Coder-V2-Lite-Instruct (FC),39.63%,3.55%,1.94%,3.70%,6.25%,12.50%,95.58%,5.56% -77,Qwen2-1.5B-Instruct (Prompt),38.34%,40.49%,47.67%,39.41%,18.75%,25.00%,34.13%,83.33% -78,xLAM-1b-fc-r (FC),37.54%,54.33%,65.89%,53.56%,0.00%,0.00%,10.54%,100.00% -79,Llama-3.1-8B-Instruct (FC),33.19%,48.56%,50.00%,48.62%,37.50%,37.50%,8.39%,94.44% -80,Llama-3.2-1B-Instruct (Prompt),31.36%,11.92%,30.62%,7.50%,12.50%,4.17%,61.11%,33.33% \ No newline at end of file +1,GPT-4-turbo-2024-04-09 (FC),80.45%,79.42%,83.33%,78.63%,81.25%,70.83%,82.20%,72.22% +2,o1-2024-12-17 (Prompt),80.45%,77.50%,81.78%,76.54%,81.25%,70.83%,85.15%,72.22% +3,gpt-4o-2024-11-20 (Prompt),79.65%,80.46%,83.72%,79.77%,87.50%,70.83%,78.34%,83.33% +4,gpt-4o-2024-11-20 (FC),79.61%,79.27%,81.01%,78.82%,87.50%,75.00%,80.05%,83.33% +5,Claude-3.5-Sonnet-20241022 (FC),78.85%,80.46%,83.33%,81.96%,25.00%,20.83%,76.42%,77.78% +6,ToolACE-8B (FC),78.50%,75.87%,72.48%,76.73%,81.25%,70.83%,82.43%,83.33% +7,o1-mini-2024-09-12 (Prompt),78.05%,71.80%,71.71%,71.60%,75.00%,79.17%,87.98%,61.11% +8,Gemini-1.5-Flash-002 (FC),77.97%,70.84%,72.09%,70.18%,81.25%,79.17%,89.34%,55.56% +9,Claude-3-Opus-20240229 (FC),77.92%,74.98%,77.91%,75.78%,31.25%,37.50%,82.77%,61.11% +10,o1-2024-12-17 (FC),77.92%,77.05%,81.01%,79.01%,0.00%,0.00%,79.37%,72.22% +11,watt-tool-70B (FC),77.65%,83.42%,84.88%,83.48%,81.25%,66.67%,68.48%,94.44% +12,Mistral-Medium-2312 (Prompt),77.52%,74.02%,75.19%,74.07%,81.25%,54.17%,83.11%,66.67% +13,Gemini-1.5-Pro-001 (Prompt),76.63%,73.06%,75.97%,71.98%,93.75%,75.00%,82.54%,55.56% +14,Functionary-Medium-v3.1 (FC),76.59%,82.53%,81.01%,83.29%,68.75%,75.00%,67.57%,72.22% +15,Gemini-1.5-Flash-002 (Prompt),76.54%,76.98%,80.62%,76.16%,93.75%,62.50%,75.74%,83.33% +16,Gemini-1.5-Pro-002 (Prompt),76.54%,78.39%,81.78%,77.40%,87.50%,79.17%,73.81%,72.22% +17,watt-tool-8B (FC),76.37%,77.13%,75.97%,77.49%,87.50%,66.67%,75.06%,83.33% +18,GPT-4o-mini-2024-07-18 (Prompt),76.32%,77.57%,80.23%,76.73%,93.75%,75.00%,74.26%,83.33% +19,Gemini-1.5-Flash-001 (FC),76.28%,74.02%,75.19%,74.26%,62.50%,58.33%,80.27%,50.00% +20,Gemini-1.5-Pro-001 (FC),76.23%,71.65%,75.58%,70.75%,81.25%,62.50%,83.79%,50.00% +21,Gemini-1.5-Pro-002 (FC),76.19%,76.17%,79.46%,75.21%,87.50%,75.00%,76.30%,72.22% +22,Qwen2.5-72B-Instruct (Prompt),75.21%,82.24%,84.50%,82.15%,62.50%,75.00%,63.95%,100.00% +23,xLAM-7b-r (FC),75.08%,73.72%,71.32%,74.93%,50.00%,62.50%,86.72%,94.44% +24,Hammer2.1-7b (FC),75.02%,77.05%,76.36%,77.40%,81.25%,66.67%,71.77%,82.35% +25,GPT-4o-mini-2024-07-18 (FC),74.37%,76.61%,78.29%,76.16%,87.50%,70.83%,70.75%,83.33% +26,Qwen2.5-32B-Instruct (Prompt),74.14%,78.68%,82.17%,78.54%,62.50%,58.33%,66.67%,100.00% +27,Qwen2.5-14B-Instruct (Prompt),74.10%,75.13%,74.03%,75.78%,62.50%,66.67%,72.45%,77.78% +28,GoGoAgent,73.92%,74.54%,72.09%,75.40%,68.75%,66.67%,72.90%,77.78% +29,Hammer2.1-3b (FC),73.91%,72.83%,72.48%,73.31%,62.50%,62.50%,75.40%,82.35% +30,Functionary-Small-v3.1 (FC),73.66%,78.09%,79.07%,78.16%,81.25%,62.50%,66.78%,77.78% +31,DeepSeek-Coder-V2 (FC),73.43%,77.13%,80.23%,77.02%,43.75%,70.83%,67.46%,88.89% +32,xLAM-8x22b-r (FC),72.55%,79.57%,79.46%,79.68%,81.25%,75.00%,61.45%,88.89% +33,claude-3.5-haiku-20241022 (FC),72.28%,76.98%,82.17%,78.35%,18.75%,0.00%,64.85%,83.33% +34,Mistral-small-2402 (FC),72.10%,68.47%,64.73%,71.51%,12.50%,12.50%,77.55%,77.78% +35,Claude-3.5-Sonnet-20241022 (Prompt),71.88%,80.61%,86.05%,80.06%,81.25%,45.83%,58.39%,77.78% +36,xLAM-8x7b-r (FC),70.99%,77.50%,74.03%,79.30%,43.75%,58.33%,60.54%,94.44% +37,claude-3.5-haiku-20241022 (Prompt),70.64%,76.46%,83.72%,75.02%,87.50%,54.17%,61.56%,77.78% +38,Hammer2.1-1.5b (FC),70.59%,69.65%,70.93%,69.80%,50.00%,62.50%,71.88%,77.78% +39,FireFunction-v1 (FC),70.41%,70.47%,71.32%,72.93%,0.00%,0.00%,69.84%,94.44% +40,MiniCPM3-4B-FC (FC),69.97%,65.66%,74.42%,63.91%,43.75%,62.50%,76.53%,72.22% +41,mistral-large-2407 (FC),69.84%,79.57%,84.88%,78.54%,62.50%,79.17%,54.88%,72.22% +42,Gemini-1.0-Pro-002 (FC),69.57%,68.69%,77.13%,67.62%,43.75%,41.67%,70.98%,66.67% +43,Command R7B (FC),69.21%,59.66%,63.18%,58.69%,56.25%,66.67%,84.13%,55.56% +44,Gemini-1.5-Flash-001 (Prompt),68.86%,76.54%,76.74%,76.16%,93.75%,79.17%,56.80%,83.33% +45,Open-Mixtral-8x22b (FC),68.55%,72.46%,76.36%,73.12%,6.25%,45.83%,62.24%,83.33% +46,GPT-3.5-Turbo-0125 (Prompt),68.46%,78.46%,79.84%,78.63%,75.00%,58.33%,52.61%,94.44% +47,DeepSeek-V3 (FC),68.33%,81.94%,82.95%,82.15%,81.25%,62.50%,47.05%,88.89% +48,Gemma-2-9b-it (Prompt),67.84%,74.32%,76.36%,74.26%,62.50%,62.50%,57.60%,83.33% +49,Qwen2.5-7B-Instruct (Prompt),67.35%,74.91%,75.97%,74.93%,62.50%,70.83%,55.33%,88.89% +50,Gemma-2-27b-it (Prompt),67.04%,79.94%,84.50%,79.39%,68.75%,62.50%,46.71%,94.44% +51,Claude-3-Opus-20240229 (Prompt),66.86%,79.50%,84.11%,79.11%,68.75%,54.17%,47.17%,83.33% +52,GLM-4-9b-Chat (FC),66.77%,63.95%,72.09%,64.39%,0.00%,0.00%,71.09%,66.67% +53,Open-Mixtral-8x22b (Prompt),65.93%,74.61%,82.17%,72.65%,81.25%,75.00%,52.27%,83.33% +54,Open-Mistral-Nemo-2407 (FC),65.93%,71.06%,77.13%,69.61%,75.00%,66.67%,58.05%,66.67% +55,FireFunction-v2 (FC),65.57%,77.94%,78.29%,78.35%,56.25%,70.83%,46.03%,94.44% +56,Ministral-8B-Instruct-2410 (FC),64.93%,72.61%,75.19%,72.27%,62.50%,66.67%,53.06%,70.59% +57,Hermes-2-Pro-Llama-3-8B (FC),64.90%,66.54%,71.71%,65.81%,56.25%,50.00%,62.81%,44.44% +58,Meta-Llama-3-70B-Instruct (Prompt),64.90%,78.46%,80.62%,78.25%,75.00%,66.67%,43.42%,100.00% +59,GPT-3.5-Turbo-0125 (FC),63.93%,79.05%,80.62%,79.68%,43.75%,58.33%,40.14%,94.44% +60,GPT-4-turbo-2024-04-09 (Prompt),63.71%,84.75%,87.21%,84.14%,100.00%,75.00%,30.73%,100.00% +61,Hammer2.1-0.5b (FC),62.86%,58.03%,59.69%,58.02%,50.00%,45.83%,69.95%,77.78% +62,Llama-3.3-70B-Instruct (Prompt),62.59%,77.72%,80.62%,77.11%,93.75%,62.50%,38.66%,100.00% +63,Llama-3.1-70B-Instruct (Prompt),62.06%,76.24%,77.13%,76.16%,87.50%,62.50%,39.57%,100.00% +64,Open-Mixtral-8x7b (Prompt),61.39%,65.28%,63.18%,66.10%,68.75%,50.00%,54.88%,88.89% +65,Qwen2.5-1.5B-Instruct (Prompt),61.04%,60.99%,70.16%,59.26%,56.25%,41.67%,60.66%,83.33% +66,Llama-3.1-8B-Instruct (Prompt),60.95%,72.69%,73.26%,73.31%,56.25%,50.00%,42.63%,77.78% +67,DBRX-Instruct (Prompt),60.15%,73.28%,77.13%,73.03%,75.00%,41.67%,39.34%,94.44% +68,Granite-20b-FunctionCalling (FC),59.57%,58.33%,67.83%,56.32%,43.75%,54.17%,60.88%,88.89% +69,Command-R-Plus (FC),58.91%,60.70%,69.77%,58.78%,62.50%,45.83%,55.90%,72.22% +70,Mistral-Small-2402 (Prompt),58.73%,57.88%,36.05%,65.24%,0.00%,8.33%,60.32%,44.44% +71,Qwen2.5-3B-Instruct (Prompt),58.60%,66.77%,68.99%,66.48%,56.25%,62.50%,45.46%,88.89% +72,Hermes-2-Pro-Mistral-7B (FC),57.62%,61.21%,68.99%,60.02%,43.75%,41.67%,51.93%,66.67% +73,Llama-3.2-3B-Instruct (Prompt),55.75%,63.66%,63.57%,64.86%,12.50%,45.83%,42.97%,88.89% +74,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00% +75,Nexusflow-Raven-v2 (FC),54.15%,39.38%,41.47%,38.65%,56.25%,37.50%,76.64%,61.11% +76,xLAM-7b-fc-r (FC),53.35%,60.99%,78.29%,58.02%,31.25%,25.00%,41.16%,77.78% +77,mistral-large-2407 (Prompt),52.69%,82.68%,85.27%,81.96%,93.75%,79.17%,5.78%,100.00% +78,Qwen2-7B-Instruct (Prompt),50.60%,60.77%,56.59%,62.01%,37.50%,66.67%,34.24%,88.89% +79,Gemini-1.0-Pro-002 (Prompt),49.09%,47.52%,50.39%,47.01%,62.50%,29.17%,50.91%,77.78% +80,Open-Mistral-Nemo-2407 (Prompt),48.96%,74.98%,77.13%,74.45%,87.50%,66.67%,8.28%,88.89% +81,Meta-Llama-3-8B-Instruct (Prompt),47.93%,60.55%,60.85%,61.44%,37.50%,33.33%,28.00%,77.78% +82,Llama-3.1-70B-Instruct (FC),44.96%,51.74%,51.94%,52.61%,31.25%,25.00%,33.45%,100.00% +83,Gemma-2-2b-it (Prompt),43.76%,19.47%,26.36%,18.52%,0.00%,0.00%,81.07%,38.89% +84,DeepSeek-Coder-V2-Lite-Instruct (FC),39.40%,3.55%,2.33%,3.80%,0.00%,8.33%,95.12%,0.00% +85,Qwen2-1.5B-Instruct (Prompt),39.00%,41.23%,48.45%,40.27%,12.50%,25.00%,34.47%,94.44% +86,xLAM-1b-fc-r (FC),36.92%,53.89%,63.95%,53.37%,6.25%,0.00%,9.64%,100.00% +87,Llama-3.1-8B-Instruct (FC),33.45%,49.22%,51.55%,49.00%,37.50%,41.67%,8.05%,94.44% +88,Qwen2.5-0.5B-Instruct (Prompt),31.59%,38.34%,53.88%,34.76%,56.25%,16.67%,19.95%,94.44% +89,Llama-3.2-1B-Instruct (Prompt),31.36%,12.14%,31.40%,7.60%,12.50%,4.17%,60.66%,38.89% \ No newline at end of file diff --git a/data_multi_turn.csv b/data_multi_turn.csv index f06bebf3b..5c7aad59b 100644 --- a/data_multi_turn.csv +++ b/data_multi_turn.csv @@ -1,81 +1,90 @@ Rank,Model,Multi Turn Overall Acc,Base,Miss Func,Miss Param,Long Context -1,Claude-3.5-Sonnet-20241022 (FC),41.00%,55.00%,19.00%,42.50%,47.50% -2,GPT-4o-2024-08-06 (FC),39.12%,58.00%,10.00%,37.00%,51.50% -3,GPT-4-turbo-2024-04-09 (FC),38.12%,54.00%,13.50%,35.50%,49.50% -4,GPT-4o-2024-08-06 (Prompt),37.25%,44.00%,31.50%,29.50%,44.00% -5,o1-preview-2024-09-12 (Prompt),36.88%,47.50%,38.50%,31.50%,30.00% -6,GPT-4o-mini-2024-07-18 (FC),34.12%,47.50%,19.50%,29.00%,40.50% -7,Claude-3-Opus-20240229 (FC),30.25%,41.50%,14.00%,33.50%,32.00% -8,GPT-4-turbo-2024-04-09 (Prompt),30.25%,42.50%,25.00%,20.50%,33.00% -9,o1-mini-2024-09-12 (Prompt),28.25%,40.50%,5.00%,34.50%,33.00% -10,Claude-3-Haiku-20240307 (FC),24.50%,35.50%,11.50%,22.00%,29.00% -11,mistral-large-2407 (FC),23.75%,33.50%,18.00%,23.50%,20.00% -12,GPT-4o-mini-2024-07-18 (Prompt),22.00%,33.00%,12.00%,17.00%,26.00% -13,Gemini-1.5-Pro-002 (FC),21.62%,31.00%,5.00%,21.00%,29.50% -14,Functionary-Medium-v3.1 (FC),21.38%,31.50%,21.00%,26.50%,6.50% -15,Gemini-1.5-Pro-002 (Prompt),20.75%,23.00%,19.50%,17.50%,23.00% -16,GPT-3.5-Turbo-0125 (FC),19.50%,32.50%,11.50%,21.50%,12.50% -17,Gemini-1.5-Flash-001 (Prompt),19.50%,27.50%,20.00%,12.00%,18.50% -18,Gemini-1.5-Pro-001 (Prompt),18.88%,26.00%,5.00%,21.50%,23.00% -19,Qwen2.5-72B-Instruct (Prompt),17.25%,23.50%,20.00%,13.50%,12.00% -20,xLAM-8x22b-r (FC),16.25%,25.50%,16.00%,11.50%,12.00% -21,Gemini-1.5-Pro-001 (FC),16.00%,24.50%,3.00%,15.50%,21.00% -22,xLAM-8x7b-r (FC),15.50%,26.00%,13.00%,11.50%,11.50% -23,Gemini-1.5-Flash-001 (FC),13.87%,19.00%,3.50%,14.00%,19.00% -24,Gemini-1.5-Flash-002 (Prompt),12.50%,17.50%,6.00%,11.50%,15.00% -25,Llama-3.1-70B-Instruct (Prompt),12.38%,16.50%,13.00%,10.50%,9.50% -26,Gemini-1.5-Flash-002 (FC),11.62%,19.00%,0.50%,10.50%,16.50% -27,palmyra-x-004 (FC),11.37%,12.00%,2.50%,18.50%,12.50% -28,xLAM-7b-r (FC),10.00%,16.50%,8.50%,7.50%,7.50% -29,Functionary-Small-v3.1 (FC),9.88%,17.00%,2.50%,14.00%,6.00% -30,claude-3.5-haiku-20241022 (Prompt),9.75%,16.00%,0.50%,8.00%,14.50% -31,Llama-3.1-8B-Instruct (Prompt),9.25%,12.00%,10.00%,7.00%,8.00% -32,Open-Mistral-Nemo-2407 (FC),9.12%,15.00%,3.50%,9.00%,9.00% -33,FireFunction-v2 (FC),8.62%,13.50%,7.00%,11.00%,3.00% -34,mistral-large-2407 (Prompt),8.38%,15.00%,6.00%,6.00%,6.50% -35,ToolACE-8B (FC),7.75%,7.50%,11.50%,5.00%,7.00% -36,Qwen2.5-7B-Instruct (Prompt),7.62%,9.50%,8.50%,7.00%,5.50% -37,Claude-3.5-Sonnet-20241022 (Prompt),7.50%,9.00%,5.50%,5.00%,10.50% -38,Claude-3-Opus-20240229 (Prompt),7.13%,11.50%,2.50%,6.00%,8.50% -39,Meta-Llama-3-70B-Instruct (Prompt),5.62%,10.00%,4.00%,6.00%,2.50% -40,GPT-3.5-Turbo-0125 (Prompt),5.62%,9.00%,2.00%,7.00%,4.50% -41,Hammer2.0-7b (FC),5.50%,9.00%,2.00%,7.00%,4.00% -42,Llama-3.1-8B-Instruct (FC),5.38%,5.00%,7.50%,5.00%,4.00% -43,Llama-3.2-3B-Instruct (Prompt),5.25%,8.50%,2.50%,4.50%,5.50% -44,Llama-3.1-70B-Instruct (FC),4.88%,7.00%,4.00%,4.50%,4.00% -45,DeepSeek-Coder-V2 (FC),4.50%,7.50%,3.00%,4.00%,3.50% -46,GLM-4-9b-Chat (FC),3.50%,3.50%,4.00%,2.50%,4.00% -47,Granite-20b-FunctionCalling (FC),3.38%,6.00%,1.50%,4.50%,1.50% -48,Qwen2-7B-Instruct (Prompt),3.25%,4.00%,4.50%,2.50%,2.00% -49,Gemini-1.0-Pro-002 (FC),2.88%,4.50%,1.00%,3.50%,2.50% -50,Hermes-2-Pro-Mistral-7B (FC),2.63%,3.50%,4.00%,2.50%,0.50% -51,Mistral-small-2402 (FC),2.62%,4.50%,0.00%,3.00%,3.00% -52,MiniCPM3-4B-FC (FC),2.62%,5.00%,1.00%,3.00%,1.50% -53,FireFunction-v1 (FC),2.38%,5.00%,0.00%,2.00%,2.50% -54,Hermes-2-Pro-Llama-3-8B (FC),2.38%,4.50%,1.50%,2.00%,1.50% -55,Gemma-2-27b-it (Prompt),2.38%,4.50%,2.00%,1.50%,1.50% -56,MiniCPM3-4B (Prompt),2.00%,3.00%,3.50%,1.00%,0.50% -57,Command-R-Plus (FC) (Original),2.00%,3.50%,0.00%,1.50%,3.00% -58,Hammer2.0-1.5b (FC),1.75%,2.00%,1.00%,1.50%,2.50% -59,Claude-3-Haiku-20240307 (Prompt),1.62%,3.50%,0.00%,0.00%,3.00% -60,Gemma-2-9b-it (Prompt),1.62%,2.00%,4.00%,0.50%,0.00% -61,Open-Mixtral-8x22b (FC),1.50%,3.50%,0.00%,1.00%,1.50% -62,Open-Mixtral-8x7b (Prompt),1.50%,2.50%,0.00%,1.50%,2.00% -63,Gemini-1.0-Pro-002 (Prompt),1.38%,2.50%,1.50%,0.50%,1.00% -64,Qwen2.5-1.5B-Instruct (Prompt),1.12%,1.50%,2.50%,0.50%,0.00% -65,Nexusflow-Raven-v2 (FC),1.00%,1.50%,0.50%,1.00%,1.00% -66,GoGoAgent,1.00%,1.50%,2.00%,0.50%,0.00% -67,Meta-Llama-3-8B-Instruct (Prompt),0.75%,1.50%,0.00%,1.00%,0.50% -68,Mistral-Small-2402 (Prompt),0.75%,0.50%,0.00%,1.50%,1.00% -69,Hammer2.0-0.5b (FC),0.50%,0.50%,0.00%,0.50%,1.00% -70,Open-Mixtral-8x22b (Prompt),0.50%,1.00%,0.00%,0.00%,1.00% -71,Qwen2-1.5B-Instruct (Prompt),0.50%,0.50%,1.00%,0.00%,0.50% -72,Mistral-Medium-2312 (Prompt),0.38%,1.00%,0.00%,0.00%,0.50% -73,Command-R-Plus (Prompt) (Original),0.38%,1.00%,0.00%,0.00%,0.50% -74,Open-Mistral-Nemo-2407 (Prompt),0.25%,0.50%,0.00%,0.00%,0.50% -75,DeepSeek-Coder-V2-Lite-Instruct (FC),0.12%,0.50%,0.00%,0.00%,0.00% -76,xLAM-1b-fc-r (FC),0.12%,0.50%,0.00%,0.00%,0.00% -77,DBRX-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% -78,Gemma-2-2b-it (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% -79,Llama-3.2-1B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% -80,xLAM-7b-fc-r (FC),0.00%,0.00%,0.00%,0.00%,0.00% \ No newline at end of file +1,watt-tool-70B (FC),58.62%,67.00%,57.50%,48.50%,61.50% +2,gpt-4o-2024-11-20 (Prompt),47.62%,59.00%,41.00%,35.50%,55.00% +3,Claude-3.5-Sonnet-20241022 (FC),41.00%,55.00%,19.00%,42.50%,47.50% +4,gpt-4o-2024-11-20 (FC),41.00%,62.50%,6.00%,37.50%,58.00% +5,o1-2024-12-17 (FC),41.00%,52.50%,38.00%,30.50%,43.00% +6,claude-3.5-haiku-20241022 (FC),40.00%,54.50%,26.50%,35.00%,44.00% +7,watt-tool-8B (FC),39.13%,47.00%,41.50%,27.50%,40.50% +8,GPT-4-turbo-2024-04-09 (FC),38.12%,54.00%,13.50%,35.50%,49.50% +9,o1-2024-12-17 (Prompt),36.00%,50.50%,0.50%,48.50%,44.50% +10,GPT-4o-mini-2024-07-18 (FC),34.12%,47.50%,19.50%,29.00%,40.50% +11,Claude-3-Opus-20240229 (FC),30.25%,41.50%,14.00%,33.50%,32.00% +12,GPT-4-turbo-2024-04-09 (Prompt),30.25%,42.50%,25.00%,20.50%,33.00% +13,o1-mini-2024-09-12 (Prompt),28.25%,40.50%,5.00%,34.50%,33.00% +14,mistral-large-2407 (FC),23.75%,33.50%,18.00%,23.50%,20.00% +15,Hammer2.1-7b (FC),23.50%,35.50%,25.50%,19.00%,14.00% +16,GPT-4o-mini-2024-07-18 (Prompt),22.00%,33.00%,12.00%,17.00%,26.00% +17,Gemini-1.5-Pro-002 (FC),21.62%,31.00%,5.00%,21.00%,29.50% +18,Functionary-Medium-v3.1 (FC),21.38%,31.50%,21.00%,26.50%,6.50% +19,Gemini-1.5-Pro-002 (Prompt),20.75%,23.00%,19.50%,17.50%,23.00% +20,GPT-3.5-Turbo-0125 (FC),19.50%,32.50%,11.50%,21.50%,12.50% +21,Gemini-1.5-Flash-001 (Prompt),19.50%,27.50%,20.00%,12.00%,18.50% +22,Gemini-1.5-Pro-001 (Prompt),18.88%,26.00%,5.00%,21.50%,23.00% +23,DeepSeek-V3 (FC),18.62%,21.00%,20.50%,19.00%,14.00% +24,Qwen2.5-72B-Instruct (Prompt),18.00%,24.50%,20.00%,15.50%,12.00% +25,Qwen2.5-32B-Instruct (Prompt),17.75%,25.00%,20.00%,15.00%,11.00% +26,Hammer2.1-3b (FC),17.38%,27.50%,17.50%,14.50%,10.00% +27,xLAM-8x22b-r (FC),16.25%,25.50%,16.00%,11.50%,12.00% +28,Gemini-1.5-Pro-001 (FC),16.00%,24.50%,3.00%,15.50%,21.00% +29,xLAM-8x7b-r (FC),15.50%,26.00%,13.00%,11.50%,11.50% +30,Gemini-1.5-Flash-001 (FC),13.87%,19.00%,3.50%,14.00%,19.00% +31,Command-R-Plus (FC),13.12%,16.50%,10.00%,9.00%,17.00% +32,Gemini-1.5-Flash-002 (Prompt),12.50%,17.50%,6.00%,11.50%,15.00% +33,Llama-3.1-70B-Instruct (Prompt),12.38%,16.50%,13.00%,10.50%,9.50% +34,Qwen2.5-14B-Instruct (Prompt),12.12%,18.50%,11.50%,12.00%,6.50% +35,Gemini-1.5-Flash-002 (FC),11.62%,19.00%,0.50%,10.50%,16.50% +36,Ministral-8B-Instruct-2410 (FC),11.25%,21.00%,8.50%,10.00%,5.50% +37,Hammer2.1-1.5b (FC),10.50%,14.50%,12.50%,9.00%,6.00% +38,xLAM-7b-r (FC),10.00%,16.50%,8.50%,7.50%,7.50% +39,Functionary-Small-v3.1 (FC),9.88%,17.00%,2.50%,14.00%,6.00% +40,claude-3.5-haiku-20241022 (Prompt),9.75%,16.00%,0.50%,8.00%,14.50% +41,Llama-3.1-8B-Instruct (Prompt),9.25%,12.00%,10.00%,7.00%,8.00% +42,Open-Mistral-Nemo-2407 (FC),9.12%,15.00%,3.50%,9.00%,9.00% +43,FireFunction-v2 (FC),8.62%,13.50%,7.00%,11.00%,3.00% +44,mistral-large-2407 (Prompt),8.38%,15.00%,6.00%,6.00%,6.50% +45,ToolACE-8B (FC),7.75%,7.50%,11.50%,5.00%,7.00% +46,Qwen2.5-7B-Instruct (Prompt),7.62%,9.50%,8.50%,7.00%,5.50% +47,Claude-3.5-Sonnet-20241022 (Prompt),7.50%,9.00%,5.50%,5.00%,10.50% +48,Claude-3-Opus-20240229 (Prompt),7.13%,11.50%,2.50%,6.00%,8.50% +49,Llama-3.3-70B-Instruct (Prompt),6.87%,9.00%,8.00%,4.50%,6.00% +50,Meta-Llama-3-70B-Instruct (Prompt),5.62%,10.00%,4.00%,6.00%,2.50% +51,GPT-3.5-Turbo-0125 (Prompt),5.62%,9.00%,2.00%,7.00%,4.50% +52,Llama-3.1-8B-Instruct (FC),5.38%,5.00%,7.50%,5.00%,4.00% +53,Llama-3.2-3B-Instruct (Prompt),5.25%,8.50%,2.50%,4.50%,5.50% +54,Command R7B (FC),5.00%,6.50%,1.50%,6.50%,5.50% +55,Llama-3.1-70B-Instruct (FC),4.88%,7.00%,4.00%,4.50%,4.00% +56,DeepSeek-Coder-V2 (FC),4.50%,7.50%,3.00%,4.00%,3.50% +57,GLM-4-9b-Chat (FC),3.50%,3.50%,4.00%,2.50%,4.00% +58,Granite-20b-FunctionCalling (FC),3.38%,6.00%,1.50%,4.50%,1.50% +59,Qwen2.5-3B-Instruct (Prompt),3.38%,5.50%,3.50%,2.00%,2.50% +60,Qwen2-7B-Instruct (Prompt),3.25%,4.00%,4.50%,2.50%,2.00% +61,Gemini-1.0-Pro-002 (FC),2.88%,4.50%,1.00%,3.50%,2.50% +62,Hermes-2-Pro-Mistral-7B (FC),2.63%,3.50%,4.00%,2.50%,0.50% +63,Mistral-small-2402 (FC),2.62%,4.50%,0.00%,3.00%,3.00% +64,MiniCPM3-4B-FC (FC),2.62%,5.00%,1.00%,3.00%,1.50% +65,FireFunction-v1 (FC),2.38%,5.00%,0.00%,2.00%,2.50% +66,Hermes-2-Pro-Llama-3-8B (FC),2.38%,4.50%,1.50%,2.00%,1.50% +67,Gemma-2-27b-it (Prompt),2.38%,4.50%,2.00%,1.50%,1.50% +68,Hammer2.1-0.5b (FC),2.25%,4.00%,0.50%,3.00%,1.50% +69,MiniCPM3-4B (Prompt),2.00%,3.00%,3.50%,1.00%,0.50% +70,Gemma-2-9b-it (Prompt),1.62%,2.00%,4.00%,0.50%,0.00% +71,Open-Mixtral-8x22b (FC),1.50%,3.50%,0.00%,1.00%,1.50% +72,Open-Mixtral-8x7b (Prompt),1.50%,2.50%,0.00%,1.50%,2.00% +73,Gemini-1.0-Pro-002 (Prompt),1.38%,2.50%,1.50%,0.50%,1.00% +74,Qwen2.5-1.5B-Instruct (Prompt),1.12%,1.50%,2.50%,0.50%,0.00% +75,Nexusflow-Raven-v2 (FC),1.00%,1.50%,0.50%,1.00%,1.00% +76,GoGoAgent,1.00%,1.50%,2.00%,0.50%,0.00% +77,Meta-Llama-3-8B-Instruct (Prompt),0.75%,1.50%,0.00%,1.00%,0.50% +78,Mistral-Small-2402 (Prompt),0.75%,0.50%,0.00%,1.50%,1.00% +79,Open-Mixtral-8x22b (Prompt),0.50%,1.00%,0.00%,0.00%,1.00% +80,Qwen2-1.5B-Instruct (Prompt),0.50%,0.50%,1.00%,0.00%,0.50% +81,Mistral-Medium-2312 (Prompt),0.38%,1.00%,0.00%,0.00%,0.50% +82,Open-Mistral-Nemo-2407 (Prompt),0.25%,0.50%,0.00%,0.00%,0.50% +83,DeepSeek-Coder-V2-Lite-Instruct (FC),0.12%,0.50%,0.00%,0.00%,0.00% +84,xLAM-1b-fc-r (FC),0.12%,0.50%,0.00%,0.00%,0.00% +85,DBRX-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% +86,Gemma-2-2b-it (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% +87,Llama-3.2-1B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% +88,xLAM-7b-fc-r (FC),0.00%,0.00%,0.00%,0.00%,0.00% +89,Qwen2.5-0.5B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% \ No newline at end of file diff --git a/data_non_live.csv b/data_non_live.csv index a05b71e1f..400c2b3fd 100644 --- a/data_non_live.csv +++ b/data_non_live.csv @@ -1,81 +1,90 @@ Rank,Model,Non_Live Overall Acc,AST Summary,Exec Summary,Simple AST,Python Simple AST,Java Simple AST,JavaScript Simple AST,Multiple AST,Parallel AST,Parallel Multiple AST,Simple Exec,Python Simple Exec,REST Simple Exec,Multiple Exec,Parallel Exec,Parallel Multiple Exec,Irrelevance Detection -1,Qwen2.5-72B-Instruct (Prompt),90.37%,90.85%,92.07%,80.92%,98.75%,62.00%,82.00%,97.50%,93.50%,91.50%,99.29%,100.00%,98.57%,94.00%,90.00%,85.00%,81.67% -2,Functionary-Medium-v3.1 (FC),89.77%,89.52%,91.32%,76.08%,96.25%,64.00%,68.00%,96.50%,94.50%,91.00%,99.29%,100.00%,98.57%,94.00%,92.00%,80.00%,84.58% -3,Gemini-1.5-Pro-002 (Prompt),88.83%,87.98%,91.27%,78.92%,94.75%,64.00%,78.00%,92.50%,91.50%,89.00%,98.57%,100.00%,97.14%,94.00%,90.00%,82.50%,82.50% -4,ToolACE-8B (FC),88.82%,87.29%,89.21%,76.67%,91.00%,65.00%,74.00%,94.00%,90.00%,88.50%,97.36%,99.00%,95.71%,94.00%,88.00%,77.50%,93.33% -5,GoGoAgent,88.42%,85.75%,89.86%,74.50%,92.50%,63.00%,68.00%,92.00%,89.50%,87.00%,95.43%,98.00%,92.86%,96.00%,88.00%,80.00%,93.33% -6,DeepSeek-Coder-V2 (FC),88.41%,89.15%,91.23%,78.08%,96.25%,64.00%,74.00%,95.00%,93.50%,90.00%,96.43%,100.00%,92.86%,94.00%,92.00%,82.50%,74.17% -7,Hammer2.0-7b (FC),88.31%,90.50%,88.62%,80.50%,97.50%,66.00%,78.00%,95.50%,94.00%,92.00%,89.50%,99.00%,80.00%,94.00%,86.00%,85.00%,78.33% -8,Llama-3.1-70B-Instruct (Prompt),87.77%,89.85%,90.12%,77.92%,95.75%,62.00%,76.00%,96.50%,94.00%,91.00%,94.00%,98.00%,90.00%,98.00%,86.00%,82.50%,70.00% -9,Gemma-2-27b-it (Prompt),87.16%,89.10%,89.09%,80.42%,94.25%,63.00%,84.00%,93.00%,91.00%,92.00%,87.86%,100.00%,75.71%,98.00%,88.00%,82.50%,71.67% -10,o1-preview-2024-09-12 (Prompt),86.62%,86.19%,88.70%,76.75%,92.25%,66.00%,72.00%,94.00%,90.00%,84.00%,99.29%,100.00%,98.57%,94.00%,84.00%,77.50%,80.00% -11,Qwen2.5-7B-Instruct (Prompt),86.01%,86.48%,88.29%,75.92%,95.75%,60.00%,72.00%,95.00%,91.00%,84.00%,92.14%,100.00%,84.29%,90.00%,86.00%,85.00%,75.00% -12,Gemini-1.5-Pro-001 (FC),85.52%,83.23%,87.95%,69.92%,92.75%,55.00%,62.00%,92.00%,90.50%,80.50%,91.79%,95.00%,88.57%,92.00%,88.00%,80.00%,85.00% -13,Functionary-Small-v3.1 (FC),85.44%,86.38%,87.12%,74.00%,96.00%,62.00%,64.00%,94.00%,90.00%,87.50%,89.50%,99.00%,80.00%,94.00%,90.00%,75.00%,75.00% -14,Gemini-1.5-Pro-001 (Prompt),85.16%,84.06%,85.77%,74.75%,93.25%,59.00%,72.00%,90.50%,91.00%,80.00%,91.57%,96.00%,87.14%,90.00%,84.00%,77.50%,87.08% -15,Gemini-1.5-Pro-002 (FC),85.06%,87.40%,84.61%,74.08%,94.25%,58.00%,70.00%,94.00%,92.00%,89.50%,75.93%,99.00%,52.86%,94.00%,86.00%,82.50%,77.50% -16,Gemma-2-9b-it (Prompt),85.02%,84.92%,87.52%,75.67%,93.00%,60.00%,74.00%,90.50%,88.00%,85.50%,88.07%,99.00%,77.14%,94.00%,88.00%,80.00%,75.42% -17,Granite-20b-FunctionCalling (FC),84.97%,82.21%,86.59%,72.83%,90.50%,66.00%,62.00%,91.00%,84.00%,81.00%,86.36%,97.00%,75.71%,92.00%,88.00%,80.00%,89.58% -18,GPT-4-turbo-2024-04-09 (FC),84.95%,84.56%,85.21%,69.75%,92.25%,59.00%,58.00%,91.00%,91.00%,86.50%,87.36%,99.00%,75.71%,90.00%,86.00%,77.50%,85.42% -19,Meta-Llama-3-70B-Instruct (Prompt),84.70%,87.77%,88.21%,76.58%,94.75%,61.00%,74.00%,94.50%,92.50%,87.50%,95.86%,96.00%,95.71%,94.00%,78.00%,85.00%,58.33% -20,GPT-4-turbo-2024-04-09 (Prompt),84.68%,90.98%,89.45%,81.92%,96.75%,67.00%,82.00%,95.50%,94.00%,92.50%,99.29%,100.00%,98.57%,96.00%,80.00%,82.50%,40.42% -21,Open-Mixtral-8x22b (Prompt),84.51%,87.90%,87.77%,78.58%,93.75%,60.00%,82.00%,94.00%,89.50%,89.50%,93.57%,100.00%,87.14%,96.00%,84.00%,77.50%,57.92% -22,xLAM-8x22b-r (FC),84.44%,83.58%,87.88%,77.33%,94.00%,64.00%,74.00%,93.50%,88.00%,75.50%,95.00%,100.00%,90.00%,94.00%,90.00%,72.50%,74.17% -23,FireFunction-v2 (FC),84.28%,87.10%,87.54%,79.92%,95.75%,64.00%,80.00%,93.00%,90.50%,85.00%,96.64%,99.00%,94.29%,92.00%,84.00%,77.50%,60.00% -24,GPT-4o-mini-2024-07-18 (Prompt),84.13%,86.69%,80.84%,79.25%,93.75%,66.00%,78.00%,90.50%,89.00%,88.00%,62.86%,100.00%,25.71%,96.00%,82.00%,82.50%,87.08% -25,Hammer2.0-1.5b (FC),83.74%,83.85%,87.70%,74.42%,94.25%,65.00%,64.00%,90.50%,87.50%,83.00%,92.79%,97.00%,88.57%,92.00%,86.00%,80.00%,67.50% -26,GPT-4o-mini-2024-07-18 (FC),83.49%,84.58%,83.57%,74.33%,91.00%,64.00%,68.00%,90.00%,90.00%,84.00%,83.29%,98.00%,68.57%,92.00%,84.00%,75.00%,78.75% -27,GPT-4o-2024-08-06 (FC),83.41%,86.38%,78.91%,75.00%,91.00%,64.00%,70.00%,92.50%,92.50%,85.50%,60.14%,96.00%,24.29%,92.00%,86.00%,77.50%,89.58% -28,Gemini-1.5-Flash-001 (Prompt),82.76%,85.44%,83.59%,70.75%,84.25%,64.00%,64.00%,90.00%,91.00%,90.00%,80.36%,85.00%,75.71%,92.00%,82.00%,80.00%,68.75% -29,o1-mini-2024-09-12 (Prompt),82.69%,80.54%,82.70%,70.67%,88.00%,62.00%,62.00%,89.50%,82.00%,80.00%,89.29%,100.00%,78.57%,86.00%,78.00%,77.50%,91.25% -30,MiniCPM3-4B-FC (FC),82.49%,81.06%,87.57%,69.75%,90.25%,59.00%,60.00%,92.00%,83.00%,79.50%,89.29%,100.00%,78.57%,90.00%,86.00%,85.00%,67.92% -31,claude-3.5-haiku-20241022 (Prompt),82.31%,82.98%,84.71%,76.92%,92.75%,64.00%,74.00%,93.50%,84.00%,77.50%,97.86%,100.00%,95.71%,90.00%,76.00%,75.00%,70.00% -32,Llama-3.1-8B-Instruct (Prompt),81.81%,84.02%,86.30%,72.58%,93.75%,58.00%,66.00%,93.50%,87.00%,83.00%,83.71%,96.00%,71.43%,96.00%,88.00%,77.50%,55.00% -33,mistral-large-2407 (FC),81.81%,86.98%,84.38%,74.42%,96.25%,61.00%,66.00%,93.00%,90.50%,90.00%,75.00%,100.00%,50.00%,94.00%,86.00%,82.50%,50.83% -34,GPT-4o-2024-08-06 (Prompt),80.78%,80.88%,77.66%,65.00%,88.00%,51.00%,56.00%,85.50%,92.00%,81.00%,61.14%,98.00%,24.29%,88.00%,84.00%,77.50%,92.92% -35,mistral-large-2407 (Prompt),80.65%,90.60%,90.12%,82.92%,96.75%,66.00%,86.00%,97.00%,92.00%,90.50%,100.00%,100.00%,100.00%,94.00%,84.00%,82.50%,2.92% -36,Gemini-1.5-Flash-002 (Prompt),80.29%,79.69%,80.64%,74.25%,94.75%,60.00%,68.00%,91.50%,86.00%,67.00%,93.57%,100.00%,87.14%,92.00%,82.00%,55.00%,81.25% -37,Claude-3-Opus-20240229 (Prompt),79.86%,85.02%,86.32%,79.08%,96.25%,65.00%,76.00%,95.00%,85.50%,80.50%,99.29%,100.00%,98.57%,90.00%,86.00%,70.00%,33.33% -38,Command-R-Plus (Prompt) (Original),79.64%,78.79%,84.68%,71.67%,90.00%,61.00%,64.00%,88.50%,82.00%,73.00%,93.21%,95.00%,91.43%,92.00%,76.00%,77.50%,62.92% -39,Llama-3.2-3B-Instruct (Prompt),79.46%,79.98%,83.70%,74.42%,92.25%,57.00%,74.00%,92.00%,79.50%,74.00%,87.29%,96.00%,78.57%,92.00%,78.00%,77.50%,60.42% -40,xLAM-7b-r (FC),78.92%,80.81%,79.88%,74.25%,90.75%,62.00%,70.00%,95.50%,80.50%,73.00%,74.00%,98.00%,50.00%,96.00%,82.00%,67.50%,67.50% -41,Gemini-1.5-Flash-002 (FC),78.91%,81.21%,73.21%,65.83%,86.50%,57.00%,54.00%,91.50%,80.00%,87.50%,68.86%,72.00%,65.71%,90.00%,54.00%,80.00%,92.50% -42,palmyra-x-004 (FC),78.82%,70.23%,87.54%,71.42%,96.25%,58.00%,60.00%,31.00%,90.50%,88.00%,97.14%,100.00%,94.29%,88.00%,80.00%,85.00%,78.33% -43,Open-Mistral-Nemo-2407 (FC),78.75%,82.44%,77.66%,64.75%,92.25%,34.00%,68.00%,93.00%,87.50%,84.50%,56.14%,98.00%,14.29%,94.00%,88.00%,72.50%,68.33% -44,Mistral-Medium-2312 (Prompt),78.58%,73.04%,81.57%,70.17%,91.50%,57.00%,62.00%,88.50%,68.50%,65.00%,93.29%,98.00%,88.57%,86.00%,72.00%,75.00%,88.75% -45,GPT-3.5-Turbo-0125 (FC),78.15%,83.81%,83.79%,74.25%,94.75%,62.00%,66.00%,93.00%,88.50%,79.50%,96.14%,98.00%,94.29%,88.00%,86.00%,65.00%,32.92% -46,Qwen2.5-1.5B-Instruct (Prompt),78.03%,73.60%,85.61%,70.92%,88.75%,54.00%,70.00%,86.50%,70.00%,67.00%,80.43%,98.00%,62.86%,94.00%,88.00%,80.00%,65.42% -47,Open-Mistral-Nemo-2407 (Prompt),78.00%,85.29%,89.07%,77.17%,92.50%,59.00%,80.00%,92.50%,87.00%,84.50%,93.79%,99.00%,88.57%,92.00%,88.00%,82.50%,4.58% -48,Command-R-Plus (FC) (Original),77.28%,78.58%,80.71%,68.83%,87.50%,61.00%,58.00%,91.50%,83.50%,70.50%,90.86%,96.00%,85.71%,90.00%,82.00%,60.00%,58.33% -49,Gemini-1.5-Flash-001 (FC),76.45%,77.42%,74.80%,65.17%,93.50%,56.00%,46.00%,94.50%,73.50%,76.50%,62.21%,93.00%,31.43%,88.00%,74.00%,75.00%,79.17% -50,Claude-3.5-Sonnet-20241022 (Prompt),75.78%,72.90%,80.00%,80.58%,93.75%,68.00%,80.00%,92.00%,73.00%,46.00%,100.00%,100.00%,100.00%,92.00%,68.00%,60.00%,70.42% -51,Hermes-2-Pro-Llama-3-8B (FC),74.37%,76.42%,76.23%,64.17%,90.50%,56.00%,46.00%,89.50%,79.50%,72.50%,70.43%,98.00%,42.86%,94.00%,78.00%,62.50%,58.75% -52,Qwen2-7B-Instruct (Prompt),72.71%,75.85%,76.80%,67.92%,83.75%,58.00%,62.00%,88.00%,74.00%,73.50%,80.21%,89.00%,71.43%,84.00%,78.00%,65.00%,43.75% -53,xLAM-8x7b-r (FC),71.03%,67.33%,74.05%,73.33%,91.00%,59.00%,70.00%,90.00%,68.50%,37.50%,89.21%,97.00%,81.43%,90.00%,72.00%,45.00%,73.75% -54,GPT-3.5-Turbo-0125 (Prompt),70.75%,72.75%,70.39%,77.50%,95.50%,61.00%,76.00%,92.50%,66.50%,54.50%,57.57%,98.00%,17.14%,90.00%,74.00%,60.00%,64.17% -55,Hermes-2-Pro-Mistral-7B (FC),68.94%,72.67%,76.00%,60.67%,86.00%,56.00%,40.00%,87.00%,78.50%,64.50%,61.00%,92.00%,30.00%,94.00%,84.00%,65.00%,25.83% -56,Hammer2.0-0.5b (FC),68.61%,67.19%,70.11%,63.25%,82.75%,53.00%,54.00%,80.50%,67.00%,58.00%,53.93%,95.00%,12.86%,84.00%,80.00%,62.50%,68.33% -57,Open-Mixtral-8x7b (Prompt),66.21%,63.33%,69.61%,64.83%,89.50%,51.00%,54.00%,86.00%,58.50%,44.00%,77.93%,93.00%,62.86%,86.00%,62.00%,52.50%,64.17% -58,xLAM-7b-fc-r (FC),63.62%,70.33%,60.63%,76.83%,93.50%,65.00%,72.00%,94.00%,72.00%,38.50%,84.50%,99.00%,70.00%,92.00%,56.00%,10.00%,48.75% -59,DBRX-Instruct (Prompt),62.36%,60.75%,69.14%,73.50%,92.50%,56.00%,72.00%,92.00%,40.00%,37.50%,90.07%,93.00%,87.14%,88.00%,46.00%,52.50%,41.67% -60,Gemini-1.0-Pro-002 (FC),61.84%,56.19%,64.93%,66.75%,94.25%,52.00%,54.00%,92.50%,39.00%,26.50%,87.21%,93.00%,81.43%,86.00%,64.00%,22.50%,72.08% -61,Claude-3-Opus-20240229 (FC),60.07%,55.58%,59.46%,67.83%,88.50%,59.00%,56.00%,89.50%,37.00%,28.00%,80.36%,95.00%,65.71%,88.00%,42.00%,27.50%,80.42% -62,MiniCPM3-4B (Prompt),59.24%,65.73%,50.59%,63.42%,84.25%,48.00%,58.00%,73.50%,63.00%,63.00%,40.36%,35.00%,45.71%,34.00%,48.00%,80.00%,67.92% -63,Mistral-small-2402 (FC),58.96%,57.77%,53.84%,67.58%,91.75%,59.00%,52.00%,94.00%,22.50%,47.00%,87.36%,99.00%,75.71%,92.00%,16.00%,20.00%,84.17% -64,Open-Mixtral-8x22b (FC),58.82%,61.42%,63.64%,71.67%,93.00%,66.00%,56.00%,94.00%,10.50%,69.50%,83.57%,100.00%,67.14%,94.00%,22.00%,55.00%,29.17% -65,Gemini-1.0-Pro-002 (Prompt),57.10%,58.40%,56.32%,47.58%,62.75%,26.00%,54.00%,60.50%,66.50%,59.00%,49.79%,61.00%,38.57%,68.00%,60.00%,47.50%,55.00% -66,Nexusflow-Raven-v2 (FC),55.71%,46.12%,59.11%,57.50%,37.50%,63.00%,72.00%,53.00%,34.50%,39.50%,47.93%,83.00%,12.86%,86.00%,40.00%,62.50%,80.42% -67,Meta-Llama-3-8B-Instruct (Prompt),54.23%,60.79%,58.93%,62.67%,87.00%,47.00%,54.00%,83.00%,49.00%,48.50%,47.71%,84.00%,11.43%,86.00%,42.00%,60.00%,9.17% -68,Claude-3-Haiku-20240307 (Prompt),54.22%,57.52%,55.62%,77.08%,96.25%,63.00%,72.00%,91.50%,38.50%,23.00%,94.00%,98.00%,90.00%,90.00%,6.00%,32.50%,35.42% -69,Claude-3.5-Sonnet-20241022 (FC),49.66%,45.92%,47.89%,77.67%,94.00%,65.00%,74.00%,95.00%,6.50%,4.50%,97.57%,98.00%,97.14%,90.00%,4.00%,0.00%,71.67% -70,Qwen2-1.5B-Instruct (Prompt),48.40%,54.52%,52.39%,51.08%,79.25%,38.00%,36.00%,78.00%,46.50%,42.50%,46.57%,76.00%,17.14%,76.00%,52.00%,35.00%,7.92% -71,FireFunction-v1 (FC),47.07%,42.90%,44.57%,80.08%,92.25%,66.00%,82.00%,91.50%,0.00%,0.00%,88.29%,98.00%,78.57%,90.00%,0.00%,0.00%,73.75% -72,GLM-4-9b-Chat (FC),46.55%,36.65%,46.00%,65.08%,86.25%,55.00%,54.00%,81.50%,0.00%,0.00%,94.00%,98.00%,90.00%,90.00%,0.00%,0.00%,88.33% -73,Llama-3.1-8B-Instruct (FC),43.78%,47.92%,50.18%,55.67%,51.00%,56.00%,60.00%,54.00%,47.00%,35.00%,58.71%,66.00%,51.43%,58.00%,54.00%,30.00%,1.67% -74,Claude-3-Haiku-20240307 (FC),42.95%,42.40%,48.41%,74.08%,95.25%,61.00%,66.00%,93.50%,2.00%,0.00%,91.64%,99.00%,84.29%,96.00%,6.00%,0.00%,23.33% -75,xLAM-1b-fc-r (FC),37.71%,40.96%,42.95%,71.83%,83.50%,62.00%,70.00%,85.00%,5.50%,1.50%,77.79%,97.00%,58.57%,90.00%,4.00%,0.00%,3.75% -76,Mistral-Small-2402 (Prompt),34.32%,27.06%,30.36%,23.25%,69.75%,0.00%,0.00%,74.00%,8.50%,2.50%,52.93%,43.00%,62.86%,64.00%,2.00%,2.50%,79.17% -77,Llama-3.1-70B-Instruct (FC),31.45%,25.08%,31.62%,48.83%,24.50%,58.00%,64.00%,24.50%,12.50%,14.50%,53.00%,36.00%,70.00%,36.00%,30.00%,7.50%,56.25% -78,Llama-3.2-1B-Instruct (Prompt),30.03%,27.60%,25.27%,29.42%,53.25%,13.00%,22.00%,33.50%,32.50%,15.00%,34.07%,61.00%,7.14%,28.00%,34.00%,5.00%,58.75% -79,DeepSeek-Coder-V2-Lite-Instruct (FC),27.69%,4.75%,33.18%,0.00%,0.00%,0.00%,0.00%,2.00%,3.50%,13.50%,17.71%,24.00%,11.43%,42.00%,28.00%,45.00%,97.50% -80,Gemma-2-2b-it (Prompt),23.23%,16.90%,19.12%,15.08%,35.25%,4.00%,6.00%,52.00%,0.00%,0.50%,22.50%,45.00%,0.00%,54.00%,0.00%,0.00%,65.00% \ No newline at end of file +1,Qwen2.5-72B-Instruct (Prompt),90.63%,90.81%,92.70%,80.25%,98.75%,62.00%,80.00%,97.50%,93.50%,92.00%,99.29%,100.00%,98.57%,94.00%,90.00%,87.50%,81.67% +2,Functionary-Medium-v3.1 (FC),89.93%,89.88%,91.32%,76.00%,96.00%,64.00%,68.00%,97.00%,95.00%,91.50%,99.29%,100.00%,98.57%,94.00%,92.00%,80.00%,84.58% +3,Gemini-1.5-Pro-002 (Prompt),89.10%,88.58%,91.27%,78.33%,95.00%,64.00%,76.00%,93.50%,92.50%,90.00%,98.57%,100.00%,97.14%,94.00%,90.00%,82.50%,82.50% +4,ToolACE-8B (FC),88.93%,87.54%,89.21%,76.67%,91.00%,65.00%,74.00%,93.50%,90.50%,89.50%,97.36%,99.00%,95.71%,94.00%,88.00%,77.50%,93.33% +5,gpt-4o-2024-11-20 (Prompt),88.79%,88.10%,89.38%,79.42%,96.25%,66.00%,76.00%,95.50%,94.00%,83.50%,100.00%,100.00%,100.00%,94.00%,86.00%,77.50%,89.17% +6,GoGoAgent,88.63%,86.23%,89.86%,75.42%,95.25%,63.00%,68.00%,93.00%,92.00%,84.50%,95.43%,98.00%,92.86%,96.00%,88.00%,80.00%,93.33% +7,DeepSeek-Coder-V2 (FC),88.54%,89.44%,91.23%,78.75%,96.25%,64.00%,76.00%,94.50%,93.50%,91.00%,96.43%,100.00%,92.86%,94.00%,92.00%,82.50%,74.17% +8,watt-tool-8B (FC),88.32%,86.56%,89.34%,76.75%,93.25%,63.00%,74.00%,95.00%,94.00%,80.50%,97.86%,100.00%,95.71%,94.00%,88.00%,77.50%,91.25% +9,gpt-4o-2024-11-20 (FC),88.08%,87.42%,89.20%,77.17%,91.50%,64.00%,76.00%,93.50%,93.00%,86.00%,88.29%,98.00%,78.57%,92.00%,94.00%,82.50%,86.25% +10,Llama-3.1-70B-Instruct (Prompt),87.82%,89.98%,90.12%,77.92%,95.75%,62.00%,76.00%,96.00%,94.50%,91.50%,94.00%,98.00%,90.00%,98.00%,86.00%,82.50%,70.00% +11,Gemma-2-27b-it (Prompt),87.09%,88.94%,89.09%,79.75%,94.25%,63.00%,82.00%,92.50%,91.50%,92.00%,87.86%,100.00%,75.71%,98.00%,88.00%,82.50%,71.67% +12,Qwen2.5-32B-Instruct (Prompt),87.03%,85.81%,89.79%,70.25%,96.75%,52.00%,62.00%,94.50%,90.50%,88.00%,96.64%,99.00%,94.29%,90.00%,90.00%,82.50%,80.83% +13,Hammer2.1-7b (FC),86.88%,88.65%,85.48%,78.08%,96.25%,66.00%,72.00%,95.00%,93.50%,88.00%,86.43%,100.00%,72.86%,92.00%,86.00%,77.50%,85.42% +14,Qwen2.5-14B-Instruct (Prompt),86.64%,85.69%,88.84%,73.25%,95.75%,56.00%,68.00%,92.50%,92.00%,85.00%,92.36%,99.00%,85.71%,90.00%,88.00%,85.00%,81.67% +15,watt-tool-70B (FC),86.44%,84.06%,89.39%,78.75%,98.25%,64.00%,74.00%,94.00%,85.50%,78.00%,98.57%,100.00%,97.14%,94.00%,90.00%,75.00%,84.17% +16,Gemini-1.5-Pro-001 (FC),86.01%,84.33%,87.95%,69.83%,92.50%,55.00%,62.00%,93.00%,92.00%,82.50%,91.79%,95.00%,88.57%,92.00%,88.00%,80.00%,85.00% +17,Qwen2.5-7B-Instruct (Prompt),86.00%,86.46%,88.29%,75.33%,96.00%,60.00%,70.00%,94.50%,91.50%,84.50%,92.14%,100.00%,84.29%,90.00%,86.00%,85.00%,75.00% +18,Gemini-1.5-Pro-001 (Prompt),85.82%,85.56%,85.77%,75.25%,93.75%,60.00%,72.00%,91.50%,91.50%,84.00%,91.57%,96.00%,87.14%,90.00%,84.00%,77.50%,87.08% +19,Hammer2.1-3b (FC),85.79%,86.85%,84.09%,81.42%,95.25%,67.00%,82.00%,95.00%,89.50%,81.50%,82.86%,100.00%,65.71%,92.00%,84.00%,77.50%,88.33% +20,Functionary-Small-v3.1 (FC),85.61%,86.75%,87.12%,74.00%,96.00%,62.00%,64.00%,94.50%,90.50%,88.00%,89.50%,99.00%,80.00%,94.00%,90.00%,75.00%,75.00% +21,Gemma-2-9b-it (Prompt),85.18%,85.29%,87.52%,75.67%,93.00%,60.00%,74.00%,90.50%,88.50%,86.50%,88.07%,99.00%,77.14%,94.00%,88.00%,80.00%,75.42% +22,GPT-4-turbo-2024-04-09 (FC),85.02%,84.73%,85.21%,70.42%,92.25%,59.00%,60.00%,91.00%,90.00%,87.50%,87.36%,99.00%,75.71%,90.00%,86.00%,77.50%,85.42% +23,Gemini-1.5-Pro-002 (FC),85.01%,87.29%,84.61%,73.17%,93.50%,58.00%,68.00%,95.00%,91.50%,89.50%,75.93%,99.00%,52.86%,94.00%,86.00%,82.50%,77.50% +24,Granite-20b-FunctionCalling (FC),84.89%,82.46%,86.36%,72.83%,90.50%,66.00%,62.00%,91.50%,84.00%,81.50%,84.93%,97.00%,72.86%,92.00%,86.00%,82.50%,88.75% +25,FireFunction-v2 (FC),84.89%,88.46%,87.54%,80.33%,96.00%,65.00%,80.00%,94.00%,91.50%,88.00%,96.64%,99.00%,94.29%,92.00%,84.00%,77.50%,60.00% +26,Meta-Llama-3-70B-Instruct (Prompt),84.72%,87.81%,88.21%,76.75%,95.25%,61.00%,74.00%,95.00%,92.50%,87.00%,95.86%,96.00%,95.71%,94.00%,78.00%,85.00%,58.33% +27,DeepSeek-V3 (FC),84.66%,89.17%,83.39%,78.67%,97.00%,65.00%,74.00%,95.50%,91.00%,91.50%,62.57%,98.00%,27.14%,94.00%,92.00%,85.00%,71.67% +28,Llama-3.3-70B-Instruct (Prompt),84.64%,85.08%,90.68%,74.83%,94.50%,60.00%,70.00%,94.50%,84.00%,87.00%,95.71%,100.00%,91.43%,98.00%,84.00%,85.00%,58.75% +29,GPT-4-turbo-2024-04-09 (Prompt),84.63%,90.88%,89.45%,82.50%,96.50%,67.00%,84.00%,95.50%,93.50%,92.00%,99.29%,100.00%,98.57%,96.00%,80.00%,82.50%,40.42% +30,Open-Mixtral-8x22b (Prompt),84.56%,88.02%,87.77%,78.58%,93.75%,60.00%,82.00%,94.00%,89.50%,90.00%,93.57%,100.00%,87.14%,96.00%,84.00%,77.50%,57.92% +31,xLAM-8x22b-r (FC),84.49%,83.69%,87.88%,77.75%,95.25%,64.00%,74.00%,94.50%,86.50%,76.00%,95.00%,100.00%,90.00%,94.00%,90.00%,72.50%,74.17% +32,GPT-4o-mini-2024-07-18 (Prompt),84.17%,86.77%,80.84%,80.08%,94.25%,66.00%,80.00%,90.50%,89.50%,87.00%,62.86%,100.00%,25.71%,96.00%,82.00%,82.50%,87.08% +33,GPT-4o-mini-2024-07-18 (FC),83.76%,85.21%,83.57%,74.83%,90.50%,64.00%,70.00%,92.00%,90.00%,84.00%,83.29%,98.00%,68.57%,92.00%,84.00%,75.00%,78.75% +34,o1-2024-12-17 (Prompt),83.57%,85.67%,79.77%,72.67%,92.00%,60.00%,66.00%,93.50%,91.50%,85.00%,58.57%,100.00%,17.14%,92.00%,86.00%,82.50%,90.42% +35,Hammer2.1-1.5b (FC),83.49%,82.79%,83.39%,74.67%,90.00%,64.00%,70.00%,92.00%,84.50%,80.00%,86.57%,96.00%,77.14%,90.00%,82.00%,75.00%,86.67% +36,Gemini-1.5-Flash-001 (Prompt),82.87%,85.69%,83.59%,70.75%,84.25%,64.00%,64.00%,90.00%,91.50%,90.50%,80.36%,85.00%,75.71%,92.00%,82.00%,80.00%,68.75% +37,claude-3.5-haiku-20241022 (Prompt),82.40%,83.19%,84.71%,76.25%,92.75%,64.00%,72.00%,93.00%,84.00%,79.50%,97.86%,100.00%,95.71%,90.00%,76.00%,75.00%,70.00% +38,MiniCPM3-4B-FC (FC),82.39%,80.83%,87.57%,69.83%,90.50%,59.00%,60.00%,91.50%,82.50%,79.50%,89.29%,100.00%,78.57%,90.00%,86.00%,85.00%,67.92% +39,Command R7B (FC),82.29%,81.67%,84.02%,68.17%,92.50%,56.00%,56.00%,91.50%,85.50%,81.50%,87.07%,97.00%,77.14%,92.00%,82.00%,75.00%,77.92% +40,o1-mini-2024-09-12 (Prompt),81.97%,78.92%,82.70%,71.17%,87.50%,62.00%,64.00%,89.00%,83.50%,72.00%,89.29%,100.00%,78.57%,86.00%,78.00%,77.50%,91.25% +41,Llama-3.1-8B-Instruct (Prompt),81.89%,84.21%,86.30%,72.83%,93.50%,59.00%,66.00%,93.50%,87.00%,83.50%,83.71%,96.00%,71.43%,96.00%,88.00%,77.50%,55.00% +42,mistral-large-2407 (FC),81.73%,86.81%,84.38%,74.25%,95.75%,61.00%,66.00%,92.50%,90.00%,90.50%,75.00%,100.00%,50.00%,94.00%,86.00%,82.50%,50.83% +43,Gemini-1.5-Flash-002 (Prompt),81.16%,81.65%,80.64%,73.58%,94.75%,60.00%,66.00%,91.50%,90.00%,71.50%,93.57%,100.00%,87.14%,92.00%,82.00%,55.00%,81.25% +44,mistral-large-2407 (Prompt),80.62%,90.54%,90.12%,82.17%,96.50%,66.00%,84.00%,97.00%,92.50%,90.50%,100.00%,100.00%,100.00%,94.00%,84.00%,82.50%,2.92% +45,Claude-3-Opus-20240229 (Prompt),79.99%,85.31%,86.32%,79.75%,96.25%,65.00%,78.00%,95.00%,85.50%,81.00%,99.29%,100.00%,98.57%,90.00%,86.00%,70.00%,33.33% +46,Llama-3.2-3B-Instruct (Prompt),79.72%,80.56%,83.70%,73.75%,92.25%,57.00%,72.00%,92.00%,80.50%,76.00%,87.29%,96.00%,78.57%,92.00%,78.00%,77.50%,60.42% +47,Qwen2.5-3B-Instruct (Prompt),79.22%,80.79%,81.71%,74.17%,91.50%,59.00%,72.00%,90.50%,79.50%,79.00%,80.86%,96.00%,65.71%,86.00%,80.00%,80.00%,62.92% +48,Gemini-1.5-Flash-002 (FC),79.15%,81.75%,73.21%,65.50%,87.50%,57.00%,52.00%,91.50%,80.50%,89.50%,68.86%,72.00%,65.71%,90.00%,54.00%,80.00%,92.50% +49,xLAM-7b-r (FC),79.03%,81.06%,79.88%,74.25%,90.75%,62.00%,70.00%,95.50%,81.00%,73.50%,74.00%,98.00%,50.00%,96.00%,82.00%,67.50%,67.50% +50,Ministral-8B-Instruct-2410 (FC),79.01%,83.83%,79.57%,71.83%,93.50%,60.00%,62.00%,91.50%,84.50%,87.50%,71.29%,94.00%,48.57%,86.00%,86.00%,75.00%,57.50% +51,Mistral-Medium-2312 (Prompt),78.62%,73.12%,81.57%,69.50%,91.50%,57.00%,60.00%,88.50%,69.00%,65.50%,93.29%,98.00%,88.57%,86.00%,72.00%,75.00%,88.75% +52,Open-Mistral-Nemo-2407 (FC),78.60%,82.10%,77.66%,64.42%,91.25%,34.00%,68.00%,93.50%,85.50%,85.00%,56.14%,98.00%,14.29%,94.00%,88.00%,72.50%,68.33% +53,Open-Mistral-Nemo-2407 (Prompt),78.37%,86.12%,89.07%,77.00%,92.00%,59.00%,80.00%,93.50%,89.50%,84.50%,93.79%,99.00%,88.57%,92.00%,88.00%,82.50%,4.58% +54,GPT-3.5-Turbo-0125 (FC),78.20%,83.94%,83.79%,74.25%,94.75%,62.00%,66.00%,93.50%,89.00%,79.00%,96.14%,98.00%,94.29%,88.00%,86.00%,65.00%,32.92% +55,Qwen2.5-1.5B-Instruct (Prompt),77.93%,73.37%,85.61%,71.00%,89.00%,54.00%,70.00%,86.00%,70.00%,66.50%,80.43%,98.00%,62.86%,94.00%,88.00%,80.00%,65.42% +56,Gemini-1.5-Flash-001 (FC),76.51%,77.54%,74.80%,65.17%,93.50%,56.00%,46.00%,94.50%,73.00%,77.50%,62.21%,93.00%,31.43%,88.00%,74.00%,75.00%,79.17% +57,Command-R-Plus (FC),75.93%,77.02%,81.21%,72.08%,87.25%,59.00%,70.00%,89.50%,82.50%,64.00%,90.86%,96.00%,85.71%,90.00%,84.00%,60.00%,50.42% +58,Claude-3.5-Sonnet-20241022 (Prompt),75.59%,72.48%,80.00%,81.42%,94.25%,68.00%,82.00%,92.00%,70.50%,46.00%,100.00%,100.00%,100.00%,92.00%,68.00%,60.00%,70.42% +59,Hermes-2-Pro-Llama-3-8B (FC),74.54%,76.79%,76.23%,64.17%,90.50%,56.00%,46.00%,89.50%,80.00%,73.50%,70.43%,98.00%,42.86%,94.00%,78.00%,62.50%,58.75% +60,Qwen2-7B-Instruct (Prompt),73.06%,76.65%,76.80%,68.08%,84.25%,58.00%,62.00%,88.00%,75.50%,75.00%,80.21%,89.00%,71.43%,84.00%,78.00%,65.00%,43.75% +61,xLAM-8x7b-r (FC),71.17%,67.65%,74.05%,73.58%,91.75%,59.00%,70.00%,90.00%,69.00%,38.00%,89.21%,97.00%,81.43%,90.00%,72.00%,45.00%,73.75% +62,GPT-3.5-Turbo-0125 (Prompt),70.79%,72.85%,70.39%,77.92%,96.75%,61.00%,76.00%,93.50%,67.00%,53.00%,57.57%,98.00%,17.14%,90.00%,74.00%,60.00%,64.17% +63,Hammer2.1-0.5b (FC),70.70%,69.12%,70.46%,68.00%,84.00%,62.00%,58.00%,83.00%,71.50%,54.00%,68.36%,91.00%,45.71%,84.00%,82.00%,47.50%,77.92% +64,Hermes-2-Pro-Mistral-7B (FC),69.12%,73.06%,76.00%,60.75%,86.25%,56.00%,40.00%,87.50%,78.50%,65.50%,61.00%,92.00%,30.00%,94.00%,84.00%,65.00%,25.83% +65,Open-Mixtral-8x7b (Prompt),66.33%,63.58%,69.61%,64.83%,89.50%,51.00%,54.00%,86.00%,59.00%,44.50%,77.93%,93.00%,62.86%,86.00%,62.00%,52.50%,64.17% +66,xLAM-7b-fc-r (FC),64.40%,72.08%,60.63%,76.83%,93.50%,65.00%,72.00%,93.50%,77.00%,41.00%,84.50%,99.00%,70.00%,92.00%,56.00%,10.00%,48.75% +67,DBRX-Instruct (Prompt),62.58%,61.25%,69.14%,73.50%,92.50%,56.00%,72.00%,92.00%,42.50%,37.00%,90.07%,93.00%,87.14%,88.00%,46.00%,52.50%,41.67% +68,Gemini-1.0-Pro-002 (FC),62.04%,56.65%,64.93%,66.58%,93.75%,52.00%,54.00%,95.00%,40.00%,25.00%,87.21%,93.00%,81.43%,86.00%,64.00%,22.50%,72.08% +69,Claude-3-Opus-20240229 (FC),61.10%,57.92%,59.46%,67.17%,88.50%,59.00%,54.00%,93.00%,39.50%,32.00%,80.36%,95.00%,65.71%,88.00%,42.00%,27.50%,80.42% +70,Mistral-small-2402 (FC),59.57%,59.15%,53.84%,67.58%,91.75%,59.00%,52.00%,94.00%,24.50%,50.50%,87.36%,99.00%,75.71%,92.00%,16.00%,20.00%,84.17% +71,MiniCPM3-4B (Prompt),59.31%,65.88%,50.59%,63.50%,84.50%,48.00%,58.00%,72.50%,65.50%,62.00%,40.36%,35.00%,45.71%,34.00%,48.00%,80.00%,67.92% +72,Open-Mixtral-8x22b (FC),58.93%,61.67%,63.64%,71.67%,93.00%,66.00%,56.00%,94.00%,10.50%,70.50%,83.57%,100.00%,67.14%,94.00%,22.00%,55.00%,29.17% +73,Gemini-1.0-Pro-002 (Prompt),56.62%,57.31%,56.32%,46.25%,58.75%,26.00%,54.00%,56.50%,63.50%,63.00%,49.79%,61.00%,38.57%,68.00%,60.00%,47.50%,55.00% +74,Nexusflow-Raven-v2 (FC),55.59%,45.88%,59.11%,57.50%,37.50%,63.00%,72.00%,53.00%,34.00%,39.00%,47.93%,83.00%,12.86%,86.00%,40.00%,62.50%,80.42% +75,Meta-Llama-3-8B-Instruct (Prompt),54.23%,60.79%,58.93%,62.67%,87.00%,47.00%,54.00%,82.50%,48.00%,50.00%,47.71%,84.00%,11.43%,86.00%,42.00%,60.00%,9.17% +76,Qwen2.5-0.5B-Instruct (Prompt),52.58%,53.19%,61.89%,58.25%,76.75%,44.00%,54.00%,68.00%,53.50%,33.00%,63.07%,89.00%,37.14%,70.00%,62.00%,52.50%,12.92% +77,Claude-3.5-Sonnet-20241022 (FC),49.44%,45.44%,47.89%,78.75%,95.25%,65.00%,76.00%,94.50%,3.50%,5.00%,97.57%,98.00%,97.14%,90.00%,4.00%,0.00%,71.67% +78,Qwen2-1.5B-Instruct (Prompt),48.29%,54.29%,52.39%,51.17%,79.50%,38.00%,36.00%,79.00%,46.50%,40.50%,46.57%,76.00%,17.14%,76.00%,52.00%,35.00%,7.92% +79,claude-3.5-haiku-20241022 (FC),47.43%,40.62%,50.46%,68.00%,96.00%,56.00%,52.00%,92.00%,2.50%,0.00%,87.86%,100.00%,75.71%,90.00%,24.00%,0.00%,62.50% +80,FireFunction-v1 (FC),47.12%,43.00%,44.57%,80.00%,92.00%,66.00%,82.00%,92.00%,0.00%,0.00%,88.29%,98.00%,78.57%,90.00%,0.00%,0.00%,73.75% +81,GLM-4-9b-Chat (FC),46.56%,36.67%,46.00%,65.17%,86.50%,55.00%,54.00%,81.50%,0.00%,0.00%,94.00%,98.00%,90.00%,90.00%,0.00%,0.00%,88.33% +82,o1-2024-12-17 (FC),44.46%,40.23%,38.66%,67.92%,93.75%,56.00%,54.00%,93.00%,0.00%,0.00%,60.64%,97.00%,24.29%,94.00%,0.00%,0.00%,84.58% +83,Llama-3.1-8B-Instruct (FC),43.91%,48.21%,50.18%,55.83%,50.50%,57.00%,60.00%,54.00%,48.50%,34.50%,58.71%,66.00%,51.43%,58.00%,54.00%,30.00%,1.67% +84,xLAM-1b-fc-r (FC),37.80%,41.17%,42.95%,71.67%,83.00%,62.00%,70.00%,86.00%,5.00%,2.00%,77.79%,97.00%,58.57%,90.00%,4.00%,0.00%,3.75% +85,Mistral-Small-2402 (Prompt),34.26%,26.94%,30.36%,23.25%,69.75%,0.00%,0.00%,74.00%,8.50%,2.00%,52.93%,43.00%,62.86%,64.00%,2.00%,2.50%,79.17% +86,Llama-3.1-70B-Instruct (FC),31.55%,25.29%,31.62%,49.17%,24.50%,59.00%,64.00%,24.50%,12.50%,15.00%,53.00%,36.00%,70.00%,36.00%,30.00%,7.50%,56.25% +87,Llama-3.2-1B-Instruct (Prompt),30.40%,28.44%,25.27%,29.25%,52.75%,13.00%,22.00%,33.50%,36.00%,15.00%,34.07%,61.00%,7.14%,28.00%,34.00%,5.00%,58.75% +88,DeepSeek-Coder-V2-Lite-Instruct (FC),27.75%,4.88%,33.18%,0.00%,0.00%,0.00%,0.00%,1.50%,3.50%,14.50%,17.71%,24.00%,11.43%,42.00%,28.00%,45.00%,97.50% +89,Gemma-2-2b-it (Prompt),23.32%,17.10%,19.12%,15.42%,36.25%,4.00%,6.00%,52.00%,0.00%,1.00%,22.50%,45.00%,0.00%,54.00%,0.00%,0.00%,65.00% \ No newline at end of file diff --git a/data_overall.csv b/data_overall.csv index 4326ddd7d..3c56a5c90 100644 --- a/data_overall.csv +++ b/data_overall.csv @@ -1,81 +1,90 @@ Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s),Latency Standard Deviation (s),Latency 95th Percentile (s),Non-Live AST Acc,Non-Live Simple AST,Non-Live Multiple AST,Non-Live Parallel AST,Non-Live Parallel Multiple AST,Non-Live Exec Acc,Non-Live Simple Exec,Non-Live Multiple Exec,Non-Live Parallel Exec,Non-Live Parallel Multiple Exec,Live Acc,Live Simple AST,Live Multiple AST,Live Parallel AST,Live Parallel Multiple AST,Multi Turn Acc,Multi Turn Base,Multi Turn Miss Func,Multi Turn Miss Param,Multi Turn Long Context,Relevance Detection,Irrelevance Detection,Organization,License -1,67.54%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,33.22,2.52,6.3,5.13,84.56%,69.75%,91.00%,91.00%,86.50%,85.21%,87.36%,90.00%,86.00%,77.50%,79.56%,81.01%,77.59%,81.25%,66.67%,38.12%,54.00%,13.50%,35.50%,49.50%,70.59%,83.69%,OpenAI,Proprietary -2,67.28%,GPT-4o-2024-08-06 (FC),https://openai.com/index/hello-gpt-4o/,8.22,1.77,6.71,3.99,86.38%,75.00%,92.50%,92.50%,85.50%,78.91%,60.14%,92.00%,86.00%,77.50%,79.29%,76.36%,76.07%,81.25%,66.67%,39.12%,58.00%,10.00%,37.00%,51.50%,70.59%,87.03%,OpenAI,Proprietary -3,66.29%,GPT-4o-2024-08-06 (Prompt),https://openai.com/index/hello-gpt-4o/,12.8,1.45,9.21,2.57,80.88%,65.00%,85.50%,92.00%,81.00%,77.66%,61.14%,88.00%,84.00%,77.50%,80.84%,78.68%,72.46%,100.00%,75.00%,37.25%,44.00%,31.50%,29.50%,44.00%,52.94%,92.38%,OpenAI,Proprietary -4,66.26%,o1-preview-2024-09-12 (Prompt),https://openai.com/index/introducing-openai-o1-preview/,203.92,26.55,16.66,56.16,86.19%,76.75%,94.00%,90.00%,84.00%,88.70%,99.29%,94.00%,84.00%,77.50%,75.29%,82.17%,76.35%,81.25%,79.17%,36.88%,47.50%,38.50%,31.50%,30.00%,88.24%,75.77%,OpenAI,Proprietary -5,63.62%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.51,1.55,9.91,3.08,84.58%,74.33%,90.00%,90.00%,84.00%,83.57%,83.29%,92.00%,84.00%,75.00%,73.24%,75.19%,75.12%,87.50%,70.83%,34.12%,47.50%,19.50%,29.00%,40.50%,82.35%,74.41%,OpenAI,Proprietary -6,62.89%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,29.79,8.28,10.05,17.4,80.54%,70.67%,89.50%,82.00%,80.00%,82.70%,89.29%,86.00%,78.00%,77.50%,77.73%,72.87%,71.70%,75.00%,66.67%,28.25%,40.50%,5.00%,34.50%,33.00%,58.82%,89.16%,OpenAI,Proprietary -7,62.53%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,12.14,55.42,33.2,89.52%,76.08%,96.50%,94.50%,91.00%,91.32%,99.29%,94.00%,92.00%,80.00%,76.45%,81.78%,82.62%,68.75%,75.00%,21.38%,31.50%,21.00%,26.50%,6.50%,72.22%,76.19%,MeetKai,MIT -8,62.11%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,7.05,5.57,33.74,4.69,87.98%,78.92%,92.50%,91.50%,89.00%,91.27%,98.57%,94.00%,90.00%,82.50%,76.76%,81.01%,77.97%,93.75%,70.83%,20.75%,23.00%,19.50%,17.50%,23.00%,76.47%,78.21%,Google,Proprietary -9,61.11%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.84,1.34,6.87,2.39,86.69%,79.25%,90.50%,89.00%,88.00%,80.84%,62.86%,96.00%,82.00%,82.50%,77.20%,79.84%,76.73%,93.75%,70.83%,22.00%,33.00%,12.00%,17.00%,26.00%,82.35%,81.92%,OpenAI,Proprietary -10,61.04%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,5.39,2.17,2.59,4.12,87.40%,74.08%,94.00%,92.00%,89.50%,84.61%,75.93%,94.00%,86.00%,82.50%,76.44%,79.07%,75.50%,87.50%,75.00%,21.62%,31.00%,5.00%,21.00%,29.50%,76.47%,77.07%,Google,Proprietary -11,60.89%,Qwen2.5-72B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,N/A,4.24,8.69,10.86,90.85%,80.92%,97.50%,93.50%,91.50%,92.07%,99.29%,94.00%,90.00%,85.00%,75.03%,84.11%,81.67%,62.50%,75.00%,17.25%,23.50%,20.00%,13.50%,12.00%,94.44%,72.98%,Qwen,apache-2.0 -12,60.17%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,7.0,1.54,4.69,2.39,84.06%,74.75%,90.50%,91.00%,80.00%,85.77%,91.57%,90.00%,84.00%,77.50%,76.49%,75.58%,71.98%,93.75%,75.00%,18.88%,26.00%,5.00%,21.50%,23.00%,52.94%,84.70%,Google,Proprietary -13,59.50%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,58.87,1.29,1.36,2.7,90.98%,81.92%,95.50%,94.00%,92.50%,89.45%,99.29%,96.00%,80.00%,82.50%,63.56%,86.05%,84.24%,100.00%,79.17%,30.25%,42.50%,25.00%,20.50%,33.00%,100.00%,35.46%,OpenAI,Proprietary -14,59.00%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,5.1,1.44,1.85,2.49,83.23%,69.92%,92.00%,90.50%,80.50%,87.95%,91.79%,92.00%,88.00%,80.00%,75.47%,73.26%,70.18%,81.25%,58.33%,16.00%,24.50%,3.00%,15.50%,21.00%,58.82%,84.05%,Google,Proprietary -15,58.43%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,12.68,3.12,10.75,6.19,86.98%,74.42%,93.00%,90.50%,90.00%,84.38%,75.00%,94.00%,86.00%,82.50%,69.73%,85.66%,78.16%,68.75%,75.00%,23.75%,33.50%,18.00%,23.50%,20.00%,76.47%,52.80%,Mistral AI,Proprietary -16,58.31%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,4.6,15.67,9.25,87.29%,76.67%,94.00%,90.00%,88.50%,89.21%,97.36%,94.00%,88.00%,77.50%,78.37%,72.48%,76.54%,81.25%,70.83%,7.75%,7.50%,11.50%,5.00%,7.00%,77.78%,87.88%,Huawei Noah & USTC,Apache-2.0 -17,58.03%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,7.86,10.86,17.45,83.58%,77.33%,93.50%,88.00%,75.50%,87.88%,95.00%,94.00%,90.00%,72.50%,73.39%,83.33%,80.15%,62.50%,75.00%,16.25%,25.50%,16.00%,11.50%,12.00%,88.89%,68.21%,Salesforce,cc-by-nc-4.0 -18,56.93%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.48,0.7,1.04,1.08,85.44%,70.75%,90.00%,91.00%,90.00%,83.59%,80.36%,92.00%,82.00%,80.00%,68.53%,74.81%,75.78%,93.75%,79.17%,19.50%,27.50%,20.00%,12.00%,18.50%,82.35%,62.89%,Google,Proprietary -19,56.52%,Claude-3-Opus-20240229 (FC),https://www.anthropic.com/news/claude-3-family,20.16,9.46,9.98,17.43,55.58%,67.83%,89.50%,37.00%,28.00%,59.46%,80.36%,88.00%,42.00%,27.50%,79.24%,79.84%,77.40%,18.75%,29.17%,30.25%,41.50%,14.00%,33.50%,32.00%,76.47%,82.10%,Anthropic,Proprietary -20,56.20%,Claude-3.5-Sonnet-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,2.54,3.07,5.61,4.72,45.92%,77.67%,95.00%,6.50%,4.50%,47.89%,97.57%,90.00%,4.00%,0.00%,77.96%,82.17%,81.10%,31.25%,12.50%,41.00%,55.00%,19.00%,42.50%,47.50%,70.59%,73.70%,Anthropic,Proprietary -21,56.16%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.3,0.74,1.05,1.3,81.21%,65.83%,91.50%,80.00%,87.50%,73.21%,68.86%,90.00%,54.00%,80.00%,77.96%,71.71%,70.47%,81.25%,75.00%,11.62%,19.00%,0.50%,10.50%,16.50%,58.82%,90.81%,Google,Proprietary -22,56.10%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,35.22,60.13,142.21,86.38%,74.00%,94.00%,90.00%,87.50%,87.12%,89.50%,94.00%,90.00%,75.00%,72.99%,78.68%,77.49%,75.00%,58.33%,9.88%,17.00%,2.50%,14.00%,6.00%,83.33%,70.55%,MeetKai,MIT -23,56.00%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.46,0.8,1.07,1.38,79.69%,74.25%,91.50%,86.00%,67.00%,80.64%,93.57%,92.00%,82.00%,55.00%,75.20%,77.13%,74.26%,93.75%,58.33%,12.50%,17.50%,6.00%,11.50%,15.00%,88.24%,78.44%,Google,Proprietary -24,55.78%,palmyra-x-004 (FC),https://writer.com/engineering/actions-with-palmyra-x-004/,24.94,2.76,10.69,5.48,70.23%,71.42%,31.00%,90.50%,88.00%,87.54%,97.14%,88.00%,80.00%,85.00%,77.16%,75.19%,75.21%,50.00%,62.50%,11.37%,12.00%,2.50%,18.50%,12.50%,70.59%,79.70%,Writer,Proprietary -25,55.45%,DeepSeek-Coder-V2 (FC),https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct-0724,N/A,27.82,108.59,55.51,89.15%,78.08%,95.00%,93.50%,90.00%,91.23%,96.43%,94.00%,92.00%,82.50%,73.43%,80.62%,77.30%,50.00%,70.83%,4.50%,7.50%,3.00%,4.00%,3.50%,83.33%,70.59%,DeepSeek,DeepSeek License -26,55.28%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.31,0.58,0.69,0.84,77.42%,65.17%,94.50%,73.50%,76.50%,74.80%,62.21%,88.00%,74.00%,75.00%,75.51%,72.09%,73.31%,62.50%,58.33%,13.87%,19.00%,3.50%,14.00%,19.00%,58.82%,79.66%,Google,Proprietary -27,55.19%,Hammer2.0-7b (FC),https://huggingface.co/MadeAgents/Hammer2.0-7b,N/A,7.27,19.18,23.97,90.50%,80.50%,95.50%,94.00%,92.00%,88.62%,89.50%,94.00%,86.00%,85.00%,71.75%,75.97%,77.59%,81.25%,75.00%,5.50%,9.00%,2.00%,7.00%,4.00%,94.44%,70.57%,MadeAgents,cc-by-nc-4.0 -28,54.75%,GoGoAgent,https://gogoagent.ai,N/A,2.18,1.49,4.89,85.75%,74.50%,92.00%,89.50%,87.00%,89.86%,95.43%,96.00%,88.00%,80.00%,74.84%,74.81%,72.08%,81.25%,66.67%,1.00%,1.50%,2.00%,0.50%,0.00%,94.12%,85.61%,BitAgent,Proprietary -29,54.10%,claude-3.5-haiku-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,0.48,1.87,1.53,3.63,82.98%,76.92%,93.50%,84.00%,77.50%,84.71%,97.86%,90.00%,76.00%,75.00%,70.24%,81.01%,73.98%,87.50%,58.33%,9.75%,16.00%,0.50%,8.00%,14.50%,77.78%,66.24%,Anthropic,Proprietary -30,54.05%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,4.46,14.68,14.06,89.85%,77.92%,96.50%,94.00%,91.00%,90.12%,94.00%,98.00%,86.00%,82.50%,62.02%,77.52%,75.97%,87.50%,62.50%,12.38%,16.50%,13.00%,10.50%,9.50%,94.44%,54.84%,Meta,Meta Llama 3 Community -31,53.54%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.39,0.88,0.94,1.48,83.81%,74.25%,93.00%,88.50%,79.50%,83.79%,96.14%,88.00%,86.00%,65.00%,62.98%,77.91%,78.35%,50.00%,54.17%,19.50%,32.50%,11.50%,21.50%,12.50%,94.12%,36.53%,OpenAI,Proprietary -32,53.53%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,4.03,11.55,9.68,86.48%,75.92%,95.00%,91.00%,84.00%,88.29%,92.14%,90.00%,86.00%,85.00%,66.95%,74.81%,74.45%,62.50%,66.67%,7.62%,9.50%,8.50%,7.00%,5.50%,83.33%,65.22%,Qwen,apache-2.0 -33,53.12%,FireFunction-v2 (FC),https://huggingface.co/fireworks-ai/firefunction-v2,N/A,2.13,1.19,3.93,87.10%,79.92%,93.00%,90.50%,85.00%,87.54%,96.64%,92.00%,84.00%,77.50%,66.44%,76.74%,75.50%,56.25%,58.33%,8.62%,13.50%,7.00%,11.00%,3.00%,88.24%,56.30%,Fireworks,Apache 2.0 -34,52.75%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,12.8,25.61,24.81,80.81%,74.25%,95.50%,80.50%,73.00%,79.88%,74.00%,96.00%,82.00%,67.50%,69.35%,71.32%,74.45%,50.00%,62.50%,10.00%,16.50%,8.50%,7.50%,7.50%,94.44%,65.10%,Salesforce,cc-by-nc-4.0 -35,52.54%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,4.5,4.66,9.49,67.33%,73.33%,90.00%,68.50%,37.50%,74.05%,89.21%,90.00%,72.00%,45.00%,71.08%,72.48%,78.16%,62.50%,66.67%,15.50%,26.00%,13.00%,11.50%,11.50%,94.44%,67.88%,Salesforce,cc-by-nc-4.0 -36,52.05%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,11.07,4.91,17.8,9.97,73.04%,70.17%,88.50%,68.50%,65.00%,81.57%,93.29%,86.00%,72.00%,75.00%,77.20%,74.03%,73.69%,81.25%,54.17%,0.38%,1.00%,0.00%,0.00%,0.50%,64.71%,85.93%,Mistral AI,Proprietary -37,51.89%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,2.31,2.61,6.8,89.10%,80.42%,93.00%,91.00%,92.00%,89.09%,87.86%,98.00%,88.00%,82.50%,66.15%,83.33%,78.06%,68.75%,58.33%,2.38%,4.50%,2.00%,1.50%,1.50%,88.89%,59.13%,Google,gemma-terms-of-use -38,51.79%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,2.88,3.13,8.77,87.77%,76.58%,94.50%,92.50%,87.50%,88.21%,95.86%,94.00%,78.00%,85.00%,65.04%,81.01%,78.54%,75.00%,70.83%,5.62%,10.00%,4.00%,6.00%,2.50%,94.44%,50.82%,Meta,Meta Llama 3 Community -39,51.74%,Claude-3.5-Sonnet-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,1.23,1.78,1.33,3.22,72.90%,80.58%,92.00%,73.00%,46.00%,80.00%,100.00%,92.00%,68.00%,60.00%,71.96%,86.05%,80.44%,81.25%,45.83%,7.50%,9.00%,5.50%,5.00%,10.50%,76.47%,64.29%,Anthropic,Proprietary -40,51.59%,MiniCPM3-4B-FC (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,159.07,185.16,464.0,81.06%,69.75%,92.00%,83.00%,79.50%,87.57%,89.29%,90.00%,86.00%,85.00%,69.66%,72.87%,63.63%,37.50%,62.50%,2.62%,5.00%,1.00%,3.00%,1.50%,77.78%,72.22%,openbmb,Apache-2.0 -41,51.42%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,5.36,8.6,13.34,84.92%,75.67%,90.50%,88.00%,85.50%,87.52%,88.07%,94.00%,88.00%,80.00%,67.61%,73.26%,74.26%,56.25%,66.67%,1.62%,2.00%,4.00%,0.50%,0.00%,77.78%,66.73%,Google,gemma-terms-of-use -42,51.26%,Claude-3-Opus-20240229 (Prompt),https://www.anthropic.com/news/claude-3-family,10.48,4.41,8.27,10.61,85.02%,79.08%,95.00%,85.50%,80.50%,86.32%,99.29%,90.00%,86.00%,70.00%,66.80%,84.11%,78.73%,75.00%,54.17%,7.13%,11.50%,2.50%,6.00%,8.50%,82.35%,40.36%,Anthropic,Proprietary -43,51.01%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.18,1.75,1.94,3.61,82.44%,64.75%,93.00%,87.50%,84.50%,77.66%,56.14%,94.00%,88.00%,72.50%,65.16%,75.19%,68.28%,75.00%,70.83%,9.12%,15.00%,3.50%,9.00%,9.00%,64.71%,63.25%,Mistral AI,Proprietary -44,50.58%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,10.59,52.72,24.5,84.02%,72.58%,93.50%,87.00%,83.00%,86.30%,83.71%,96.00%,88.00%,77.50%,60.68%,73.26%,72.36%,56.25%,50.00%,9.25%,12.00%,10.00%,7.00%,8.00%,72.22%,49.10%,Meta,Meta Llama 3 Community -45,50.28%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,12.83,1.39,6.27,3.15,87.90%,78.58%,94.00%,89.50%,89.50%,87.77%,93.57%,96.00%,84.00%,77.50%,65.82%,80.62%,72.84%,81.25%,75.00%,0.50%,1.00%,0.00%,0.00%,1.00%,82.35%,55.09%,Mistral AI,Proprietary -46,50.12%,Hammer2.0-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-1.5b,N/A,3.66,6.19,8.01,83.85%,74.42%,90.50%,87.50%,83.00%,87.70%,92.79%,92.00%,86.00%,80.00%,64.86%,74.03%,68.47%,56.25%,70.83%,1.75%,2.00%,1.00%,1.50%,2.50%,83.33%,62.49%,MadeAgents,cc-by-nc-4.0 -47,49.43%,Command-R-Plus (Prompt) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,15.82,1.21,0.78,2.08,78.79%,71.67%,88.50%,82.00%,73.00%,84.68%,93.21%,92.00%,76.00%,77.50%,68.27%,75.58%,76.26%,81.25%,70.83%,0.38%,1.00%,0.00%,0.00%,0.50%,82.35%,59.46%,Cohere For AI,cc-by-nc-4.0 -48,49.19%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,1.97,1.94,5.3,82.21%,72.83%,91.00%,84.00%,81.00%,86.59%,86.36%,92.00%,88.00%,80.00%,59.22%,67.44%,55.56%,43.75%,54.17%,3.38%,6.00%,1.50%,4.50%,1.50%,88.89%,75.29%,IBM,Apache-2.0 -49,48.33%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.16,0.74,1.79,1.21,72.75%,77.50%,92.50%,66.50%,54.50%,70.39%,57.57%,90.00%,74.00%,60.00%,68.62%,78.29%,78.25%,75.00%,62.50%,5.62%,9.00%,2.00%,7.00%,4.50%,94.12%,59.01%,OpenAI,Proprietary -50,47.22%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,24.91,3.39,3.96,6.93,90.60%,82.92%,97.00%,92.00%,90.50%,90.12%,100.00%,94.00%,84.00%,82.50%,52.62%,86.05%,81.96%,93.75%,79.17%,8.38%,15.00%,6.00%,6.00%,6.50%,100.00%,4.18%,Mistral AI,Proprietary -51,47.11%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,4.14,3.98,8.7,76.42%,64.17%,89.50%,79.50%,72.50%,76.23%,70.43%,94.00%,78.00%,62.50%,64.59%,69.77%,65.53%,56.25%,50.00%,2.38%,4.50%,1.50%,2.00%,1.50%,44.44%,60.84%,NousResearch,apache-2.0 -52,46.75%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,7.85,45.67,14.72,79.98%,74.42%,92.00%,79.50%,74.00%,83.70%,87.29%,92.00%,78.00%,77.50%,55.53%,63.18%,64.39%,18.75%,45.83%,5.25%,8.50%,2.50%,4.50%,5.50%,83.33%,51.75%,Meta,Meta Llama 3 Community -53,46.54%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,2.55,6.34,4.62,73.60%,70.92%,86.50%,70.00%,67.00%,85.61%,80.43%,94.00%,88.00%,80.00%,60.46%,68.60%,58.50%,56.25%,50.00%,1.12%,1.50%,2.50%,0.50%,0.00%,77.78%,62.92%,Qwen,apache-2.0 -54,46.06%,Command-R-Plus (FC) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,5.43,2.63,7.01,3.81,78.58%,68.83%,91.50%,83.50%,70.50%,80.71%,90.86%,90.00%,82.00%,60.00%,58.89%,68.60%,61.82%,50.00%,45.83%,2.00%,3.50%,0.00%,1.50%,3.00%,100.00%,55.30%,Cohere For AI,cc-by-nc-4.0 -55,44.69%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,3.36,2.01,4.03,3.66,57.77%,67.58%,94.00%,22.50%,47.00%,53.84%,87.36%,92.00%,16.00%,20.00%,72.49%,64.34%,72.17%,12.50%,12.50%,2.62%,4.50%,0.00%,3.00%,3.00%,82.35%,80.97%,Mistral AI,Proprietary -56,44.24%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.76,1.39,2.81,3.41,56.19%,66.75%,92.50%,39.00%,26.50%,64.93%,87.21%,86.00%,64.00%,22.50%,68.00%,73.26%,65.53%,37.50%,37.50%,2.88%,4.50%,1.00%,3.50%,2.50%,76.47%,71.36%,Google,Proprietary -57,43.02%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,11.27,32.66,24.55,72.67%,60.67%,87.00%,78.50%,64.50%,76.00%,61.00%,94.00%,84.00%,65.00%,57.49%,67.44%,60.11%,50.00%,41.67%,2.63%,3.50%,4.00%,2.50%,0.50%,66.67%,38.82%,NousResearch,apache-2.0 -58,43.01%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,6.92,2.72,10.47,5.41,61.42%,71.67%,94.00%,10.50%,69.50%,63.64%,83.57%,94.00%,22.00%,55.00%,68.71%,75.19%,73.41%,6.25%,45.83%,1.50%,3.50%,0.00%,1.00%,1.50%,82.35%,45.93%,Mistral AI,Proprietary -59,42.75%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,2.74,2.05,4.47,3.63,63.33%,64.83%,86.00%,58.50%,44.00%,69.61%,77.93%,86.00%,62.00%,52.50%,60.53%,60.85%,65.05%,68.75%,50.00%,1.50%,2.50%,0.00%,1.50%,2.00%,88.24%,59.41%,Mistral AI,Proprietary -60,42.32%,Claude-3-Haiku-20240307 (FC),https://www.anthropic.com/news/claude-3-family,0.23,1.63,2.5,2.52,42.40%,74.08%,93.50%,2.00%,0.00%,48.41%,91.64%,96.00%,6.00%,0.00%,59.51%,79.07%,77.87%,0.00%,0.00%,24.50%,35.50%,11.50%,22.00%,29.00%,100.00%,28.56%,Anthropic,Proprietary -61,42.31%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.79,1.86,10.01,3.37,85.29%,77.17%,92.50%,87.00%,84.50%,89.07%,93.79%,92.00%,88.00%,82.50%,48.67%,77.13%,73.31%,87.50%,70.83%,0.25%,0.50%,0.00%,0.00%,0.50%,94.12%,6.66%,Mistral AI,Proprietary -62,42.17%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,3.43,10.0,8.0,75.85%,67.92%,88.00%,74.00%,73.50%,76.80%,80.21%,84.00%,78.00%,65.00%,50.56%,56.20%,61.73%,37.50%,66.67%,3.25%,4.00%,4.50%,2.50%,2.00%,83.33%,39.22%,Qwen,apache-2.0 -63,40.98%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,8.49,3.89,8.21,11.37,60.75%,73.50%,92.00%,40.00%,37.50%,69.14%,90.07%,88.00%,46.00%,52.50%,60.58%,77.52%,73.31%,75.00%,45.83%,0.00%,0.00%,0.00%,0.00%,0.00%,94.12%,40.79%,Databricks,Databricks Open Model -64,40.78%,Hammer2.0-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-0.5b,N/A,3.67,12.44,6.92,67.19%,63.25%,80.50%,67.00%,58.00%,70.11%,53.93%,84.00%,80.00%,62.50%,53.22%,51.94%,44.25%,56.25%,41.67%,0.50%,0.50%,0.00%,0.50%,1.00%,72.22%,66.25%,MadeAgents,cc-by-nc-4.0 -65,40.02%,Claude-3-Haiku-20240307 (Prompt),https://www.anthropic.com/news/claude-3-family,0.21,1.12,2.15,2.13,57.52%,77.08%,91.50%,38.50%,23.00%,55.62%,94.00%,90.00%,6.00%,32.50%,64.22%,77.13%,74.17%,56.25%,54.17%,1.62%,3.50%,0.00%,0.00%,3.00%,70.59%,42.14%,Anthropic,Proprietary -66,39.67%,FireFunction-v1 (FC),https://huggingface.co/fireworks-ai/firefunction-v1,N/A,2.43,4.83,3.93,42.90%,80.08%,91.50%,0.00%,0.00%,44.57%,88.29%,90.00%,0.00%,0.00%,69.56%,68.99%,71.79%,0.00%,0.00%,2.38%,5.00%,0.00%,2.00%,2.50%,94.12%,71.74%,Fireworks,Apache 2.0 -67,39.21%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,7.89,5.38,18.44,70.33%,76.83%,94.00%,72.00%,38.50%,60.63%,84.50%,92.00%,56.00%,10.00%,54.02%,78.29%,57.36%,31.25%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,77.78%,46.20%,Salesforce,cc-by-nc-4.0 -68,38.85%,GLM-4-9b-Chat (FC),https://huggingface.co/THUDM/glm-4-9b-chat,N/A,5.03,13.94,11.28,36.65%,65.08%,81.50%,0.00%,0.00%,46.00%,94.00%,90.00%,0.00%,0.00%,66.50%,71.32%,64.10%,0.00%,0.00%,3.50%,3.50%,4.00%,2.50%,4.00%,66.67%,79.65%,THUDM,glm-4 -69,38.48%,MiniCPM3-4B (Prompt),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,20.44,49.11,65.53,65.73%,63.42%,73.50%,63.00%,63.00%,50.59%,40.36%,34.00%,48.00%,80.00%,54.20%,45.35%,34.19%,43.75%,45.83%,2.00%,3.00%,3.50%,1.00%,0.50%,55.56%,74.49%,openbmb,Apache-2.0 -70,36.98%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,1.12,0.55,2.27,46.12%,57.50%,53.00%,34.50%,39.50%,59.11%,47.93%,86.00%,40.00%,62.50%,54.22%,41.47%,38.75%,56.25%,37.50%,1.00%,1.50%,0.50%,1.00%,1.00%,58.82%,78.59%,Nexusflow,Apache 2.0 -71,35.76%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,1.36,2.41,3.08,58.40%,47.58%,60.50%,66.50%,59.00%,56.32%,49.79%,68.00%,60.00%,47.50%,48.80%,48.06%,46.53%,62.50%,37.50%,1.38%,2.50%,1.50%,0.50%,1.00%,82.35%,53.07%,Google,Proprietary -72,34.24%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.29,8.88,20.8,60.79%,62.67%,83.00%,49.00%,48.50%,58.93%,47.71%,86.00%,42.00%,60.00%,47.76%,59.30%,61.73%,37.50%,33.33%,0.75%,1.50%,0.00%,1.00%,0.50%,77.78%,18.42%,Meta,Meta Llama 3 Community -73,31.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.91,1.91,0.85,3.48,27.06%,23.25%,74.00%,8.50%,2.50%,30.36%,52.93%,64.00%,2.00%,2.50%,58.18%,34.50%,64.20%,0.00%,4.17%,0.75%,0.50%,0.00%,1.50%,1.00%,58.82%,69.80%,Mistral AI,Proprietary -74,29.08%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,3.13,12.5,5.47,54.52%,51.08%,78.00%,46.50%,42.50%,52.39%,46.57%,76.00%,52.00%,35.00%,38.34%,47.67%,39.41%,18.75%,25.00%,0.50%,0.50%,1.00%,0.00%,0.50%,83.33%,21.02%,Qwen,apache-2.0 -75,27.45%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,6.23,16.77,14.39,47.92%,55.67%,54.00%,47.00%,35.00%,50.18%,58.71%,58.00%,54.00%,30.00%,33.19%,50.00%,48.62%,37.50%,37.50%,5.38%,5.00%,7.50%,5.00%,4.00%,94.44%,5.03%,Meta,Meta Llama 3 Community -76,27.20%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,4.67,11.84,10.63,25.08%,48.83%,24.50%,12.50%,14.50%,31.62%,53.00%,36.00%,30.00%,7.50%,45.27%,51.94%,52.90%,31.25%,25.00%,4.88%,7.00%,4.00%,4.50%,4.00%,100.00%,45.08%,Meta,Meta Llama 3 Community -77,25.12%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,6.65,14.46,13.9,40.96%,71.83%,85.00%,5.50%,1.50%,42.95%,77.79%,90.00%,4.00%,0.00%,37.54%,65.89%,53.56%,0.00%,0.00%,0.12%,0.50%,0.00%,0.00%,0.00%,100.00%,7.15%,Salesforce,cc-by-nc-4.0 -78,22.48%,DeepSeek-Coder-V2-Lite-Instruct (FC),https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,N/A,16.78,38.81,52.93,4.75%,0.00%,2.00%,3.50%,13.50%,33.18%,17.71%,42.00%,28.00%,45.00%,39.63%,1.94%,3.70%,6.25%,12.50%,0.12%,0.50%,0.00%,0.00%,0.00%,5.56%,96.54%,DeepSeek,DeepSeek License -79,22.21%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,3.86,7.15,11.95,16.90%,15.08%,52.00%,0.00%,0.50%,19.12%,22.50%,54.00%,0.00%,0.00%,43.40%,26.74%,18.42%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,38.89%,72.58%,Google,gemma-terms-of-use -80,20.46%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.21,18.2,34.15,27.60%,29.42%,33.50%,32.50%,15.00%,25.27%,34.07%,28.00%,34.00%,5.00%,31.36%,30.62%,7.50%,12.50%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,33.33%,59.93%,Meta,Meta Llama 3 Community \ No newline at end of file +1,74.24%,watt-tool-70B (FC),https://huggingface.co/watt-ai/watt-tool-70B/,N/A,3.4,12.61,7.7,84.06%,78.75%,94.00%,85.50%,78.00%,89.39%,98.57%,94.00%,90.00%,75.00%,77.65%,84.88%,83.48%,81.25%,66.67%,58.62%,67.00%,57.50%,48.50%,61.50%,94.44%,76.32%,Watt AI Lab,Apache-2.0 +2,72.02%,gpt-4o-2024-11-20 (Prompt),https://openai.com/index/hello-gpt-4o/,13.54,0.78,0.93,1.48,88.10%,79.42%,95.50%,94.00%,83.50%,89.38%,100.00%,94.00%,86.00%,77.50%,79.65%,83.72%,79.77%,87.50%,70.83%,47.62%,59.00%,41.00%,35.50%,55.00%,83.33%,83.76%,OpenAI,Proprietary +3,69.56%,gpt-4o-2024-11-20 (FC),https://openai.com/index/hello-gpt-4o/,8.23,1.11,1.73,2.29,87.42%,77.17%,93.50%,93.00%,86.00%,89.20%,88.29%,92.00%,94.00%,82.50%,79.61%,81.01%,78.82%,87.50%,75.00%,41.00%,62.50%,6.00%,37.50%,58.00%,83.33%,83.15%,OpenAI,Proprietary +4,67.94%,watt-tool-8B (FC),https://huggingface.co/watt-ai/watt-tool-8B/,N/A,1.31,2.79,4.04,86.56%,76.75%,95.00%,94.00%,80.50%,89.34%,97.86%,94.00%,88.00%,77.50%,76.37%,75.97%,77.49%,87.50%,66.67%,39.13%,47.00%,41.50%,27.50%,40.50%,83.33%,83.15%,Watt AI Lab,Apache-2.0 +5,67.87%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,33.22,2.47,6.27,5.08,84.73%,70.42%,91.00%,90.00%,87.50%,85.21%,87.36%,90.00%,86.00%,77.50%,80.45%,83.33%,78.63%,81.25%,70.83%,38.12%,54.00%,13.50%,35.50%,49.50%,72.22%,83.81%,OpenAI,Proprietary +6,66.68%,o1-2024-12-17 (Prompt),https://openai.com/o1/,102.47,5.3,4.29,13.0,85.67%,72.67%,93.50%,91.50%,85.00%,79.77%,58.57%,92.00%,86.00%,82.50%,80.45%,81.78%,76.54%,81.25%,70.83%,36.00%,50.50%,0.50%,48.50%,44.50%,72.22%,87.78%,OpenAI,Proprietary +7,64.09%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.51,1.49,9.88,3.01,85.21%,74.83%,92.00%,90.00%,84.00%,83.57%,83.29%,92.00%,84.00%,75.00%,74.37%,78.29%,76.16%,87.50%,70.83%,34.12%,47.50%,19.50%,29.00%,40.50%,83.33%,74.75%,OpenAI,Proprietary +8,62.76%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,29.76,8.44,10.06,17.57,78.92%,71.17%,89.00%,83.50%,72.00%,82.70%,89.29%,86.00%,78.00%,77.50%,78.05%,71.71%,71.60%,75.00%,79.17%,28.25%,40.50%,5.00%,34.50%,33.00%,61.11%,89.62%,OpenAI,Proprietary +9,62.63%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,14.06,57.4,35.06,89.88%,76.00%,97.00%,95.00%,91.50%,91.32%,99.29%,94.00%,92.00%,80.00%,76.59%,81.01%,83.29%,68.75%,75.00%,21.38%,31.50%,21.00%,26.50%,6.50%,72.22%,76.08%,MeetKai,MIT +10,62.13%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,7.05,5.94,33.79,6.47,88.58%,78.33%,93.50%,92.50%,90.00%,91.27%,98.57%,94.00%,90.00%,82.50%,76.54%,81.78%,77.40%,87.50%,79.17%,20.75%,23.00%,19.50%,17.50%,23.00%,72.22%,78.15%,Google,Proprietary +11,61.80%,Hammer2.1-7b (FC),https://huggingface.co/MadeAgents/Hammer2.1-7b,N/A,2.08,4.12,5.38,88.65%,78.08%,95.00%,93.50%,88.00%,85.48%,86.43%,92.00%,86.00%,77.50%,75.02%,76.36%,77.40%,81.25%,66.67%,23.50%,35.50%,25.50%,19.00%,14.00%,82.35%,78.59%,MadeAgents,cc-by-nc-4.0 +12,61.28%,Qwen2.5-72B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,N/A,3.72,6.88,9.64,90.81%,80.25%,97.50%,93.50%,92.00%,92.70%,99.29%,94.00%,90.00%,87.50%,75.21%,84.50%,82.15%,62.50%,75.00%,18.00%,24.50%,20.00%,15.50%,12.00%,100.00%,72.81%,Qwen,qwen +13,60.94%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,5.39,2.07,2.64,4.07,87.29%,73.17%,95.00%,91.50%,89.50%,84.61%,75.93%,94.00%,86.00%,82.50%,76.19%,79.46%,75.21%,87.50%,75.00%,21.62%,31.00%,5.00%,21.00%,29.50%,72.22%,76.90%,Google,Proprietary +14,60.83%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.84,1.31,6.89,2.39,86.77%,80.08%,90.50%,89.50%,87.00%,80.84%,62.86%,96.00%,82.00%,82.50%,76.32%,80.23%,76.73%,93.75%,75.00%,22.00%,33.00%,12.00%,17.00%,26.00%,83.33%,80.67%,OpenAI,Proprietary +15,60.44%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,7.0,1.54,4.69,2.38,85.56%,75.25%,91.50%,91.50%,84.00%,85.77%,91.57%,90.00%,84.00%,77.50%,76.63%,75.97%,71.98%,93.75%,75.00%,18.88%,26.00%,5.00%,21.50%,23.00%,55.56%,84.81%,Google,Proprietary +16,59.64%,Qwen2.5-32B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-32B-Instruct,N/A,2.26,4.62,5.92,85.81%,70.25%,94.50%,90.50%,88.00%,89.79%,96.64%,90.00%,90.00%,82.50%,74.14%,82.17%,78.54%,62.50%,58.33%,17.75%,25.00%,20.00%,15.00%,11.00%,100.00%,73.75%,Qwen,apache-2.0 +17,59.53%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,58.87,1.24,1.33,2.58,90.88%,82.50%,95.50%,93.50%,92.00%,89.45%,99.29%,96.00%,80.00%,82.50%,63.71%,87.21%,84.14%,100.00%,75.00%,30.25%,42.50%,25.00%,20.50%,33.00%,100.00%,35.57%,OpenAI,Proprietary +18,59.42%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,5.1,1.43,1.85,2.48,84.33%,69.83%,93.00%,92.00%,82.50%,87.95%,91.79%,92.00%,88.00%,80.00%,76.23%,75.58%,70.75%,81.25%,62.50%,16.00%,24.50%,3.00%,15.50%,21.00%,50.00%,84.39%,Google,Proprietary +19,59.03%,Hammer2.1-3b (FC),https://huggingface.co/MadeAgents/Hammer2.1-3b,N/A,1.95,4.31,5.09,86.85%,81.42%,95.00%,89.50%,81.50%,84.09%,82.86%,92.00%,84.00%,77.50%,73.91%,72.48%,73.31%,62.50%,62.50%,17.38%,27.50%,17.50%,14.50%,10.00%,82.35%,81.87%,MadeAgents,qwen-research +20,58.44%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,12.68,3.12,10.75,6.21,86.81%,74.25%,92.50%,90.00%,90.50%,84.38%,75.00%,94.00%,86.00%,82.50%,69.84%,84.88%,78.54%,62.50%,79.17%,23.75%,33.50%,18.00%,23.50%,20.00%,72.22%,52.85%,Mistral AI,Proprietary +21,58.39%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,5.24,15.7,9.8,87.54%,76.67%,93.50%,90.50%,89.50%,89.21%,97.36%,94.00%,88.00%,77.50%,78.50%,72.48%,76.73%,81.25%,70.83%,7.75%,7.50%,11.50%,5.00%,7.00%,83.33%,87.88%,Huawei Noah & USTC,Apache-2.0 +22,57.76%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,9.26,11.66,21.27,83.69%,77.75%,94.50%,86.50%,76.00%,87.88%,95.00%,94.00%,90.00%,72.50%,72.55%,79.46%,79.68%,81.25%,75.00%,16.25%,25.50%,16.00%,11.50%,12.00%,88.89%,67.81%,Salesforce,cc-by-nc-4.0 +23,57.62%,Qwen2.5-14B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-14B-Instruct,N/A,2.02,4.99,5.0,85.69%,73.25%,92.50%,92.00%,85.00%,88.84%,92.36%,90.00%,88.00%,85.00%,74.10%,74.03%,75.78%,62.50%,66.67%,12.12%,18.50%,11.50%,12.00%,6.50%,77.78%,77.06%,Qwen,apache-2.0 +24,57.20%,DeepSeek-V3 (FC),https://api-docs.deepseek.com/news/news1226,N/A,2.58,5.84,4.29,89.17%,78.67%,95.50%,91.00%,91.50%,83.39%,62.57%,94.00%,92.00%,85.00%,68.33%,82.95%,82.15%,81.25%,62.50%,18.62%,21.00%,20.50%,19.00%,14.00%,88.89%,59.36%,DeepSeek,DeepSeek License +25,57.08%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.48,0.71,1.04,1.09,85.69%,70.75%,90.00%,91.50%,90.50%,83.59%,80.36%,92.00%,82.00%,80.00%,68.86%,76.74%,76.16%,93.75%,79.17%,19.50%,27.50%,20.00%,12.00%,18.50%,83.33%,62.78%,Google,Proprietary +26,56.73%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.46,0.81,1.07,1.38,81.65%,73.58%,91.50%,90.00%,71.50%,80.64%,93.57%,92.00%,82.00%,55.00%,76.54%,80.62%,76.16%,93.75%,62.50%,12.50%,17.50%,6.00%,11.50%,15.00%,83.33%,78.49%,Google,Proprietary +27,56.43%,Claude-3.5-Sonnet-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,2.53,3.07,5.61,4.78,45.44%,78.75%,94.50%,3.50%,5.00%,47.89%,97.57%,90.00%,4.00%,0.00%,78.85%,83.33%,81.96%,25.00%,20.83%,41.00%,55.00%,19.00%,42.50%,47.50%,77.78%,74.04%,Anthropic,Proprietary +28,56.43%,Claude-3-Opus-20240229 (FC),https://www.anthropic.com/news/claude-3-family,20.15,9.46,9.94,17.14,57.92%,67.17%,93.00%,39.50%,32.00%,59.46%,80.36%,88.00%,42.00%,27.50%,77.92%,77.91%,75.78%,31.25%,37.50%,30.25%,41.50%,14.00%,33.50%,32.00%,61.11%,81.59%,Anthropic,Proprietary +29,56.38%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,18.44,35.32,51.23,86.75%,74.00%,94.50%,90.50%,88.00%,87.12%,89.50%,94.00%,90.00%,75.00%,73.66%,79.07%,78.16%,81.25%,62.50%,9.88%,17.00%,2.50%,14.00%,6.00%,77.78%,70.89%,MeetKai,MIT +30,56.25%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.3,0.73,1.05,1.28,81.75%,65.50%,91.50%,80.50%,89.50%,73.21%,68.86%,90.00%,54.00%,80.00%,77.97%,72.09%,70.18%,81.25%,79.17%,11.62%,19.00%,0.50%,10.50%,16.50%,55.56%,90.92%,Google,Proprietary +31,55.55%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.31,0.59,0.73,0.84,77.54%,65.17%,94.50%,73.00%,77.50%,74.80%,62.21%,88.00%,74.00%,75.00%,76.28%,75.19%,74.26%,62.50%,58.33%,13.87%,19.00%,3.50%,14.00%,19.00%,50.00%,79.72%,Google,Proprietary +32,55.49%,DeepSeek-Coder-V2 (FC),https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct-0724,N/A,29.53,108.9,59.61,89.44%,78.75%,94.50%,93.50%,91.00%,91.23%,96.43%,94.00%,92.00%,82.50%,73.43%,80.23%,77.02%,43.75%,70.83%,4.50%,7.50%,3.00%,4.00%,3.50%,88.89%,70.81%,DeepSeek,DeepSeek License +33,54.86%,Hammer2.1-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.1-1.5b,N/A,2.73,3.86,7.45,82.79%,74.67%,92.00%,84.50%,80.00%,83.39%,86.57%,90.00%,82.00%,75.00%,70.59%,70.93%,69.80%,50.00%,62.50%,10.50%,14.50%,12.50%,9.00%,6.00%,77.78%,79.27%,MadeAgents,cc-by-nc-4.0 +34,54.70%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,12.74,25.13,24.76,81.06%,74.25%,95.50%,81.00%,73.50%,79.88%,74.00%,96.00%,82.00%,67.50%,75.08%,71.32%,74.93%,50.00%,62.50%,10.00%,16.50%,8.50%,7.50%,7.50%,94.44%,77.11%,Salesforce,cc-by-nc-4.0 +35,54.52%,GoGoAgent,https://gogoagent.ai,N/A,2.66,3.08,5.56,86.23%,75.42%,93.00%,92.00%,84.50%,89.86%,95.43%,96.00%,88.00%,80.00%,73.92%,72.09%,75.40%,68.75%,66.67%,1.00%,1.50%,2.00%,0.50%,0.00%,77.78%,83.12%,BitAgent,Proprietary +36,54.46%,o1-2024-12-17 (FC),https://openai.com/o1/,68.63,4.86,5.1,13.75,40.23%,67.92%,93.00%,0.00%,0.00%,38.66%,60.64%,94.00%,0.00%,0.00%,77.92%,81.01%,79.01%,0.00%,0.00%,41.00%,52.50%,38.00%,30.50%,43.00%,72.22%,81.97%,OpenAI,Proprietary +37,54.26%,claude-3.5-haiku-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,0.48,1.84,5.21,3.57,83.19%,76.25%,93.00%,84.00%,79.50%,84.71%,97.86%,90.00%,76.00%,75.00%,70.64%,83.72%,75.02%,87.50%,54.17%,9.75%,16.00%,0.50%,8.00%,14.50%,77.78%,65.78%,Anthropic,Proprietary +38,54.09%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,4.95,14.92,14.75,89.98%,77.92%,96.00%,94.50%,91.50%,90.12%,94.00%,98.00%,86.00%,82.50%,62.06%,77.13%,76.16%,87.50%,62.50%,12.38%,16.50%,13.00%,10.50%,9.50%,100.00%,54.78%,Meta,Meta Llama 3 Community +39,53.88%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.38,0.87,1.45,1.47,83.94%,74.25%,93.50%,89.00%,79.00%,83.79%,96.14%,88.00%,86.00%,65.00%,63.93%,80.62%,79.68%,43.75%,58.33%,19.50%,32.50%,11.50%,21.50%,12.50%,94.44%,36.53%,OpenAI,Proprietary +40,53.66%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,4.54,11.64,11.02,86.46%,75.33%,94.50%,91.50%,84.50%,88.29%,92.14%,90.00%,86.00%,85.00%,67.35%,75.97%,74.93%,62.50%,70.83%,7.62%,9.50%,8.50%,7.00%,5.50%,88.89%,65.16%,Qwen,apache-2.0 +41,53.24%,claude-3.5-haiku-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,0.83,2.83,1.94,5.18,40.62%,68.00%,92.00%,2.50%,0.00%,50.46%,87.86%,90.00%,24.00%,0.00%,72.28%,82.17%,78.35%,18.75%,0.00%,40.00%,54.50%,26.50%,35.00%,44.00%,83.33%,63.68%,Anthropic,Proprietary +42,53.03%,FireFunction-v2 (FC),https://huggingface.co/fireworks-ai/firefunction-v2,N/A,2.13,1.17,3.87,88.46%,80.33%,94.00%,91.50%,88.00%,87.54%,96.64%,92.00%,84.00%,77.50%,65.57%,78.29%,78.35%,56.25%,70.83%,8.62%,13.50%,7.00%,11.00%,3.00%,94.44%,53.02%,Fireworks,Apache 2.0 +43,52.55%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,4.96,5.57,10.33,67.65%,73.58%,90.00%,69.00%,38.00%,74.05%,89.21%,90.00%,72.00%,45.00%,70.99%,74.03%,79.30%,43.75%,58.33%,15.50%,26.00%,13.00%,11.50%,11.50%,94.44%,67.15%,Salesforce,cc-by-nc-4.0 +44,52.17%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,11.07,4.57,18.11,9.97,73.12%,69.50%,88.50%,69.00%,65.50%,81.57%,93.29%,86.00%,72.00%,75.00%,77.52%,75.19%,74.07%,81.25%,54.17%,0.38%,1.00%,0.00%,0.00%,0.50%,66.67%,85.93%,Mistral AI,Proprietary +45,52.17%,Command R7B (FC),https://cohere.com/blog/command-r7b,0.1,1.35,4.86,2.47,81.67%,68.17%,91.50%,85.50%,81.50%,84.02%,87.07%,92.00%,82.00%,75.00%,69.21%,63.18%,58.69%,56.25%,66.67%,5.00%,6.50%,1.50%,6.50%,5.50%,55.56%,81.02%,Cohere,cc-by-nc-4.0 +46,52.17%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,4.97,8.07,13.9,88.94%,79.75%,92.50%,91.50%,92.00%,89.09%,87.86%,98.00%,88.00%,82.50%,67.04%,84.50%,79.39%,68.75%,62.50%,2.38%,4.50%,2.00%,1.50%,1.50%,94.44%,59.19%,Google,gemma-terms-of-use +47,51.75%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,3.65,3.82,10.68,87.81%,76.75%,95.00%,92.50%,87.00%,88.21%,95.86%,94.00%,78.00%,85.00%,64.90%,80.62%,78.25%,75.00%,66.67%,5.62%,10.00%,4.00%,6.00%,2.50%,100.00%,50.88%,Meta,Meta Llama 3 Community +48,51.73%,Ministral-8B-Instruct-2410 (FC),https://huggingface.co/mistralai/Ministral-8B-Instruct-2410,N/A,12.79,45.03,47.12,83.83%,71.83%,91.50%,84.50%,87.50%,79.57%,71.29%,86.00%,86.00%,75.00%,64.93%,75.19%,72.27%,62.50%,66.67%,11.25%,21.00%,8.50%,10.00%,5.50%,70.59%,55.28%,Mistral AI,Mistral AI Research License +49,51.66%,MiniCPM3-4B-FC (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,160.19,184.1,464.0,80.83%,69.83%,91.50%,82.50%,79.50%,87.57%,89.29%,90.00%,86.00%,85.00%,69.97%,74.42%,63.91%,43.75%,62.50%,2.62%,5.00%,1.00%,3.00%,1.50%,72.22%,72.22%,openbmb,Apache-2.0 +50,51.66%,Claude-3.5-Sonnet-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,1.23,1.81,1.35,3.3,72.48%,81.42%,92.00%,70.50%,46.00%,80.00%,100.00%,92.00%,68.00%,60.00%,71.88%,86.05%,80.06%,81.25%,45.83%,7.50%,9.00%,5.50%,5.00%,10.50%,77.78%,64.40%,Anthropic,Proprietary +51,51.55%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,5.23,8.66,13.19,85.29%,75.67%,90.50%,88.50%,86.50%,87.52%,88.07%,94.00%,88.00%,80.00%,67.84%,76.36%,74.26%,62.50%,62.50%,1.62%,2.00%,4.00%,0.50%,0.00%,83.33%,66.51%,Google,gemma-terms-of-use +52,51.37%,Llama-3.3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.98,25.27,23.42,85.08%,74.83%,94.50%,84.00%,87.00%,90.68%,95.71%,98.00%,84.00%,85.00%,62.59%,80.62%,77.11%,93.75%,62.50%,6.87%,9.00%,8.00%,4.50%,6.00%,100.00%,48.71%,Meta,Meta Llama 3 Community +53,51.32%,Claude-3-Opus-20240229 (Prompt),https://www.anthropic.com/news/claude-3-family,10.48,4.6,8.24,10.54,85.31%,79.75%,95.00%,85.50%,81.00%,86.32%,99.29%,90.00%,86.00%,70.00%,66.86%,84.11%,79.11%,68.75%,54.17%,7.13%,11.50%,2.50%,6.00%,8.50%,83.33%,40.25%,Anthropic,Proprietary +54,51.22%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.18,1.55,1.96,3.42,82.10%,64.42%,93.50%,85.50%,85.00%,77.66%,56.14%,94.00%,88.00%,72.50%,65.93%,77.13%,69.61%,75.00%,66.67%,9.12%,15.00%,3.50%,9.00%,9.00%,66.67%,63.19%,Mistral AI,Proprietary +55,50.70%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,10.3,52.75,24.53,84.21%,72.83%,93.50%,87.00%,83.50%,86.30%,83.71%,96.00%,88.00%,77.50%,60.95%,73.26%,73.31%,56.25%,50.00%,9.25%,12.00%,10.00%,7.00%,8.00%,77.78%,48.82%,Meta,Meta Llama 3 Community +56,50.33%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,12.83,1.36,6.03,3.17,88.02%,78.58%,94.00%,89.50%,90.00%,87.77%,93.57%,96.00%,84.00%,77.50%,65.93%,82.17%,72.65%,81.25%,75.00%,0.50%,1.00%,0.00%,0.00%,1.00%,83.33%,55.09%,Mistral AI,Proprietary +57,49.32%,Command-R-Plus (FC),https://txt.cohere.com/command-r-plus-microsoft-azure,7.8,2.58,9.12,3.87,77.02%,72.08%,89.50%,82.50%,64.00%,81.21%,90.86%,90.00%,84.00%,60.00%,58.91%,69.77%,58.78%,62.50%,45.83%,13.12%,16.50%,10.00%,9.00%,17.00%,72.22%,53.16%,Cohere For AI,cc-by-nc-4.0 +58,49.28%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,1.84,1.74,5.0,82.46%,72.83%,91.50%,84.00%,81.50%,86.36%,84.93%,92.00%,86.00%,82.50%,59.57%,67.83%,56.32%,43.75%,54.17%,3.38%,6.00%,1.50%,4.50%,1.50%,88.89%,74.82%,IBM,Apache-2.0 +59,48.29%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.16,0.72,1.79,1.21,72.85%,77.92%,93.50%,67.00%,53.00%,70.39%,57.57%,90.00%,74.00%,60.00%,68.46%,79.84%,78.63%,75.00%,58.33%,5.62%,9.00%,2.00%,7.00%,4.50%,94.44%,58.39%,OpenAI,Proprietary +60,47.27%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,3.97,3.91,8.21,76.79%,64.17%,89.50%,80.00%,73.50%,76.23%,70.43%,94.00%,78.00%,62.50%,64.90%,71.71%,65.81%,56.25%,50.00%,2.38%,4.50%,1.50%,2.00%,1.50%,44.44%,60.78%,NousResearch,apache-2.0 +61,47.23%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,24.91,3.32,3.99,6.94,90.54%,82.17%,97.00%,92.50%,90.50%,90.12%,100.00%,94.00%,84.00%,82.50%,52.69%,85.27%,81.96%,93.75%,79.17%,8.38%,15.00%,6.00%,6.00%,6.50%,100.00%,4.35%,Mistral AI,Proprietary +62,47.06%,Qwen2.5-3B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-3B-Instruct,N/A,1.03,1.43,1.78,80.79%,74.17%,90.50%,79.50%,79.00%,81.71%,80.86%,86.00%,80.00%,80.00%,58.60%,68.99%,66.48%,56.25%,62.50%,3.38%,5.50%,3.50%,2.00%,2.50%,88.89%,54.19%,Qwen,qwen +63,46.91%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,7.79,45.67,14.71,80.56%,73.75%,92.00%,80.50%,76.00%,83.70%,87.29%,92.00%,78.00%,77.50%,55.75%,63.57%,64.86%,12.50%,45.83%,5.25%,8.50%,2.50%,4.50%,5.50%,88.89%,51.69%,Meta,Meta Llama 3 Community +64,46.70%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,2.51,6.07,4.63,73.37%,71.00%,86.00%,70.00%,66.50%,85.61%,80.43%,94.00%,88.00%,80.00%,61.04%,70.16%,59.26%,56.25%,41.67%,1.12%,1.50%,2.50%,0.50%,0.00%,83.33%,63.04%,Qwen,apache-2.0 +65,45.27%,Hammer2.1-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.1-0.5b,N/A,1.29,3.16,2.85,69.12%,68.00%,83.00%,71.50%,54.00%,70.46%,68.36%,84.00%,82.00%,47.50%,62.86%,59.69%,58.02%,50.00%,45.83%,2.25%,4.00%,0.50%,3.00%,1.50%,77.78%,73.94%,MadeAgents,cc-by-nc-4.0 +66,44.83%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.76,1.38,2.9,3.4,56.65%,66.58%,95.00%,40.00%,25.00%,64.93%,87.21%,86.00%,64.00%,22.50%,69.57%,77.13%,67.62%,43.75%,41.67%,2.88%,4.50%,1.00%,3.50%,2.50%,66.67%,71.53%,Google,Proprietary +67,44.76%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,3.36,1.73,4.04,3.5,59.15%,67.58%,94.00%,24.50%,50.50%,53.84%,87.36%,92.00%,16.00%,20.00%,72.10%,64.73%,71.51%,12.50%,12.50%,2.62%,4.50%,0.00%,3.00%,3.00%,77.78%,80.86%,Mistral AI,Proprietary +68,43.12%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,10.63,32.72,23.76,73.06%,60.75%,87.50%,78.50%,65.50%,76.00%,61.00%,94.00%,84.00%,65.00%,57.62%,68.99%,60.02%,43.75%,41.67%,2.63%,3.50%,4.00%,2.50%,0.50%,66.67%,38.88%,NousResearch,apache-2.0 +69,43.07%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,2.74,1.73,4.5,3.51,63.58%,64.83%,86.00%,59.00%,44.50%,69.61%,77.93%,86.00%,62.00%,52.50%,61.39%,63.18%,66.10%,68.75%,50.00%,1.50%,2.50%,0.00%,1.50%,2.00%,88.89%,59.52%,Mistral AI,Proprietary +70,42.99%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,7.0,2.63,15.88,5.36,61.67%,71.67%,94.00%,10.50%,70.50%,63.64%,83.57%,94.00%,22.00%,55.00%,68.55%,76.36%,73.12%,6.25%,45.83%,1.50%,3.50%,0.00%,1.00%,1.50%,83.33%,45.71%,Mistral AI,Proprietary +71,42.53%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.79,1.65,10.01,3.26,86.12%,77.00%,93.50%,89.50%,84.50%,89.07%,93.79%,92.00%,88.00%,82.50%,48.96%,77.13%,74.45%,87.50%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,88.89%,6.43%,Mistral AI,Proprietary +72,42.30%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,3.99,10.26,9.78,76.65%,68.08%,88.00%,75.50%,75.00%,76.80%,80.21%,84.00%,78.00%,65.00%,50.60%,56.59%,62.01%,37.50%,66.67%,3.25%,4.00%,4.50%,2.50%,2.00%,88.89%,39.00%,Qwen,apache-2.0 +73,40.91%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,8.49,3.74,8.22,11.19,61.25%,73.50%,92.00%,42.50%,37.00%,69.14%,90.07%,88.00%,46.00%,52.50%,60.15%,77.13%,73.03%,75.00%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,94.44%,40.50%,Databricks,Databricks Open Model +74,39.97%,FireFunction-v1 (FC),https://huggingface.co/fireworks-ai/firefunction-v1,N/A,2.27,3.77,3.71,43.00%,80.00%,92.00%,0.00%,0.00%,44.57%,88.29%,90.00%,0.00%,0.00%,70.41%,71.32%,72.93%,0.00%,0.00%,2.38%,5.00%,0.00%,2.00%,2.50%,94.44%,71.80%,Fireworks,Apache 2.0 +75,39.25%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,6.26,4.43,13.94,72.08%,76.83%,93.50%,77.00%,41.00%,60.63%,84.50%,92.00%,56.00%,10.00%,53.35%,78.29%,58.02%,31.25%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,77.78%,44.95%,Salesforce,cc-by-nc-4.0 +76,38.94%,GLM-4-9b-Chat (FC),https://huggingface.co/THUDM/glm-4-9b-chat,N/A,6.09,15.35,13.2,36.67%,65.17%,81.50%,0.00%,0.00%,46.00%,94.00%,90.00%,0.00%,0.00%,66.77%,72.09%,64.39%,0.00%,0.00%,3.50%,3.50%,4.00%,2.50%,4.00%,66.67%,79.71%,THUDM,glm-4 +77,38.59%,MiniCPM3-4B (Prompt),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,20.78,49.16,64.58,65.88%,63.50%,72.50%,65.50%,62.00%,50.59%,40.36%,34.00%,48.00%,80.00%,54.46%,46.51%,34.76%,43.75%,41.67%,2.00%,3.00%,3.50%,1.00%,0.50%,50.00%,74.43%,openbmb,Apache-2.0 +78,36.92%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,1.13,0.55,2.27,45.88%,57.50%,53.00%,34.00%,39.00%,59.11%,47.93%,86.00%,40.00%,62.50%,54.15%,41.47%,38.65%,56.25%,37.50%,1.00%,1.50%,0.50%,1.00%,1.00%,61.11%,78.53%,Nexusflow,Apache 2.0 +79,35.69%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,1.33,2.38,2.97,57.31%,46.25%,56.50%,63.50%,63.00%,56.32%,49.79%,68.00%,60.00%,47.50%,49.09%,50.39%,47.01%,62.50%,29.17%,1.38%,2.50%,1.50%,0.50%,1.00%,77.78%,52.95%,Google,Proprietary +80,34.30%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.05,8.85,20.61,60.79%,62.67%,82.50%,48.00%,50.00%,58.93%,47.71%,86.00%,42.00%,60.00%,47.93%,60.85%,61.44%,37.50%,33.33%,0.75%,1.50%,0.00%,1.00%,0.50%,77.78%,18.59%,Meta,Meta Llama 3 Community +81,31.25%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.91,1.57,0.96,3.37,26.94%,23.25%,74.00%,8.50%,2.00%,30.36%,52.93%,64.00%,2.00%,2.50%,58.73%,36.05%,65.24%,0.00%,8.33%,0.75%,0.50%,0.00%,1.50%,1.00%,44.44%,69.74%,Mistral AI,Proprietary +82,29.27%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,3.09,11.89,5.41,54.29%,51.17%,79.00%,46.50%,40.50%,52.39%,46.57%,76.00%,52.00%,35.00%,39.00%,48.45%,40.27%,12.50%,25.00%,0.50%,0.50%,1.00%,0.00%,0.50%,94.44%,21.19%,Qwen,apache-2.0 +83,28.06%,Qwen2.5-0.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct,N/A,0.95,1.25,1.47,53.19%,58.25%,68.00%,53.50%,33.00%,61.89%,63.07%,70.00%,62.00%,52.50%,31.59%,53.88%,34.76%,56.25%,16.67%,0.00%,0.00%,0.00%,0.00%,0.00%,94.44%,16.44%,Qwen,apache-2.0 +84,27.58%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,5.79,16.83,13.08,48.21%,55.83%,54.00%,48.50%,34.50%,50.18%,58.71%,58.00%,54.00%,30.00%,33.45%,51.55%,49.00%,37.50%,41.67%,5.38%,5.00%,7.50%,5.00%,4.00%,94.44%,4.86%,Meta,Meta Llama 3 Community +85,27.13%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,5.44,12.05,12.13,25.29%,49.17%,24.50%,12.50%,15.00%,31.62%,53.00%,36.00%,30.00%,7.50%,44.96%,51.94%,52.61%,31.25%,25.00%,4.88%,7.00%,4.00%,4.50%,4.00%,100.00%,44.85%,Meta,Meta Llama 3 Community +86,24.95%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,6.26,14.51,13.84,41.17%,71.67%,86.00%,5.00%,2.00%,42.95%,77.79%,90.00%,4.00%,0.00%,36.92%,63.95%,53.37%,6.25%,0.00%,0.12%,0.50%,0.00%,0.00%,0.00%,100.00%,6.69%,Salesforce,cc-by-nc-4.0 +87,22.43%,DeepSeek-Coder-V2-Lite-Instruct (FC),https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,N/A,13.9,30.47,39.18,4.88%,0.00%,1.50%,3.50%,14.50%,33.18%,17.71%,42.00%,28.00%,45.00%,39.40%,2.33%,3.80%,0.00%,8.33%,0.12%,0.50%,0.00%,0.00%,0.00%,0.00%,96.31%,DeepSeek,DeepSeek License +88,22.36%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,3.78,7.03,11.84,17.10%,15.42%,52.00%,0.00%,1.00%,19.12%,22.50%,54.00%,0.00%,0.00%,43.76%,26.36%,18.52%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,38.89%,73.03%,Google,gemma-terms-of-use +89,20.59%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.08,17.77,32.86,28.44%,29.25%,33.50%,36.00%,15.00%,25.27%,34.07%,28.00%,34.00%,5.00%,31.36%,31.40%,7.60%,12.50%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,38.89%,59.70%,Meta,Meta Llama 3 Community \ No newline at end of file diff --git a/leaderboard.html b/leaderboard.html index b78e26fb7..0dddc57f7 100644 --- a/leaderboard.html +++ b/leaderboard.html @@ -113,7 +113,7 @@
- Models are evaluated using commit d7e52e5. + Models are evaluated using commit 0cea216. All the model response we obtained is available here. To reproduce the results, please checkout our codebase at - this checkpoint. + this checkpoint.