From e4aa1c7e094e845a99ec57d493df364fa406384e Mon Sep 17 00:00:00 2001 From: George Tsoukalas Date: Thu, 7 Nov 2024 16:32:49 +0000 Subject: [PATCH 1/4] Update Lean count. --- docs/leaderboard.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/leaderboard.html b/docs/leaderboard.html index 151ebf63..c04c08cd 100644 --- a/docs/leaderboard.html +++ b/docs/leaderboard.html @@ -134,7 +134,7 @@

- + >
- + Date: Thu, 7 Nov 2024 16:33:00 +0000 Subject: [PATCH 2/4] Update Lean count. --- docs/index.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.html b/docs/index.html index 164f145d..180fa442 100644 --- a/docs/index.html +++ b/docs/index.html @@ -176,8 +176,8 @@

Abstract

We present PutnamBench, a new multilingual benchmark for evaluating the ability of neural theorem-provers to solve competition mathematics problems.

- PutnamBench consists of 1697 hand-constructed formalizations of problems sourced from the William Lowell Putnam Mathematical Competition, the premier undergraduate-level mathematics competition in North America. - All the theorems, of which there are 640, have formalizations in Lean 4 and Isabelle; a substantial subset also has Coq formalizations. + PutnamBench consists of 1696 hand-constructed formalizations of problems sourced from the William Lowell Putnam Mathematical Competition, the premier undergraduate-level mathematics competition in North America. + There are 644 problems formalized in Lean 4, 640 formalized in Isabelle, and 412 formalized in Coq.

Proving the theorems requires significant problem-solving ability and proficiency in a broad range of topics taught in undergraduate mathematics courses. We use PutnamBench to evaluate several established neural and symbolic theorem-provers. From 685cf47dcf208f128ea90a6816b374c8414e57c8 Mon Sep 17 00:00:00 2001 From: George Tsoukalas Date: Thu, 7 Nov 2024 16:33:25 +0000 Subject: [PATCH 3/4] Sort topic counts. --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a374a566..8bf8ac9d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PutnamBench +# [PutnamBench](https://arxiv.org/abs/2407.11214)

@@ -32,9 +32,9 @@ We also report the number of problems in a certain category. Note that some prob | Number Theory | 108 | | Geometry | 69 | | Linear Algebra | 51 | -| Abstract Algebra | 28 | | Combinatorics | 29 | -| Probability | 10 | +| Abstract Algebra | 28 | +| Probability | 10 | | Set Theory | 8 | ## Citation From 7e3b8ec54a75be94c2fa59bbbc8d749330f92501 Mon Sep 17 00:00:00 2001 From: George Tsoukalas Date: Thu, 7 Nov 2024 16:33:56 +0000 Subject: [PATCH 4/4] Update with InternLM2.5; Add solved problems and commits where possible. --- docs/results.json | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/results.json b/docs/results.json index 58213303..c25526e3 100644 --- a/docs/results.json +++ b/docs/results.json @@ -7,7 +7,7 @@ }, "size": 0, "condensed-compute-budget": "pass@10", - "note": "pass@10 w/ T=0.1" + "note": "pass@10 w/ T=0.1; 1971 B1, 1986 B1, 1995 B1, 2001 A1." }, "GPT-4o": { "link": "https://openai.com/index/hello-gpt-4o/", @@ -19,7 +19,7 @@ }, "size": 0, "condensed-compute-budget": "pass@10", - "note": "pass@10 w/ T=0.7" + "note": "pass@10 w/ T=0.7; 1988 B1." }, "COPRA (GPT-4o)": { "link": "https://arxiv.org/abs/2310.04353", @@ -30,7 +30,7 @@ }, "size": 0, "condensed-compute-budget": "pass@1", - "note": "pass@1 w/ T=0.0" + "note": "pass@1 w/ T=0.0; 1988 B1." }, "CoqHammer": { "link": "https://coqhammer.github.io/", @@ -50,7 +50,7 @@ }, "size": 0, "condensed-compute-budget": "pass@1", - "note": "pass@1 w/ t=120s" + "note": "pass@1 w/ t=120s; 1971 B1, 2001 A1, 2012 A2." }, "ReProver w/ retrieval": { "link": "https://arxiv.org/abs/2306.15626", @@ -100,6 +100,16 @@ }, "size": 7, "condensed-compute-budget": "pass@596", - "note": "7 hour cumulative online proof search on 256 GPUs" + "note": "7 hour cumulative online proof search on 256 GPUs; 1977 A3, 1986 A1, 1986 A2, 1986 B1, 1988 B1, 1988 B2, 2001 A1. Commit: 8aaccf8" + }, + "InternLM2.5-StepProver": { + "link": "https://arxiv.org/pdf/2410.15700", + "open-data": "FULL", + "num-solved": { + "lean-wsolution": 6 + }, + "size": 8.8, + "condensed-compute-budget": "pass@2x32x600", + "note": "pass @ 2 x 32 states x 600 state expansions proof search; 1977 A3, 1986 A1, 1986 B1, 1988 B1, 1988 B2, 2001 A1. Commit: b377431" } } \ No newline at end of file