From d77e444570447a6e8e2b2238945471c17d8bb135 Mon Sep 17 00:00:00 2001 From: George Tsoukalas Date: Mon, 21 Oct 2024 14:55:04 +0000 Subject: [PATCH 1/4] Add ABEL to leaderboard. --- docs/results.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/results.json b/docs/results.json index 84706eba..58213303 100644 --- a/docs/results.json +++ b/docs/results.json @@ -91,5 +91,15 @@ "size": 7, "condensed-compute-budget": "pass@4096", "note": "pass@2048 w/ T = 1.0 + pass@2048 w/ T = 0.7" + }, + "ABEL": { + "link": "https://openreview.net/forum?id=kk3mSjVCUO", + "open-data": "NONE", + "num-solved": { + "lean-wsolution": 7 + }, + "size": 7, + "condensed-compute-budget": "pass@596", + "note": "7 hour cumulative online proof search on 256 GPUs" } } \ No newline at end of file From bf838619f0600f95751ad2708d0f7a288f748f65 Mon Sep 17 00:00:00 2001 From: George Tsoukalas Date: Mon, 21 Oct 2024 14:55:32 +0000 Subject: [PATCH 2/4] Update READMEs --- README.md | 20 ++++++++------------ coq/README.md | 3 +-- informal/README.md | 2 ++ 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index eadbf458..a374a566 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@

-PutnamBench is a benchmark for evaluation of theorem-proving algorithms on competition mathematics problems sourced from the William Lowell Putnam Mathematical Competition years 1962 - 2023. Our formalizations currently support three formal languages : Lean 4 $\land$ Isabelle $\land$ Coq. PutnamBench comprises of 1697 manually-crafted formalizations, aggregated over all languages. +PutnamBench is a benchmark for evaluation of theorem-proving algorithms on competition mathematics problems sourced from the William Lowell Putnam Mathematical Competition years 1962 - 2023. Our formalizations currently support three formal languages : Lean 4 $\land$ Isabelle $\land$ Coq. PutnamBench comprises of 1696 manually-crafted formalizations, aggregated over all languages. PutnamBench aims to support research in automated mathematical reasoning by providing a multilingual benchmark for evaluating theorem-proving algorithms. It is released under permissive licenses (Apache 2.0 for Lean 4 and Isabelle, MIT for Coq). The [informal statements](informal/README.md) are also available with permission from the MAA. @@ -14,14 +14,14 @@ PutnamBench includes factored solutions for problems which require exhibiting a We are hosting a [**leaderboard**](https://trishullab.github.io/PutnamBench/leaderboard.html) and will readily receive evaluation results which are accompanied by a preprint or publication. **Do not** include proofs as confirmation in any public setting. Please reach out privately at `george.tsoukalas@utexas.edu` with any requests for additions to the leaderboard. -**We strongly encourage community feedback!** Please let us know if you have any comments for improving PutnamBench. If you notice any mistakes, please raise an issue on the repository and we will address it. We kindly ask that you do not write formal proofs for any of the problems in an effort to reduce contamination. +**We strongly encourage community feedback!** Please let us know if you have any comments for improving PutnamBench. If you notice any mistakes, please raise an issue on the repository and we will address it. We kindly ask that you do not write formal proofs for any of the problems in an effort to reduce contamination. If you do wish to write formal proofs for a subset of the problems, we please ask that you first engage in discussion with us. ## Statistics | Language | Count | | ------------- | -------------- | -| Lean 4 | 640 | +| Lean 4 | 644 | | Isabelle | 640 | -| Coq | 417 | +| Coq | 412 | We also report the number of problems in a certain category. Note that some problems fall under multiple categories. While the categories are intended to capture general features of the problems, we also note that there is a high variance of problems inside an individual category. @@ -29,18 +29,14 @@ We also report the number of problems in a certain category. Note that some prob | ---------------- | -------------- | | Algebra | 253 | | Analysis | 226 | -| Number Theory | 107 | -| Geometry | 68 | +| Number Theory | 108 | +| Geometry | 69 | | Linear Algebra | 51 | | Abstract Algebra | 28 | -| Combinatorics | 26 | -| Probability | 9 | +| Combinatorics | 29 | +| Probability | 10 | | Set Theory | 8 | -## Versioning -- Version: `v0` -- In preliminary release to allow for initial community feedback. We seek to release an official first version following several weeks of discussion with the community. - ## Citation The associated paper for PutnamBench is [available at this link](https://arxiv.org/abs/2407.11214). Please consider including the following citation if you find PutnamBench useful. ``` diff --git a/coq/README.md b/coq/README.md index 70243bcf..4526175b 100644 --- a/coq/README.md +++ b/coq/README.md @@ -1,6 +1,5 @@ -Note: We are continuing to make modifications to the Coq formalizations, in particular those which were recently added. We encourage feedback, but keep this in mind. The format of the formalizations will be standardized with the other languages upon completion. +Note that roughly half of the Coq formalizations rely on Mathcomp, while the other half rely on a combination of the Coq Standard Library, Coquelicot, and other various Coq repositories. We also note that while the dependencies listed in each formalization are sufficient to *state* the problem, one may need further mathematical theory developed in Coq to write a proof. -We also note that while the dependencies listed in each formalization are sufficient to *state* the problem, one may need further mathematical theory developed in Coq to write a proof. # Installation You need to install `opam` and then run `setup.sh` to install the necessary dependencies. diff --git a/informal/README.md b/informal/README.md index b51b9eeb..2e898b17 100644 --- a/informal/README.md +++ b/informal/README.md @@ -8,3 +8,5 @@ that is, you must write: These statements are also present in the docstrings of some of the formal statements. Github CI will automatically catch if these are out of sync, but if you update them in one place you will have to update them in the other. + +When a problem happens to require a solution (i.e. some additional data requested in the problem statement), we also add an `informal_solution` field which has as value an imperative sentence roughly of the form "Show that the solution is ..." From 1dd8c881e2ae0ce6e9e1f50d2de99773e877eb92 Mon Sep 17 00:00:00 2001 From: George Tsoukalas Date: Mon, 21 Oct 2024 14:56:03 +0000 Subject: [PATCH 3/4] Remove old Coq formalizations --- coq/src/commented_problems.v | 186 ----------------------------------- 1 file changed, 186 deletions(-) delete mode 100644 coq/src/commented_problems.v diff --git a/coq/src/commented_problems.v b/coq/src/commented_problems.v deleted file mode 100644 index 13b26c0c..00000000 --- a/coq/src/commented_problems.v +++ /dev/null @@ -1,186 +0,0 @@ - -(* TODO: WIP *) -(* NOTE -- Divide-and-conquer recursion is hard to formulate in Coq. A proof must be provided for termination *) -(* Section putnam_2013_b1. -Require Import ZArith Nat List Lia Ensembles Finite_sets Reals Program Coquelicot.Hierarchy. -Open Scope Z. -Program Fixpoint Aa (n : nat) {measure n} : Z := - match n with - | O => 0 - | S O => 1 - | S (S m) => if even m then Aa (div2 (m+2)) else if even (div2 (m+2)) then Aa (div2 (m+2)) else (-1) * Aa (div2 (m+2)) - end. -Next Obligation. Proof. destruct m. simpl; auto. induction m. simpl; auto. simpl. Admitted. -Theorem putnam_2013_b1: - sum_n (fun n => (Aa n)*(Aa (n+2))) 2013 = 1. -Proof. Admitted. -End putnam_2013_b1. *) - -(* Skipped due to lack of surface integral function *) -(* Section putnam_2019_a4. -Require Import PeanoNat. Require Import Reals Coquelicot.Derive. -Definition putnam_2019_a4_solution := false. -Theorem putnam_2019_a4: - forall (f: R -> R), - continuity f -> - forall (x y z: R), x*x + y*y + z*z = 1 -> - True. -Proof. Admitted. -End putnam_2019_a4. *) - -(* TODO: missing determinant refinement in coqeal *) -(* Section putnam_2023_b6. -Require Import Nat Finite_sets. -From mathcomp Require Import matrix ssrbool ssralg fintype. -Variable putnam_2023_b6_solution : nat -> nat. -Open Scope ring_scope. -Theorem putnam_2023_b6: - forall (n: nat), - let s (i j: nat) := cardinal (nat*nat) (fun p => let (a, b) := p in 1 <= i <= n /\ 1 <= j <= n /\ eq (add (mul a i) (mul b j)) n) in - (\matrix_(i < n, j < n) s i j) - = (\matrix_(i < n, j < n) s i j). -Proof. Admitted. -End putnam_2023_b6. *) - -(* TODO: How to get the cardinality of a set with cardinal? Could not figure out a clean way*) -(* Section putnam_1973_a6. -Require Import Reals Finite_sets Ensembles Coquelicot.Coquelicot. From mathcomp Require Import fintype. -Theorem putnam_1973_a6 - (h_nint : nat -> ('I_7 -> (R * R)) -> nat := fun n lines => - let intersection_set (p : R * R) : Prop := exists! S : Ensemble 'I_7, cardinal _ S n /\ (forall i : 'I_7, In _ S i -> (snd p = (snd (lines i)) * (fst p) + (fst (lines i)))) in - cardinal _ intersection_set - ) - : ~ (exists lines: 'I_7 -> (R * R), (forall i j : 'I_7, i <> j -> (lines i <> lines j)) /\ h_nint 3 lines >= 6 /\ h_nint 2 lines >= 4). -Proof. Admitted. -End putnam_1973_a6. *) - -(* Section putnam_1999_a5. -Require Import Reals NewtonInt. From mathcomp Require Import all_algebra all_ssreflect ssrnat ssrnum ssralg fintype poly seq. -Open Scope ring_scope. -Theorem putnam_1999_a5: - forall (R: numDomainType) (p: {poly R}), - (size p = 1999%nat) -> - exists (C: R), Num.norm p.[0] <= GRing.mul C (Num.norm p.[0]). -Proof. Admitted. -End putnam_1999_a5. *) - -(* Section putnam_2010_a5. -Require Import Reals. From mathcomp Require Import fingroup ssreflect ssrbool eqtype seq choice fintype div path tuple bigop prime finset. -Open Scope R. -Variable R3: finGroupType. -Definition cross_product (a b : R -> R -> R) : R -> R -> R := a. -Theorem putnam_2010_a5: - forall (G: {group R3}), - forall (a b: R -> R -> R), - cross_product a b = a \/ cross_product a b = a -> - forall (a b: R -> R -> R), - cross_product a b = a. -Proof. Admitted. -End putnam_2010_a5. *) - -(* From mathcomp Require Import matrix ssralg ssrbool. -Open Scope ring_scope. -Definition putnam_1991_a2_solution := False. -Theorem putnam_1991_a2 - (R : comUnitRingType) - (n : nat) - (npos : n >= 1) - : (exists A B : 'M[R]_n, A <> B /\ mulmx (mulmx A A) A = mulmx (mulmx B B) B /\ - mulmx (mulmx A A) B = mulmx (mulmx B B) A /\ - (mulmx A A + mulmx B B) \in unitmx) <-> putnam_1991_a2_solution. -Proof. Admitted. *) - -(* From mathcomp Require Import ssrnat ssrnum ssralg poly polydiv seq. -Open Scope ring_scope. -Definition putnam_1992_b4_solution := 3984%nat. -Theorem putnam_1992_b4 - (R : numDomainType) - (itercomp := fix iter (f : {poly R} * {poly R} -> {poly R} * {poly R}) (n : nat) (p : {poly R} * {poly R}) : {poly R} * {poly R} := - match n with - | O => p - | S n' => f (iter f n' p) - end) - (qr : {poly R} * {poly R} -> {poly R} * {poly R} := fun duple => (deriv (fst duple) * snd duple - deriv (snd duple) * fst duple, snd duple * snd duple)) - (valid : {poly R} -> Prop := fun p => p <> 0 /\ lt (size p) 1992 /\ exists c : R, gcdp_rec p ('X^3 - 'X) = polyC c) - (twople : {poly R} -> {poly R} -> Prop := fun p f => exists g : {poly R}, g * fst (itercomp qr 1992%nat (p, 'X^3 - 'X)) = f * snd (itercomp qr 1992%nat (p, 'X^3 - 'X))) - (min : nat) - (hmineq : exists p f : {poly R}, (valid p /\ twople p f) /\ size f = min) - (hminlb : forall p f : {poly R}, (valid p /\ twople p f) -> ge (size f) min) - : min = putnam_1992_b4_solution. -Proof. Admitted. *) - -(* Require Import Reals -GeoCoq.Main.Tarski_dev.Ch16_coordinates_with_functions -GeoCoq.Axioms.Definitions -GeoCoq.Main.Highschool.triangles. -Context `{T2D:Tarski_2D} `{TE:@Tarski_euclidean Tn TnEQD}. -Open Scope R. -Definition putnam_2003_b5_solution (pt_to_R : Tpoint -> (R * R)) (dist : Tpoint -> Tpoint -> R) (P Op : Tpoint) := sqrt 3 * (1 - (dist P Op) ^ 2 - 1). -Theorem putnam_2003_b5 - (pt_to_R : Tpoint -> (R * R)) - (F_to_R : F -> R) - (dist : Tpoint -> Tpoint -> R := fun A B => let (a, b) := pt_to_R A in let (c, d) := pt_to_R B in dist_euc a b c d) - (Triangle : Tpoint -> Tpoint -> Tpoint -> Prop := fun x y z => ~ Col x y z) (* copied from GeoCoq.Axioms.euclidean_axioms *) - (A B C Op Op' P: Tpoint) - (fixpoint : dist Op Op' = R1) - (hABC : OnCircle A Op Op' /\ OnCircle B Op Op' /\ OnCircle C Op Op') - (hABC' : Main.Highschool.triangles.equilateral A B C) - (hp : InCircle P Op Op') - (a : R := dist P A) - (b : R := dist P B) - (c : R := dist P C) - : exists (A' B' C' : Tpoint) (D: Cs O E A' B' C'), - Triangle A' B' C' /\ dist A' B' = a /\ dist B' C' = b /\ dist C' A' = c /\ - F_to_R (signed_area A' B' C' D A' B' C') = (putnam_2003_b5_solution pt_to_R dist P Op). -Proof. Admitted. *) - -(* Require Import Reals Rgeom ZArith -GeoCoq.Main.Tarski_dev.Ch16_coordinates_with_functions -GeoCoq.Main.Annexes.midpoint_theorems -GeoCoq.Main.Highschool.circumcenter. -Context `{T2D:Tarski_2D} `{TE:@Tarski_euclidean Tn TnEQD}. -Open Scope R. -Definition putnam_1997_a1_solution := 28. -Theorem putnam_1997_a1 - (pt_to_R : Tpoint -> (R * R)) - (dist : Tpoint -> Tpoint -> R := fun A B => let (a, b) := pt_to_R A in let (c, d) := pt_to_R B in dist_euc a b c d) - (A B C : Tpoint) - (Hp Op Mp Fp : Tpoint) - (l1 : dist Hp Op = 11) - (l2 : dist Op Mp = 5) - (s : Rectangle Hp Op Mp Fp) - (hHp : Bet A Fp Hp) - (hOp : is_circumcenter Op A B C) - (hMp : Midpoint B C Mp) - (hFp : Perp A C B Fp /\ Col A C Fp) - : dist B C = putnam_1997_a1_solution. -Proof. Admitted. *) - -(* Require Import Nat Reals Coquelicot.Coquelicot ZArith. -Theorem putnam_1997_b4 - (a : nat -> nat -> Z) - (max_degree : nat -> nat) - (coeff : nat -> (nat -> R)) - (hpoly : forall (m : nat) (x : R), sum_n (fun i => (coeff m i) * (x^i)) (max_degree m) = (1 + x + x^2)^m) - (ha : forall m n : nat, IZR (a m n) = coeff m n) - : forall k : nat, ge k 0 -> 0 <= (sum_n (fun i => (-1)^i * (IZR (a (Nat.sub k i) i))) (Z.to_nat (Coquelicot.Rcomplements.floor (2 * INR k / 3)))) <= 1. -Proof. Admitted. *) - -(* From mathcomp Require Import ssralg ssrnum fintype seq poly. -Open Scope ring_scope. -Variable (R: numDomainType). -Definition putnam_1985_a6_solution : {poly R} := 6%:R *: 'X^2 + 5%:R *: 'X + 1%:R. -Theorem putnam_1985_a6 - (g : {poly R} := 3%:R *: 'X^2 + 7%:R *: 'X + 2%:R) - (Comp_poly_n := fix comp_poly_n (p : {poly R}) (n : nat) : {poly R} := - match n with - | O => 1 - | S n' => comp_poly (comp_poly_n p n') p - end) - : forall (f: {poly R}), f`_0 = 0 -> - forall (n: nat), - let F : {poly R} := Comp_poly_n f n in - let G : {poly R} := Comp_poly_n g n in - (\sum_(i < size F) F`_i) = (\sum_(i < size G) G`_i) - <-> f = putnam_1985_a6_solution. -Proof. Admitted. *) \ No newline at end of file From e9f4b877569221d3f8a880b2b29a5cd30b322915 Mon Sep 17 00:00:00 2001 From: George Tsoukalas Date: Mon, 21 Oct 2024 14:56:13 +0000 Subject: [PATCH 4/4] Add 1 new Coq formalization --- coq/src/putnam_1985_a6.v | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 coq/src/putnam_1985_a6.v diff --git a/coq/src/putnam_1985_a6.v b/coq/src/putnam_1985_a6.v new file mode 100644 index 00000000..b879d8df --- /dev/null +++ b/coq/src/putnam_1985_a6.v @@ -0,0 +1,17 @@ +From mathcomp Require Import all_ssreflect all_algebra. +From mathcomp Require Import reals. + +Set Implicit Arguments. +Unset Strict Implicit. +Unset Printing Implicit Defensive. + +Open Scope ring_scope. + +Variable R : realType. +Definition putnam_1985_a6_solution : {poly R} := 6 * 'X ^ 2 + 5 * 'X + 1. +Theorem putnam_1985_a6 + (Gamma : {poly R} -> R := fun p => \sum_(i <- p) (i ^+ 2)) + (f : {poly R} := 3 * 'X ^ 2 + 7 * 'X + 2) + : let g := putnam_1985_a6_solution in + g.[0] = 1 /\ forall n : nat, ge n 1 -> Gamma (f ^ n) = Gamma (g ^ n). +Proof. Admitted. \ No newline at end of file