From d71a43b0f4dc95698d5fee0c905399d99f679c75 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Sat, 13 Jul 2024 00:35:10 +0200
Subject: [PATCH 1/9] add gr_mat_set_gr_mat_other

---
 doc/source/gr_mat.rst         |  1 +
 src/gr/matrix.c               | 21 +--------------------
 src/gr_mat.h                  |  1 +
 src/gr_mat/set_gr_mat_other.c | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 37 insertions(+), 20 deletions(-)
 create mode 100644 src/gr_mat/set_gr_mat_other.c

diff --git a/doc/source/gr_mat.rst b/doc/source/gr_mat.rst
index 9a2ac83670..e4a039cdfd 100644
--- a/doc/source/gr_mat.rst
+++ b/doc/source/gr_mat.rst
@@ -154,6 +154,7 @@ Assignment and special values
 .. function:: int gr_mat_set(gr_mat_t res, const gr_mat_t mat, gr_ctx_t ctx)
               int gr_mat_set_fmpz_mat(gr_mat_t res, const fmpz_mat_t mat, gr_ctx_t ctx)
               int gr_mat_set_fmpq_mat(gr_mat_t res, const fmpq_mat_t mat, gr_ctx_t ctx)
+              int gr_mat_set_gr_mat_other(gr_mat_t res, const gr_mat_t mat, gr_ctx_t mat_ctx, gr_ctx_t ctx)
 
     Sets *res* to the value of *mat*.
 
diff --git a/src/gr/matrix.c b/src/gr/matrix.c
index 9a0e6aab4c..ba1af2ad0a 100644
--- a/src/gr/matrix.c
+++ b/src/gr/matrix.c
@@ -221,9 +221,6 @@ matrix_set_other(gr_mat_t res, gr_srcptr x, gr_ctx_t x_ctx, gr_ctx_t ctx)
     else if (x_ctx->which_ring == GR_CTX_GR_MAT)
     {
         const gr_mat_struct * xmat = x;
-        slong i, j;
-        int status;
-        slong sz, xsz;
 
         if (res->r != xmat->r || res->c != xmat->c)
         {
@@ -233,24 +230,8 @@ matrix_set_other(gr_mat_t res, gr_srcptr x, gr_ctx_t x_ctx, gr_ctx_t ctx)
                 return GR_DOMAIN;
         }
 
-        sz = MATRIX_CTX(ctx)->base_ring->sizeof_elem;
-        xsz = MATRIX_CTX(x_ctx)->base_ring->sizeof_elem;
-
-        for (i = 0; i < xmat->r; i++)
-        {
-            for (j = 0; j < xmat->c; j++)
-            {
-                status = gr_set_other(GR_MAT_ENTRY(res, i, j, sz),
-                            GR_MAT_ENTRY(xmat, i, j, xsz),
-                            MATRIX_CTX(x_ctx)->base_ring,
+        return gr_mat_set_gr_mat_other(res, xmat, MATRIX_CTX(x_ctx)->base_ring,
                             MATRIX_CTX(ctx)->base_ring);
-
-                if (status != GR_SUCCESS)
-                    return status;
-            }
-        }
-
-        return GR_SUCCESS;
     }
     else
     {
diff --git a/src/gr_mat.h b/src/gr_mat.h
index c8c722a4cc..fdcd7f7f61 100644
--- a/src/gr_mat.h
+++ b/src/gr_mat.h
@@ -118,6 +118,7 @@ WARN_UNUSED_RESULT int gr_mat_set_fmpq(gr_mat_t res, const fmpq_t v, gr_ctx_t ct
 
 WARN_UNUSED_RESULT int gr_mat_set_fmpz_mat(gr_mat_t res, const fmpz_mat_t mat, gr_ctx_t ctx);
 WARN_UNUSED_RESULT int gr_mat_set_fmpq_mat(gr_mat_t res, const fmpq_mat_t mat, gr_ctx_t ctx);
+WARN_UNUSED_RESULT int gr_mat_set_gr_mat_other(gr_mat_t res, const gr_mat_t mat, gr_ctx_t mat_ctx, gr_ctx_t ctx);
 
 WARN_UNUSED_RESULT int gr_mat_neg(gr_mat_t res, const gr_mat_t mat, gr_ctx_t ctx);
 WARN_UNUSED_RESULT int gr_mat_swap_entrywise(gr_mat_t mat1, const gr_mat_t mat2, gr_ctx_t ctx);
diff --git a/src/gr_mat/set_gr_mat_other.c b/src/gr_mat/set_gr_mat_other.c
new file mode 100644
index 0000000000..a410ae3d8a
--- /dev/null
+++ b/src/gr_mat/set_gr_mat_other.c
@@ -0,0 +1,34 @@
+/*
+    Copyright (C) 2022 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "gr_vec.h"
+#include "gr_mat.h"
+/* Set res (entries over ctx) to mat (entries over mat_ctx), converting entry by entry with gr_set_other. */
+int
+gr_mat_set_gr_mat_other(gr_mat_t res, const gr_mat_t mat, gr_ctx_t mat_ctx, gr_ctx_t ctx)
+{
+    slong R, C, i, j;
+    slong sz = ctx->sizeof_elem;
+    slong mat_sz = mat_ctx->sizeof_elem;
+    int status = GR_SUCCESS;
+    /* The shape is read from mat; res must already have the same shape. */
+    R = gr_mat_nrows(mat, mat_ctx);
+    C = gr_mat_ncols(mat, mat_ctx);
+
+    if (R != gr_mat_nrows(res, ctx) || C != gr_mat_ncols(res, ctx))
+        return GR_DOMAIN;
+    /* The inner loop condition stops all further conversions after the first failing entry. */
+    for (i = 0; i < R; i++)
+        for (j = 0; j < C && status == GR_SUCCESS; j++)
+            status |= gr_set_other(GR_MAT_ENTRY(res, i, j, sz), GR_MAT_ENTRY(mat, i, j, mat_sz), mat_ctx, ctx);
+
+    return status;
+}
\ No newline at end of file

From ed376545fa621e999734f3ae216cb6c13ce56071 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Sat, 13 Jul 2024 15:02:34 +0200
Subject: [PATCH 2/9] add some generic entrywise functions for gr_mat

---
 doc/source/gr_mat.rst  |  27 +++++++++
 src/gr_mat.h           |   9 +++
 src/gr_mat/entrywise.c | 121 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 157 insertions(+)
 create mode 100644 src/gr_mat/entrywise.c

diff --git a/doc/source/gr_mat.rst b/doc/source/gr_mat.rst
index e4a039cdfd..efb90c0979 100644
--- a/doc/source/gr_mat.rst
+++ b/doc/source/gr_mat.rst
@@ -214,6 +214,33 @@ Basic row, column and entry operations
     This predicate is always decidable (even if the underlying ring
     is not computable), returning ``T_TRUE`` or ``T_FALSE``.
 
+Entrywise operations
+-------------------------------------------------------------------------------
+
+.. function:: int gr_mat_entrywise_unary_op(gr_mat_t res, gr_method_unary_op f, const gr_mat_t mat, gr_ctx_t ctx)
+
+    Sets *res* to the application of the function *f* to the
+    entries of matrix *mat*. Returns ``GR_DOMAIN`` if the matrix dimensions do not match.
+
+.. function:: int gr_mat_entrywise_binary_op(gr_mat_t res, gr_method_binary_op f, const gr_mat_t mat1, const gr_mat_t mat2, gr_ctx_t ctx)
+
+    Sets *res* to the application of the function *f*
+    to the entries of *mat1* as first argument and the entries of *mat2*
+    as second argument.
+    Returns ``GR_DOMAIN`` if the matrix dimensions do not match.
+
+.. function:: int gr_mat_entrywise_binary_op_scalar(gr_mat_t res, gr_method_binary_op f, const gr_mat_t mat, gr_srcptr c, gr_ctx_t ctx)
+
+    Sets *res* to the application of the function *f*
+    to the entries of *mat* as first argument and the scalar *c*
+    as second argument.
+    Returns ``GR_DOMAIN`` if the matrix dimensions do not match.
+
+.. function:: truth_t gr_mat_entrywise_unary_predicate_all(gr_method_unary_predicate f, const gr_mat_t mat, gr_ctx_t ctx)
+              truth_t gr_mat_entrywise_unary_predicate_any(gr_method_unary_predicate f, const gr_mat_t mat, gr_ctx_t ctx)
+
+    Returns whether the predicate *f* is true for all entries,
+    respectively for any entry, in the matrix *mat*.
 
 Arithmetic
 -------------------------------------------------------------------------------
diff --git a/src/gr_mat.h b/src/gr_mat.h
index fdcd7f7f61..ff1715bf03 100644
--- a/src/gr_mat.h
+++ b/src/gr_mat.h
@@ -120,6 +120,15 @@ WARN_UNUSED_RESULT int gr_mat_set_fmpz_mat(gr_mat_t res, const fmpz_mat_t mat, g
 WARN_UNUSED_RESULT int gr_mat_set_fmpq_mat(gr_mat_t res, const fmpq_mat_t mat, gr_ctx_t ctx);
 WARN_UNUSED_RESULT int gr_mat_set_gr_mat_other(gr_mat_t res, const gr_mat_t mat, gr_ctx_t mat_ctx, gr_ctx_t ctx);
 
+/* fixme: needed for method typedefs */
+#ifdef GR_H
+WARN_UNUSED_RESULT int gr_mat_entrywise_unary_op(gr_mat_t res, gr_method_unary_op f, const gr_mat_t mat, gr_ctx_t ctx);
+WARN_UNUSED_RESULT int gr_mat_entrywise_binary_op(gr_mat_t res, gr_method_binary_op f, const gr_mat_t mat1, const gr_mat_t mat2, gr_ctx_t ctx);
+WARN_UNUSED_RESULT int gr_mat_entrywise_binary_op_scalar(gr_mat_t res, gr_method_binary_op f, const gr_mat_t mat, gr_srcptr c, gr_ctx_t ctx);
+truth_t gr_mat_entrywise_unary_predicate_all(gr_method_unary_predicate f, const gr_mat_t mat, gr_ctx_t ctx);
+truth_t gr_mat_entrywise_unary_predicate_any(gr_method_unary_predicate f, const gr_mat_t mat, gr_ctx_t ctx);
+#endif
+
 WARN_UNUSED_RESULT int gr_mat_neg(gr_mat_t res, const gr_mat_t mat, gr_ctx_t ctx);
 WARN_UNUSED_RESULT int gr_mat_swap_entrywise(gr_mat_t mat1, const gr_mat_t mat2, gr_ctx_t ctx);
 WARN_UNUSED_RESULT int gr_mat_add(gr_mat_t res, const gr_mat_t mat1, const gr_mat_t mat2, gr_ctx_t ctx);
diff --git a/src/gr_mat/entrywise.c b/src/gr_mat/entrywise.c
new file mode 100644
index 0000000000..4519d6a4d1
--- /dev/null
+++ b/src/gr_mat/entrywise.c
@@ -0,0 +1,121 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "gr.h"
+#include "gr_mat.h"
+/* Apply the unary operation f to each entry of mat, writing the results to res. */
+int
+gr_mat_entrywise_unary_op(gr_mat_t res, gr_method_unary_op f, const gr_mat_t mat, gr_ctx_t ctx)
+{
+    slong R, C, i, j;
+    slong sz = ctx->sizeof_elem;
+    int status = GR_SUCCESS;
+
+    R = gr_mat_nrows(mat, ctx);
+    C = gr_mat_ncols(mat, ctx);
+    /* res must already have the shape of mat; nothing is written on a mismatch. */
+    if (R != gr_mat_nrows(res, ctx) || C != gr_mat_ncols(res, ctx))
+        return GR_DOMAIN;
+    /* f is called on every entry (no early exit); the returned status flags are OR-ed together. */
+    for (i = 0; i < R; i++)
+        for (j = 0; j < C; j++)
+            status |= f(GR_MAT_ENTRY(res, i, j, sz), GR_MAT_ENTRY(mat, i, j, sz), ctx);
+
+    return status;
+}
+/* Apply the binary operation f to corresponding entries of mat1 and mat2, writing the results to res. */
+int
+gr_mat_entrywise_binary_op(gr_mat_t res, gr_method_binary_op f, const gr_mat_t mat1, const gr_mat_t mat2, gr_ctx_t ctx)
+{
+    slong R, C, i, j;
+    slong sz = ctx->sizeof_elem;
+    int status = GR_SUCCESS;
+
+    R = gr_mat_nrows(mat1, ctx);
+    C = gr_mat_ncols(mat1, ctx);
+    /* res, mat1 and mat2 must all share one shape; nothing is written on a mismatch. */
+    if (R != gr_mat_nrows(res, ctx) || C != gr_mat_ncols(res, ctx) || R != gr_mat_nrows(mat2, ctx) || C != gr_mat_ncols(mat2, ctx))
+        return GR_DOMAIN;
+    /* f is called on every entry pair (no early exit); the returned status flags are OR-ed together. */
+    for (i = 0; i < R; i++)
+        for (j = 0; j < C; j++)
+            status |= f(GR_MAT_ENTRY(res, i, j, sz), GR_MAT_ENTRY(mat1, i, j, sz), GR_MAT_ENTRY(mat2, i, j, sz), ctx);
+
+    return status;
+}
+/* Apply f(entry, c) to each entry of mat with the fixed second argument c, writing the results to res. */
+int
+gr_mat_entrywise_binary_op_scalar(gr_mat_t res, gr_method_binary_op f, const gr_mat_t mat, gr_srcptr c, gr_ctx_t ctx)
+{
+    slong R, C, i, j;
+    slong sz = ctx->sizeof_elem;
+    int status = GR_SUCCESS;
+
+    R = gr_mat_nrows(mat, ctx);
+    C = gr_mat_ncols(mat, ctx);
+    /* res must already have the shape of mat; nothing is written on a mismatch. */
+    if (R != gr_mat_nrows(res, ctx) || C != gr_mat_ncols(res, ctx))
+        return GR_DOMAIN;
+    /* f is called on every entry (no early exit); the returned status flags are OR-ed together. */
+    for (i = 0; i < R; i++)
+        for (j = 0; j < C; j++)
+            status |= f(GR_MAT_ENTRY(res, i, j, sz), GR_MAT_ENTRY(mat, i, j, sz), c, ctx);
+
+    return status;
+}
+/* Combine the predicate f over all entries of mat with truth_and; an empty matrix gives T_TRUE. */
+truth_t
+gr_mat_entrywise_unary_predicate_all(gr_method_unary_predicate f, const gr_mat_t mat, gr_ctx_t ctx)
+{
+    slong R, C, i, j;
+    slong sz = ctx->sizeof_elem;
+    truth_t val, ans = T_TRUE;
+
+    R = gr_mat_nrows(mat, ctx);
+    C = gr_mat_ncols(mat, ctx);
+    /* A single T_FALSE decides the answer, so the scan stops there; other values are folded into ans. */
+    for (i = 0; i < R; i++)
+    {
+        for (j = 0; j < C; j++)
+        {
+            val = f(GR_MAT_ENTRY(mat, i, j, sz), ctx);
+            if (val == T_FALSE)
+                return T_FALSE;
+            ans = truth_and(ans, val);
+        }
+    }
+
+    return ans;
+}
+/* Combine the predicate f over all entries of mat with truth_or; an empty matrix gives T_FALSE. */
+truth_t
+gr_mat_entrywise_unary_predicate_any(gr_method_unary_predicate f, const gr_mat_t mat, gr_ctx_t ctx)
+{
+    slong R, C, i, j;
+    slong sz = ctx->sizeof_elem;
+    truth_t val, ans = T_FALSE;
+
+    R = gr_mat_nrows(mat, ctx);
+    C = gr_mat_ncols(mat, ctx);
+    /* A single T_TRUE decides the answer, so the scan stops there; other values are folded into ans. */
+    for (i = 0; i < R; i++)
+    {
+        for (j = 0; j < C; j++)
+        {
+            val = f(GR_MAT_ENTRY(mat, i, j, sz), ctx);
+            if (val == T_TRUE)
+                return T_TRUE;
+            ans = truth_or(ans, val);
+        }
+    }
+
+    return ans;
+}

From 666eb6ae57ca0ed336a44548615c98a9a185edcc Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Sat, 13 Jul 2024 16:48:11 +0200
Subject: [PATCH 3/9] wip matrix mul for nfloat

---
 doc/source/gr.rst            |  13 +
 doc/source/gr_mat.rst        |  13 +
 src/gr.h                     |  16 +
 src/gr_mat.h                 |   2 +
 src/gr_mat/entrywise.c       |  27 +
 src/gr_mat/randtest.c        |  17 +-
 src/gr_mat/test_approx_mul.c | 119 +++++
 src/nfloat.h                 |   5 +
 src/nfloat/mat_mul.c         | 926 +++++++++++++++++++++++++++++++++++
 src/nfloat/nfloat.c          |   2 +-
 src/nfloat/test/main.c       |   2 +
 src/nfloat/test/t-mat_mul.c  |  67 +++
 12 files changed, 1203 insertions(+), 6 deletions(-)
 create mode 100644 src/gr_mat/test_approx_mul.c
 create mode 100644 src/nfloat/mat_mul.c
 create mode 100644 src/nfloat/test/t-mat_mul.c

diff --git a/doc/source/gr.rst b/doc/source/gr.rst
index 15183b3195..c535e86b20 100644
--- a/doc/source/gr.rst
+++ b/doc/source/gr.rst
@@ -844,6 +844,19 @@ Ordering methods
     of *x* is less than, equal or greater than the absolute value of *y*.
     This may return ``GR_DOMAIN`` if the ring is not an ordered ring.
 
+.. function:: truth_t gr_le(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+              truth_t gr_lt(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+              truth_t gr_ge(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+              truth_t gr_gt(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+              truth_t gr_abs_le(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+              truth_t gr_abs_lt(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+              truth_t gr_abs_ge(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+              truth_t gr_abs_gt(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+
+    Wrappers of ``gr_cmp`` and ``gr_cmpabs`` returning truth values
+    for the comparison operations ``<=``, ``<``, ``>=``, ``>``.
+
+
 Enclosure and interval methods
 ........................................................................
 
diff --git a/doc/source/gr_mat.rst b/doc/source/gr_mat.rst
index efb90c0979..68580ef8d7 100644
--- a/doc/source/gr_mat.rst
+++ b/doc/source/gr_mat.rst
@@ -242,6 +242,13 @@ Entrywise operations
     Returns whether the predicate *f* is true for all entries,
     respectively for any entry, in the matrix *mat*.
 
+.. function:: truth_t gr_mat_entrywise_binary_predicate_all(gr_method_binary_predicate f, const gr_mat_t mat1, const gr_mat_t mat2, gr_ctx_t ctx)
+
+    Returns whether the binary predicate *f* is true for all entries
+    in *mat1* paired with the corresponding entries in *mat2*.
+    Returns ``T_FALSE`` if the matrix dimensions are not compatible.
+
+
 Arithmetic
 -------------------------------------------------------------------------------
 
@@ -830,6 +837,12 @@ on each test iteration, otherwise the given ring is tested.
     Tests the given function ``solve_impl`` for correctness as an implementation
     of :func:`gr_mat_nonsingular_solve_tril` / :func:`gr_mat_nonsingular_solve_triu`.
 
+.. function:: void gr_mat_test_approx_mul_pos_entrywise_accurate(gr_method_mat_binary_op mul_impl, gr_srcptr rel_tol, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx)
+
+    Tests the given implementation of matrix multiplication for accuracy
+    over an approximate numerical ring by generating nonnegative matrices
+    and checking that the entrywise relative error compared to
+    classical multiplication does not exceed *rel_tol*.
 
 .. raw:: latex
 
diff --git a/src/gr.h b/src/gr.h
index 728366c7ac..347ab28a8f 100644
--- a/src/gr.h
+++ b/src/gr.h
@@ -1109,6 +1109,22 @@ GR_INLINE WARN_UNUSED_RESULT int gr_cmpabs(int * res, gr_srcptr x, gr_srcptr y,
 GR_INLINE WARN_UNUSED_RESULT int gr_cmp_other(int * res, gr_srcptr x, gr_srcptr y, gr_ctx_t y_ctx, gr_ctx_t ctx) { return GR_BINARY_OP_OTHER_GET_INT(ctx, CMP_OTHER)(res, x, y, y_ctx, ctx); }
 GR_INLINE WARN_UNUSED_RESULT int gr_cmpabs_other(int * res, gr_srcptr x, gr_srcptr y, gr_ctx_t y_ctx, gr_ctx_t ctx) { return GR_BINARY_OP_OTHER_GET_INT(ctx, CMPABS_OTHER)(res, x, y, y_ctx, ctx); }
 
+#define __GR_CMP(cfun, expr) /* common body of the truth_t comparison wrappers below */ \
+    int cmp; \
+    if ((cfun)(&cmp, x, y, ctx) != GR_SUCCESS) \
+        return T_UNKNOWN; \
+    return (expr) ? T_TRUE : T_FALSE;
+/* Truth-valued wrappers of gr_cmp: T_UNKNOWN whenever the comparison itself does not return GR_SUCCESS. */
+GR_INLINE WARN_UNUSED_RESULT truth_t gr_le(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmp, cmp <= 0) }
+GR_INLINE WARN_UNUSED_RESULT truth_t gr_lt(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmp, cmp < 0) }
+GR_INLINE WARN_UNUSED_RESULT truth_t gr_ge(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmp, cmp >= 0) }
+GR_INLINE WARN_UNUSED_RESULT truth_t gr_gt(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmp, cmp > 0) }
+/* The same comparisons applied to absolute values, via gr_cmpabs. */
+GR_INLINE WARN_UNUSED_RESULT truth_t gr_abs_le(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmpabs, cmp <= 0) }
+GR_INLINE WARN_UNUSED_RESULT truth_t gr_abs_lt(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmpabs, cmp < 0) }
+GR_INLINE WARN_UNUSED_RESULT truth_t gr_abs_ge(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmpabs, cmp >= 0) }
+GR_INLINE WARN_UNUSED_RESULT truth_t gr_abs_gt(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmpabs, cmp > 0) }
+
 GR_INLINE WARN_UNUSED_RESULT int gr_gen(gr_ptr res, gr_ctx_t ctx) { return GR_CONSTANT_OP(ctx, GEN)(res, ctx); }
 GR_INLINE WARN_UNUSED_RESULT int gr_gens(gr_vec_t res, gr_ctx_t ctx) { return GR_VEC_CTX_OP(ctx, GENS)(res, ctx); }
 GR_INLINE WARN_UNUSED_RESULT int gr_gens_recursive(gr_vec_t res, gr_ctx_t ctx) { return GR_VEC_CTX_OP(ctx, GENS_RECURSIVE)(res, ctx); }
diff --git a/src/gr_mat.h b/src/gr_mat.h
index ff1715bf03..4f5a15af1a 100644
--- a/src/gr_mat.h
+++ b/src/gr_mat.h
@@ -127,6 +127,7 @@ WARN_UNUSED_RESULT int gr_mat_entrywise_binary_op(gr_mat_t res, gr_method_binary
 WARN_UNUSED_RESULT int gr_mat_entrywise_binary_op_scalar(gr_mat_t res, gr_method_binary_op f, const gr_mat_t mat, gr_srcptr c, gr_ctx_t ctx);
 truth_t gr_mat_entrywise_unary_predicate_all(gr_method_unary_predicate f, const gr_mat_t mat, gr_ctx_t ctx);
 truth_t gr_mat_entrywise_unary_predicate_any(gr_method_unary_predicate f, const gr_mat_t mat, gr_ctx_t ctx);
+truth_t gr_mat_entrywise_binary_predicate_all(gr_method_binary_predicate f, const gr_mat_t mat1, const gr_mat_t mat2, gr_ctx_t ctx);
 #endif
 
 WARN_UNUSED_RESULT int gr_mat_neg(gr_mat_t res, const gr_mat_t mat, gr_ctx_t ctx);
@@ -296,6 +297,7 @@ void gr_mat_test_lu(gr_method_mat_lu_op lu_impl, flint_rand_t state, slong iters
 void gr_mat_test_det(gr_method_mat_unary_op_get_scalar det_impl, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
 void gr_mat_test_nonsingular_solve_tril(gr_method_mat_binary_op_with_flag solve_impl, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
 void gr_mat_test_nonsingular_solve_triu(gr_method_mat_binary_op_with_flag solve_impl, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
+void gr_mat_test_approx_mul_pos_entrywise_accurate(gr_method_mat_binary_op mul_impl, gr_srcptr rel_tol, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
 
 #ifdef __cplusplus
 }
diff --git a/src/gr_mat/entrywise.c b/src/gr_mat/entrywise.c
index 4519d6a4d1..d5d66be260 100644
--- a/src/gr_mat/entrywise.c
+++ b/src/gr_mat/entrywise.c
@@ -119,3 +119,30 @@ gr_mat_entrywise_unary_predicate_any(gr_method_unary_predicate f, const gr_mat_t
 
     return ans;
 }
+/* Combine the binary predicate f over corresponding entries of mat1 and mat2 with truth_and. */
+truth_t
+gr_mat_entrywise_binary_predicate_all(gr_method_binary_predicate f, const gr_mat_t mat1, const gr_mat_t mat2, gr_ctx_t ctx)
+{
+    slong R, C, i, j;
+    slong sz = ctx->sizeof_elem;
+    truth_t val, ans = T_TRUE;
+
+    R = gr_mat_nrows(mat1, ctx);
+    C = gr_mat_ncols(mat1, ctx);
+    /* Matrices of different shapes are reported as T_FALSE without calling f. */
+    if (R != gr_mat_nrows(mat2, ctx) || C != gr_mat_ncols(mat2, ctx))
+        return T_FALSE;
+    /* A single T_FALSE decides the answer, so the scan stops there; other values are folded into ans. */
+    for (i = 0; i < R; i++)
+    {
+        for (j = 0; j < C; j++)
+        {
+            val = f(GR_MAT_ENTRY(mat1, i, j, sz), GR_MAT_ENTRY(mat2, i, j, sz), ctx);
+            if (val == T_FALSE)
+                return T_FALSE;
+            ans = truth_and(ans, val);
+        }
+    }
+
+    return ans;
+}
diff --git a/src/gr_mat/randtest.c b/src/gr_mat/randtest.c
index ce30c13ad9..e4a820de4e 100644
--- a/src/gr_mat/randtest.c
+++ b/src/gr_mat/randtest.c
@@ -16,16 +16,23 @@
 int
 gr_mat_randtest(gr_mat_t mat, flint_rand_t state, gr_ctx_t ctx)
 {
-    int status;
-    slong i, r, c;
+    int status = GR_SUCCESS;
+    slong i, j, r, c;
+    slong sz = ctx->sizeof_elem;
 
     r = gr_mat_nrows(mat, ctx);
     c = gr_mat_ncols(mat, ctx);
 
-    status = GR_SUCCESS;
-    for (i = 0; i < r; i++)
+    if (n_randint(state, 10) == 0)
     {
-        status |= _gr_vec_randtest(mat->rows[i], state, c, ctx);
+        for (i = 0; i < r; i++)
+            for (j = 0; j < c; j++)
+                status |= gr_randtest(GR_MAT_ENTRY(mat, i, j, sz), state, ctx);
+    }
+    else
+    {
+        for (i = 0; i < r; i++)
+            status |= _gr_vec_randtest(mat->rows[i], state, c, ctx);
     }
 
     return status;
diff --git a/src/gr_mat/test_approx_mul.c b/src/gr_mat/test_approx_mul.c
new file mode 100644
index 0000000000..a491254a04
--- /dev/null
+++ b/src/gr_mat/test_approx_mul.c
@@ -0,0 +1,119 @@
+/*
+    Copyright (C) 2022, 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "gr.h"
+#include "gr_mat.h"
+/* Randomized accuracy test: compares mul_impl against gr_mat_mul_classical on entrywise nonnegative matrices. */
+void gr_mat_test_approx_mul_pos_entrywise_accurate(gr_method_mat_binary_op mul_impl, gr_srcptr rel_tol, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx)
+{
+    slong iter;
+    gr_ctx_ptr given_ctx = ctx;
+    /* given_ctx keeps the caller's ctx because the loop body declares its own ctx. */
+    for (iter = 0; iter < iters; iter++)
+    {
+        gr_mat_t A, B, C, D, ERR, TOL;
+        slong a, b, c;
+        int status = GR_SUCCESS;
+        gr_ctx_t my_ctx;
+        gr_ctx_ptr ctx;
+        /* A NULL ctx means: test a freshly generated random ring on each iteration. */
+        if (given_ctx == NULL)
+        {
+            gr_ctx_init_random(my_ctx, state);
+            ctx = my_ctx;
+        }
+        else
+            ctx = given_ctx;
+        /* A is a x b and B is b x c; the first branch forces a = b = c. */
+        if (n_randint(state, 4) == 0)
+        {
+            a = b = c = n_randint(state, maxn);
+        }
+        else
+        {
+            a = n_randint(state, maxn);
+            b = n_randint(state, maxn);
+            c = n_randint(state, maxn);
+        }
+
+        gr_mat_init(A, a, b, ctx);
+        gr_mat_init(B, b, c, ctx);
+        gr_mat_init(C, a, c, ctx);
+        gr_mat_init(D, a, c, ctx);
+        gr_mat_init(ERR, a, c, ctx);
+        gr_mat_init(TOL, a, c, ctx);
+        /* Random A and B, replaced by their entrywise gr_abs. */
+        status |= gr_mat_randtest(A, state, ctx);
+        status |= gr_mat_randtest(B, state, ctx);
+        status |= gr_mat_entrywise_unary_op(A, (gr_method_unary_op) gr_abs, A, ctx);
+        status |= gr_mat_entrywise_unary_op(B, (gr_method_unary_op) gr_abs, B, ctx);
+        /* C and D hold random data until the products below are stored in them. */
+        status |= gr_mat_randtest(C, state, ctx);
+        status |= gr_mat_randtest(D, state, ctx);
+        /* Randomly pick an aliased call (C = C*B, C = A*C, C = A*A, C = C*C) when the shapes permit. */
+        if (b == c && n_randint(state, 2))
+        {
+            status |= gr_mat_set(C, A, ctx);
+            status |= mul_impl(C, C, B, ctx);
+        }
+        else if (a == b && n_randint(state, 2))
+        {
+            status |= gr_mat_set(C, B, ctx);
+            status |= mul_impl(C, A, C, ctx);
+        }
+        else if (a == b && b == c && n_randint(state, 2))
+        {
+            status |= gr_mat_set(B, A, ctx);
+            status |= mul_impl(C, A, A, ctx);
+        }
+        else if (a == b && b == c && n_randint(state, 2))
+        {
+            status |= gr_mat_set(B, A, ctx);
+            status |= gr_mat_set(C, A, ctx);
+            status |= mul_impl(C, C, C, ctx);
+        }
+        else
+        {
+            status |= mul_impl(C, A, B, ctx);
+        }
+        /* Reference result. */
+        status |= gr_mat_mul_classical(D, A, B, ctx);
+
+        /* Entrywise check of |C-D| <= |D| * rel_tol */
+        status |= gr_mat_sub(ERR, C, D, ctx);
+        status |= gr_mat_entrywise_unary_op(ERR, (gr_method_unary_op) gr_abs, ERR, ctx);
+        status |= gr_mat_entrywise_unary_op(TOL, (gr_method_unary_op) gr_abs, D, ctx);
+        status |= gr_mat_mul_scalar(TOL, TOL, rel_tol, ctx);
+        /* Only a definite T_FALSE with status == GR_SUCCESS counts as a failure. */
+        if (status == GR_SUCCESS && gr_mat_entrywise_binary_predicate_all((gr_method_binary_predicate) gr_le, ERR, TOL, ctx) == T_FALSE)
+        {
+            flint_printf("FAIL:\n");
+            gr_ctx_println(ctx);
+            flint_printf("A:\n"); gr_mat_print(A, ctx); flint_printf("\n\n");
+            flint_printf("B:\n"); gr_mat_print(B, ctx); flint_printf("\n\n");
+            flint_printf("C:\n"); gr_mat_print(C, ctx); flint_printf("\n\n");
+            flint_printf("D:\n"); gr_mat_print(D, ctx); flint_printf("\n\n");
+            flint_printf("ERR:\n"); gr_mat_print(ERR, ctx); flint_printf("\n\n");
+            flint_printf("TOL:\n"); gr_mat_print(TOL, ctx); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        gr_mat_clear(A, ctx);
+        gr_mat_clear(B, ctx);
+        gr_mat_clear(C, ctx);
+        gr_mat_clear(D, ctx);
+        gr_mat_clear(ERR, ctx);
+        gr_mat_clear(TOL, ctx);
+
+        if (given_ctx == NULL)
+            gr_ctx_clear(ctx);
+    }
+}
diff --git a/src/nfloat.h b/src/nfloat.h
index 45a16d3eee..4a5d61497a 100644
--- a/src/nfloat.h
+++ b/src/nfloat.h
@@ -453,6 +453,11 @@ int _nfloat_vec_submul_scalar(nfloat_ptr res, nfloat_srcptr x, slong len, nfloat
 int _nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx);
 int _nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx);
 
+int _nfloat_mat_mul_fixed_classical(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx);
+int _nfloat_mat_mul_waksman(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx);
+int nfloat_mat_mul_block(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, slong min_block_size, gr_ctx_t ctx);
+int nfloat_mat_mul(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx);
+
 /* Complex numbers */
 /* Note: we use the same context data for real and complex rings
    (only which_ring and sizeof_elem differ). This allows us to call
diff --git a/src/nfloat/mat_mul.c b/src/nfloat/mat_mul.c
new file mode 100644
index 0000000000..b943a4c33d
--- /dev/null
+++ b/src/nfloat/mat_mul.c
@@ -0,0 +1,926 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "mpn_extras.h"
+#include "gr.h"
+#include "gr_mat.h"
+#include "gr_generic.h"
+#include "acf.h"
+#include "acb.h"
+#include "nfloat.h"
+
+#include "gr.h"
+#include "nfloat.h"
+#include "gr_vec.h"
+#include "gr_mat.h"
+#include "gr_special.h"
+#include "fmpz_mat.h"
+
+
+int
+nfloat_mat_mul(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx);
+
+
+/* For printing */
+#include "arf.h"
+
+/* Arithmetic on fixed-point numbers in (-1,1) */
+/* x[0] stores the sign bit, x[1], ..., x[n] store the absolute value */
+/* Debug helper: print the nfixed number x (sign limb x[0], then nlimbs limbs), scaled by 2^exp, in decimal. */
+void
+nfixed_print(nn_srcptr x, slong nlimbs, slong exp)
+{
+    arf_t t;
+    arf_init(t);
+    _arf_set_mpn_fixed(t, x + 1, nlimbs, nlimbs, x[0], nlimbs * FLINT_BITS, ARF_RND_DOWN);
+    arf_mul_2exp_si(t, t, exp);
+    arf_printd(t, nlimbs * FLINT_BITS / 3.321928 + 1); /* 3.321928 ~ log2(10): bit count -> decimal digit count */
+    arf_clear(t);
+}
+
+
+/* todo: don't do this */
+#define NFIXED_MAX_NLIMBS (2 * NFLOAT_MAX_LIMBS)
+/* res = a + b on sign-magnitude fixed-point numbers (limb 0 = sign, limbs 1..nlimbs = magnitude). */
+FLINT_FORCE_INLINE
+void nfixed_add(nn_ptr res, nn_srcptr a, nn_srcptr b, slong nlimbs)
+{
+    int asgn, bsgn;
+    asgn = a[0];
+    bsgn = b[0];
+    /* Equal signs: add magnitudes (the carry returned by mpn_add_n is ignored). Otherwise subtract; the returned flag is XOR-ed into a's sign. */
+    if (asgn == bsgn)
+    {
+        res[0] = asgn;
+        mpn_add_n(res + 1, a + 1, b + 1, nlimbs);
+    }
+    else
+    {
+        res[0] = asgn ^ flint_mpn_signed_sub_n(res + 1, a + 1, b + 1, nlimbs);
+    }
+}
+/* res = a - b on sign-magnitude fixed-point numbers (limb 0 = sign, limbs 1..nlimbs = magnitude). */
+FLINT_FORCE_INLINE
+void nfixed_sub(nn_ptr res, nn_srcptr a, nn_srcptr b, slong nlimbs)
+{
+    int asgn, bsgn;
+    asgn = a[0];
+    bsgn = b[0];
+    /* Opposite signs: add magnitudes (the carry returned by mpn_add_n is ignored). Otherwise subtract; the returned flag is XOR-ed into a's sign. */
+    if (asgn != bsgn)
+    {
+        res[0] = asgn;
+        mpn_add_n(res + 1, a + 1, b + 1, nlimbs);
+    }
+    else
+    {
+        res[0] = asgn ^ flint_mpn_signed_sub_n(res + 1, a + 1, b + 1, nlimbs);
+    }
+}
+/* res = a * b: the signs are XOR-ed; the magnitude comes from flint_mpn_mulhigh_n (presumably the high nlimbs limbs of the product; TODO confirm). */
+FLINT_FORCE_INLINE
+void nfixed_mul(nn_ptr res, nn_srcptr a, nn_srcptr b, slong nlimbs)
+{
+    int asgn, bsgn;
+    asgn = a[0];
+    bsgn = b[0];
+    res[0] = asgn ^ bsgn;
+    flint_mpn_mulhigh_n(res + 1, a + 1, b + 1, nlimbs);
+}
+/* res = a^2: the sign limb is set to 0 regardless of a's sign; the magnitude comes from flint_mpn_sqrhigh. */
+FLINT_FORCE_INLINE
+void nfixed_sqr(nn_ptr res, nn_srcptr a, slong nlimbs)
+{
+    res[0] = 0;
+    flint_mpn_sqrhigh(res + 1, a + 1, nlimbs);
+}
+/* res = a / 2: copies the sign limb and shifts the magnitude right by one bit (the bit shifted out is dropped). */
+FLINT_FORCE_INLINE
+void nfixed_div2(nn_ptr res, nn_srcptr a, slong nlimbs)
+{
+    res[0] = a[0];
+    mpn_rshift(res + 1, a + 1, nlimbs, 1);
+}
+
+/* Computes C = A * B by the classical triple loop, where A is (m x n), B is (n x p), C is (m x p) */
+void
+_nfixed_mat_mul_classical(nn_ptr C, nn_srcptr A, nn_srcptr B, slong m, slong n, slong p, slong nlimbs)
+{
+    slong i, j, k;
+    nn_ptr t;
+    TMP_INIT;
+
+    TMP_START;
+    /* t holds a single product: one sign limb plus nlimbs magnitude limbs */
+    t = TMP_ALLOC((nlimbs + 1) * sizeof(ulong));
+    /* Row-major addressing; every matrix entry occupies (nlimbs + 1) limbs */
+#define A_ENTRY(i, j) ((A) + ((i) * n + (j)) * (nlimbs + 1))
+#define B_ENTRY(i, j) ((B) + ((i) * p + (j)) * (nlimbs + 1))
+#define C_ENTRY(i, j) ((C) + ((i) * p + (j)) * (nlimbs + 1))
+    /* NOTE(review): the k = 0 product is formed unconditionally, so n = 0 is not handled here; confirm callers guarantee n >= 1. */
+    for (i = 0; i < m; i++)
+    {
+        for (j = 0; j < p; j++)
+        {
+            nfixed_mul(C_ENTRY(i, j), A_ENTRY(i, 0), B_ENTRY(0, j), nlimbs);
+
+            for (k = 1; k < n; k++)
+            {
+                nfixed_mul(t, A_ENTRY(i, k), B_ENTRY(k, j), nlimbs);
+                nfixed_add(C_ENTRY(i, j), C_ENTRY(i, j), t, nlimbs);
+            }
+        }
+    }
+
+    TMP_END;
+
+#undef A_ENTRY
+#undef B_ENTRY
+#undef C_ENTRY
+}
+
+/* compute c += (a1 + b1) * (a2 + b2) on nfixed numbers with nlimbs magnitude limbs */
+/* val0, val1, val2 are scratch space: val1 and val2 receive the two sums, val0 the product */
+FLINT_FORCE_INLINE void
+addmul_addadd(nn_ptr val0, nn_ptr val1, nn_ptr val2, nn_ptr c, nn_srcptr a1, nn_srcptr b1, nn_srcptr a2, nn_srcptr b2, slong nlimbs)
+{
+    nfixed_add(val1, a1, b1, nlimbs);
+    nfixed_add(val2, a2, b2, nlimbs);
+    nfixed_mul(val0, val1, val2, nlimbs);
+    nfixed_add(c, c, val0, nlimbs);
+}
+
+/* compute c += (a1 - b1) * (a2 - b2) on nfixed numbers with nlimbs magnitude limbs */
+/* val0, val1, val2 are scratch space: val1 and val2 receive the two differences, val0 the product */
+FLINT_FORCE_INLINE void
+addmul_subsub(nn_ptr val0, nn_ptr val1, nn_ptr val2, nn_ptr c, nn_srcptr a1, nn_srcptr b1, nn_srcptr a2, nn_srcptr b2, slong nlimbs)
+{
+    nfixed_sub(val1, a1, b1, nlimbs);
+    nfixed_sub(val2, a2, b2, nlimbs);
+    nfixed_mul(val0, val1, val2, nlimbs);
+    nfixed_add(c, c, val0, nlimbs);
+}
+/* Waksman-style product of A (m x n) and B (n x p), accumulated into C (m x p); entries are nfixed numbers of (nlimbs + 1) limbs. */
+void
+_nfixed_mat_mul_waksman(nn_ptr C, nn_srcptr A, nn_srcptr B, slong m, slong n, slong p, slong nlimbs)
+{
+    slong l, j, k;
+    /* NOTE(review): C is only accumulated into and never cleared here, so callers presumably pass it zeroed; confirm. */
+    nn_ptr Ctmp = flint_calloc((nlimbs + 1) * ((p + m) + 5), sizeof(ulong));
+                                            /* Ctmp holds p + m + 5 entries in total */
+    nn_ptr Crow = Ctmp;                     /* Crow has p entries */
+    nn_ptr Ccol = Crow + (nlimbs + 1) * p;  /* Ccol has m entries */
+    nn_ptr val0 = Ccol + (nlimbs + 1) * m;  /* val0 has room for 2 sums */
+    nn_ptr val1 = val0 + (nlimbs + 1) * 2;  /* val1 has room for 1 sum   */
+    nn_ptr val2 = val1 + (nlimbs + 1);      /* val2 has room for 1 sum   */
+    nn_ptr crow = val2 + (nlimbs + 1);      /* crow has room for 1 sum   */
+
+#define A_ENTRY(i, j) ((A) + ((i) * n + (j)) * (nlimbs + 1))
+#define B_ENTRY(i, j) ((B) + ((i) * p + (j)) * (nlimbs + 1))
+#define C_ENTRY(i, j) ((C) + ((i) * p + (j)) * (nlimbs + 1))
+
+#define Crow_ENTRY(ii) (Crow + (ii) * (nlimbs + 1))
+#define Ccol_ENTRY(ii) (Ccol + (ii) * (nlimbs + 1))
+
+    slong np = n >> 1;
+    /* Walk the inner dimension in index pairs (j2 - 1, j2) = (0, 1), (2, 3), ... */
+    for (j = 1; j <= np; j++)
+    {
+        slong j2 = (j << 1) - 1;
+
+        for (k = 0; k < p; k++)
+        {
+            addmul_addadd(val0, val1, val2, C_ENTRY(0, k), A_ENTRY(0, j2-1), B_ENTRY(j2, k), A_ENTRY(0, j2), B_ENTRY(j2-1, k), nlimbs);
+            addmul_subsub(val0, val1, val2, Crow_ENTRY(k), A_ENTRY(0, j2-1), B_ENTRY(j2, k), A_ENTRY(0, j2), B_ENTRY(j2-1, k), nlimbs);
+        }
+
+        for (l = 1; l < m; l++)
+        {
+            addmul_addadd(val0, val1, val2, C_ENTRY(l, 0), A_ENTRY(l, j2-1), B_ENTRY(j2, 0), A_ENTRY(l, j2), B_ENTRY(j2-1, 0), nlimbs);
+            addmul_subsub(val0, val1, val2, Ccol_ENTRY(l), A_ENTRY(l, j2-1), B_ENTRY(j2, 0), A_ENTRY(l, j2), B_ENTRY(j2-1, 0), nlimbs);
+        }
+
+        for (k = 1; k < p; k++)
+        {
+            for (l = 1; l < m; l++)
+            {
+                addmul_addadd(val0, val1, val2, C_ENTRY(l, k), A_ENTRY(l, j2-1), B_ENTRY(j2, k), A_ENTRY(l, j2), B_ENTRY(j2-1, k), nlimbs);
+            }
+        }
+    }
+    /* Correction pass: halved sums built from Crow / Ccol and C are subtracted from the accumulated entries of C. */
+    for (l = 1; l < m; l++)
+    {
+        nfixed_add(val1, Ccol_ENTRY(l), C_ENTRY(l, 0), nlimbs);
+        nfixed_div2(Ccol_ENTRY(l), val1, nlimbs);
+        nfixed_sub(C_ENTRY(l, 0), C_ENTRY(l, 0), Ccol_ENTRY(l), nlimbs);
+    }
+    /* Entry (0, 0) is read and written unconditionally, so m >= 1 and p >= 1 are assumed. */
+    nfixed_add(val1, Crow, C_ENTRY(0, 0), nlimbs);
+    nfixed_div2(val0, val1, nlimbs);
+    nfixed_sub(C_ENTRY(0, 0), C_ENTRY(0, 0), val0, nlimbs);
+
+    for (k = 1; k < p; k++)
+    {
+        nfixed_add(crow, Crow_ENTRY(k), C_ENTRY(0, k), nlimbs);
+        nfixed_div2(val1, crow, nlimbs);
+        nfixed_sub(C_ENTRY(0, k), C_ENTRY(0, k), val1, nlimbs);
+        nfixed_sub(crow, val1, val0, nlimbs);
+
+        for (l = 1; l < m; l++)
+        {
+            nfixed_sub(val2, C_ENTRY(l, k), crow, nlimbs);
+            nfixed_sub(C_ENTRY(l, k), val2, Ccol_ENTRY(l), nlimbs);
+        }
+    }
+    /* Odd n: index n - 1 is not covered by the pairs above, so its products are added directly. */
+    if ((n & 1) == 1)
+    {
+        for (l = 0; l < m; l++)
+        {
+            for (k = 0; k < p; k++)
+            {
+                nfixed_mul(val0, A_ENTRY(l, n-1), B_ENTRY(n-1, k), nlimbs);
+                nfixed_add(C_ENTRY(l, k), C_ENTRY(l, k), val0, nlimbs);
+            }
+        }
+    }
+
+    flint_free(Ctmp);
+    /* NOTE(review): Crow_ENTRY and Ccol_ENTRY are not #undef'd below and stay defined for the rest of the file. */
+#undef A_ENTRY
+#undef B_ENTRY
+#undef C_ENTRY
+}
+
+FLINT_FORCE_INLINE void
+_nfloat_get_nfixed(nn_ptr res, nn_srcptr x, slong exp, slong fix_nlimbs, gr_ctx_t ctx)
+{
+    slong rel_exp;
+
+    /* assumes res is already zeroed, so nothing is written for a zero x */
+    if (NFLOAT_IS_ZERO(x))
+        return;
+    /* the exponent of x must be strictly below exp, otherwise abort */
+    rel_exp = NFLOAT_EXP(x) - exp;
+    if (rel_exp >= 0)
+        flint_abort();
+    /* limb 0 receives the sign bit; the integer is written from limb 1 onwards */
+    res[0] = NFLOAT_SGNBIT(x);
+    _arf_get_integer_mpn(res + 1, NFLOAT_D(x), NFLOAT_CTX_NLIMBS(ctx), fix_nlimbs * FLINT_BITS + rel_exp);
+}
+
+FLINT_FORCE_INLINE int
+_nfloat_set_nfixed(nn_ptr res, nn_srcptr x, slong exp, slong fix_nlimbs, gr_ctx_t ctx)
+{
+    return nfloat_set_mpn_2exp(res, x + 1, fix_nlimbs, exp, x[0], ctx);  /* x[0]: sign limb, x + 1: fix_nlimbs data limbs */
+}
+
+static void
+_nfloat_mat_exp_range(slong * _Amin, slong * _Amax, const gr_mat_t A, gr_ctx_t ctx)
+{
+    /* Store the smallest and largest NFLOAT_EXP over all entries of A.
+       No entry is skipped; an empty A yields WORD_MAX and WORD_MIN. */
+    const slong elem_size = ctx->sizeof_elem;
+    const slong nrows = A->r, ncols = A->c;
+    slong lo = WORD_MAX;
+    slong hi = WORD_MIN;
+    slong row, col;
+
+    for (row = 0; row < nrows; row++)
+    {
+        for (col = 0; col < ncols; col++)
+        {
+            slong e = NFLOAT_EXP(GR_MAT_ENTRY(A, row, col, elem_size));
+            if (e > hi)
+                hi = e;
+            if (e < lo)
+                lo = e;
+        }
+    }
+    *_Amin = lo;
+    *_Amax = hi;
+}
+
+int
+_nfloat_mat_mul_fixed_given_exp(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, slong Aexp, slong Bexp, slong fnlimbs, int waksman, gr_ctx_t ctx)
+{
+    nn_ptr T, TA, TB, TC;
+    slong i, j;
+    slong sz = ctx->sizeof_elem;
+    slong fdnlimbs;
+    int status = GR_SUCCESS;
+    slong m = A->r;
+    slong n = A->c;
+    slong p = B->c;
+
+    /* limbs including sign limb */
+    fdnlimbs = fnlimbs + 1;
+
+    T = flint_calloc(fdnlimbs * (m * n + n * p + m * p), sizeof(ulong));
+    /* T is zeroed, as _nfloat_get_nfixed assumes; it holds TA, TB and TC back to back. */
+    TA = T;
+    TB = TA + fdnlimbs * (m * n);
+    TC = TB + fdnlimbs * (n * p);
+    /* Convert A and B to fixed point relative to Aexp and Bexp. */
+    for (i = 0; i < m; i++)
+        for (j = 0; j < n; j++)
+            _nfloat_get_nfixed(TA + i * fdnlimbs * n + j * fdnlimbs, GR_MAT_ENTRY(A, i, j, sz), Aexp, fnlimbs, ctx);
+
+    for (i = 0; i < n; i++)
+        for (j = 0; j < p; j++)
+            _nfloat_get_nfixed(TB + i * fdnlimbs * p + j * fdnlimbs, GR_MAT_ENTRY(B, i, j, sz), Bexp, fnlimbs, ctx);
+
+    if (waksman)
+        _nfixed_mat_mul_waksman(TC, TA, TB, m, n, p, fnlimbs);
+    else
+        _nfixed_mat_mul_classical(TC, TA, TB, m, n, p, fnlimbs);
+    /* Convert back with exponent Aexp + Bexp, accumulating the status flags of each conversion. */
+    for (i = 0; i < m; i++)
+        for (j = 0; j < p; j++)
+            status |= _nfloat_set_nfixed(GR_MAT_ENTRY(C, i, j, sz), TC + i * fdnlimbs * p + j * fdnlimbs, Aexp + Bexp, fnlimbs, ctx);
+
+    flint_free(T);
+
+    return status;
+}
+
+int
+_nfloat_mat_mul_fixed(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, int waksman, slong max_extra_bits, gr_ctx_t ctx)
+{
+    slong Amax, Amin, Bmax, Bmin, Adelta, Bdelta, Aexp, Bexp;
+    slong prec;
+    slong pad_top, pad_bot, extra_bits, fbits, fnlimbs;
+    slong n = A->c;
+    /* Fixed-point product of A and B; contexts flagged by NFLOAT_CTX_HAS_INF_NAN get GR_UNABLE. */
+    if (NFLOAT_CTX_HAS_INF_NAN(ctx))
+        return GR_UNABLE;
+
+    prec = NFLOAT_CTX_PREC(ctx);
+
+    _nfloat_mat_exp_range(&Amin, &Amax, A, ctx);
+    _nfloat_mat_exp_range(&Bmin, &Bmax, B, ctx);
+    /* no exponent in A, or none in B, reaches NFLOAT_MIN_EXP: the result is set to zero */
+    if (Amax < NFLOAT_MIN_EXP || Bmax < NFLOAT_MIN_EXP)
+        return gr_mat_zero(C, ctx);
+
+    /* Currently, we don't handle zeros. (They pose no problem, but zero entries in
+       the output may not be exact. To be done.) */
+    if (Amin < NFLOAT_MIN_EXP || Bmin < NFLOAT_MIN_EXP)
+        return gr_mat_mul_classical(C, A, B, ctx);
+
+    Adelta = Amax - Amin;
+    Bdelta = Bmax - Bmin;
+
+    /* sanity check */
+    if (Adelta > 10 * prec || Bdelta > 10 * prec)
+        return gr_mat_mul_classical(C, A, B, ctx);
+
+    /*
+    To double check: for Waksman,
+        * The intermediate entries are bounded by 8n max(|A|,|B|)^2.
+        * The error, including error from converting
+          the input matrices, is bounded by 8n ulps.
+    */
+
+    pad_top = 3 + FLINT_BIT_COUNT(n);
+    pad_bot = 3 + FLINT_BIT_COUNT(n);
+
+    extra_bits = Adelta + Bdelta + pad_top + pad_bot;
+    /* the exponent spread plus padding exceeds the caller's budget: use the classical product */
+    if (extra_bits > max_extra_bits)
+        return gr_mat_mul_classical(C, A, B, ctx);
+    /* common exponents sit pad_top above the maxima; width is prec plus extra bits, in whole limbs */
+    Aexp = Amax + pad_top;
+    Bexp = Bmax + pad_top;
+    fbits = prec + extra_bits;
+    fnlimbs = (fbits + FLINT_BITS - 1) / FLINT_BITS;
+
+    return _nfloat_mat_mul_fixed_given_exp(C, A, B, Aexp, Bexp, fnlimbs, waksman, ctx);
+}
+
+int
+_nfloat_mat_mul_fixed_classical(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+{
+    return _nfloat_mat_mul_fixed(C, A, B, 0, 100000, ctx);  /* waksman = 0, max_extra_bits = 100000 */
+}
+
+int
+_nfloat_mat_mul_waksman(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+{
+    return _nfloat_mat_mul_fixed(C, A, B, 1, 100000, ctx);  /* waksman = 1, max_extra_bits = 100000 */
+}
+
+
+static void
+_nfloat_2exp_get_fmpz(fmpz_t res, nfloat_srcptr x, slong fixexp, gr_ctx_t ctx)
+{
+    slong exp, zn;
+    mpz_ptr zz;
+    nn_ptr zp;
+    int negative;
+    slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+    /* Sets res to an integer extracted from x relative to fixexp; special values give zero. */
+    if (NFLOAT_IS_SPECIAL(x))
+    {
+        fmpz_zero(res);
+        return;
+    }
+
+    exp = NFLOAT_EXP(x) - fixexp;
+    /* the exponent of x does not exceed fixexp: the result is set to zero */
+    if (exp <= 0)
+    {
+        fmpz_zero(res);
+        return;
+    }
+
+    /* todo: small case */
+
+    negative = NFLOAT_SGNBIT(x);
+    /* exp bits need zn limbs; the integer is written straight into the promoted mpz */
+    zn = (exp + FLINT_BITS - 1) / FLINT_BITS;
+    zz = _fmpz_promote(res);
+    zp = FLINT_MPZ_REALLOC(zz, zn);
+    _arf_get_integer_mpn(zp, NFLOAT_D(x), nlimbs, exp);
+    zz->_mp_size = negative ? -zn : zn;
+    _fmpz_demote_val(res);
+}
+
+int
+nfloat_mat_addmul_block_fallback(gr_mat_t C,
+    const gr_mat_t A, const gr_mat_t B,
+    slong block_start,
+    slong block_end,
+    gr_ctx_t ctx)
+{
+    slong M, P, n;
+    slong i, j, sz;
+    nn_ptr tmpB;
+    slong ndlimbs = NFLOAT_CTX_DATA_NLIMBS(ctx);
+    sz = ctx->sizeof_elem;
+    int status = GR_SUCCESS;
+    /* Handles columns [block_start, block_end) of A against the same rows of B with dot products. */
+    M = A->r;
+    P = B->c;
+
+    n = block_end - block_start;
+
+    tmpB = flint_malloc(sizeof(ulong) * ndlimbs * (P * n));
+
+    /* Copy rows [block_start, block_end) of B into tmpB, transposed, so that
+       each column segment of B is contiguous for the dot products below. */
+    for (i = 0; i < P; i++)
+        for (j = 0; j < n; j++)
+            flint_mpn_copyi(tmpB + (i * n + j) * ndlimbs, GR_MAT_ENTRY(B, block_start + j, i, sz), ndlimbs);
+    /* The current C entry is passed as the initial value, except for the first block (NULL). */
+    for (i = 0; i < M; i++)
+    {
+        for (j = 0; j < P; j++)
+        {
+            status |= _nfloat_vec_dot(GR_MAT_ENTRY(C, i, j, sz),
+                (block_start == 0) ? NULL : GR_MAT_ENTRY(C, i, j, sz), 0,
+                GR_MAT_ENTRY(A, i, block_start, sz),
+                tmpB + j * n * ndlimbs, n, ctx);
+        }
+    }
+
+    flint_free(tmpB);
+
+    return status;
+}
+
+int
+nfloat_mat_addmul_block_prescaled(gr_mat_t C,
+    const gr_mat_t A, const gr_mat_t B,
+    slong block_start,
+    slong block_end,
+    const slong * A_min,  /* A per-row bottom exponent */
+    const slong * B_min,  /* B per-column bottom exponent */
+    gr_ctx_t ctx)
+{
+    slong M, P, n;
+    slong i, j;
+    slong M0, M1, P0, P1, Mstep, Pstep;
+    int status = GR_SUCCESS;
+    slong sz = ctx->sizeof_elem;
+    ulong t[NFLOAT_MAX_ALLOC];
+    slong e;
+    /* Block product of columns [block_start, block_end) of A with the same rows of B via fmpz_mat_mul. */
+    M = A->r;
+    P = B->c;
+
+    n = block_end - block_start;
+
+    /* Create sub-blocks to keep matrices nearly square. Necessary? */
+#if 1
+    Mstep = (M < 2 * n) ? M : n;
+    Pstep = (P < 2 * n) ? P : n;
+#else
+    Mstep = M;
+    Pstep = P;
+#endif
+
+    for (M0 = 0; M0 < M; M0 += Mstep)
+    {
+        for (P0 = 0; P0 < P; P0 += Pstep)
+        {
+            fmpz_mat_t AA, BB, CC;
+
+            M1 = FLINT_MIN(M0 + Mstep, M);
+            P1 = FLINT_MIN(P0 + Pstep, P);
+
+            fmpz_mat_init(AA, M1 - M0, n);
+            fmpz_mat_init(BB, n, P1 - P0);
+            fmpz_mat_init(CC, M1 - M0, P1 - P0);
+
+            /* Convert to fixed-point matrices. */
+            for (i = M0; i < M1; i++)
+            {
+                if (A_min[i] == WORD_MIN)  /* only zeros in this row */
+                    continue;
+
+                for (j = 0; j < n; j++)
+                    _nfloat_2exp_get_fmpz(fmpz_mat_entry(AA, i - M0, j), GR_MAT_ENTRY(A, i, block_start + j, sz), A_min[i], ctx);
+            }
+
+            for (i = P0; i < P1; i++)
+            {
+                if (B_min[i] == WORD_MIN)  /* only zeros in this column */
+                    continue;
+
+                for (j = 0; j < n; j++)
+                    _nfloat_2exp_get_fmpz(fmpz_mat_entry(BB, j, i - P0), GR_MAT_ENTRY(B, block_start + j, i, sz), B_min[i], ctx);
+            }
+
+            /* The main multiplication */
+            fmpz_mat_mul(CC, AA, BB);
+            fmpz_mat_clear(AA);
+            fmpz_mat_clear(BB);
+
+            /* Add to the result matrix */
+            for (i = M0; i < M1; i++)
+            {
+                for (j = P0; j < P1; j++)
+                {
+                    e = A_min[i] + B_min[j];
+                    /* NOTE(review): A_min[i] or B_min[j] may be WORD_MIN (all zeros); confirm this sum is safe */
+                    /* The first time we write this Cij */
+                    if (block_start == 0)
+                    {
+                        status |= nfloat_set_fmpz(GR_MAT_ENTRY(C, i, j, sz), fmpz_mat_entry(CC, i - M0, j - P0), ctx);
+                        status |= nfloat_mul_2exp_si(GR_MAT_ENTRY(C, i, j, sz), GR_MAT_ENTRY(C, i, j, sz), e, ctx);
+                    }
+                    else
+                    {
+                        status |= nfloat_set_fmpz(t, fmpz_mat_entry(CC, i - M0, j - P0), ctx);
+                        status |= nfloat_mul_2exp_si(t, t, e, ctx);
+                        status |= nfloat_add(GR_MAT_ENTRY(C, i, j, sz), GR_MAT_ENTRY(C, i, j, sz), t, ctx);
+                    }
+                }
+            }
+
+            fmpz_mat_clear(CC);
+        }
+    }
+
+    return status;
+}
+
+FLINT_FORCE_INLINE slong
+_nfloat_nbits(nfloat_srcptr x, slong nlimbs)
+{
+    /* Returns FLINT_BITS * nlimbs minus the count of trailing zero bits in
+       the limb array NFLOAT_D(x), scanning upwards from limb 0. */
+    nn_srcptr limbs = NFLOAT_D(x);
+    slong zero_limbs = 0;
+    slong total;
+
+    /* The scan has no bound check, so some limb of x must be nonzero. */
+    while (limbs[zero_limbs] == 0)
+        zero_limbs++;
+
+    /* Whole zero limbs first, then the zero bits of the first nonzero limb. */
+    total = FLINT_BITS * (nlimbs - zero_limbs);
+    total -= flint_ctz(limbs[zero_limbs]);
+
+    return total;
+}
+
+/* todo: squaring optimizations */
+int
+nfloat_mat_mul_block(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, slong min_block_size, gr_ctx_t ctx)
+{
+    slong M, N, P;
+    slong *A_min, *A_max, *B_min, *B_max;
+    short *A_bits, *B_bits;
+    slong *A_bot, *B_bot;
+    slong block_start, block_end, i, j, bot, top, max_height;
+    slong b, A_max_bits, B_max_bits;
+    nfloat_srcptr t;
+    double A_density, B_density;
+    slong sz = ctx->sizeof_elem;
+    slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+    slong prec = NFLOAT_CTX_PREC(ctx);
+    int status = GR_SUCCESS;
+    /* Shapes: A is M x N, B is N x P, C is M x P; anything else is GR_DOMAIN. */
+    M = A->r;
+    N = A->c;
+    P = B->c;
+
+    if (N != B->r || M != C->r || P != C->c)
+        return GR_DOMAIN;
+
+    if (M == 0 || N == 0 || P == 0)
+        return gr_mat_zero(C, ctx);
+
+    if (NFLOAT_CTX_HAS_INF_NAN(ctx))
+        return GR_UNABLE;
+    /* Aliased output: compute into a temporary and swap it into C. */
+    if (A == C || B == C)
+    {
+        gr_mat_t T;
+        gr_mat_init(T, M, P, ctx);
+        status = nfloat_mat_mul_block(T, A, B, min_block_size, ctx);
+        status |= gr_mat_swap_entrywise(T, C, ctx);
+        gr_mat_clear(T, ctx);
+        return status;
+    }
+
+    /* bottom exponents of A */
+    A_bot = flint_malloc(sizeof(slong) * M * N);
+    /* minimum bottom exponent in current row */
+    A_min = flint_malloc(sizeof(slong) * M);
+    /* maximum top exponent in current row */
+    A_max = flint_malloc(sizeof(slong) * M);
+
+    B_bot = flint_malloc(sizeof(slong) * N * P);
+    B_min = flint_malloc(sizeof(slong) * P);
+    B_max = flint_malloc(sizeof(slong) * P);
+
+    /* save space using shorts to store the bit sizes temporarily;
+       the block algorithm will not be used at extremely high precision */
+    A_bits = flint_malloc(sizeof(short) * M * N);
+    B_bits = flint_malloc(sizeof(short) * N * P);
+
+    A_max_bits = B_max_bits = 0;
+    A_density = B_density = 0;
+
+    /* Build table of bottom exponents (WORD_MIN signifies a zero),
+       and also collect some statistics. */
+    for (i = 0; i < M; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            t = GR_MAT_ENTRY(A, i, j, sz);
+            if (NFLOAT_IS_ZERO(t))
+            {
+                A_bot[i * N + j] = WORD_MIN;
+                A_bits[i * N + j] = 0;
+            }
+            else
+            {
+                b = _nfloat_nbits(t, nlimbs);
+                A_bot[i * N + j] = NFLOAT_EXP(t) - b;
+                A_bits[i * N + j] = b;
+                A_max_bits = FLINT_MAX(A_max_bits, b);
+                A_density++;
+            }
+        }
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < P; j++)
+        {
+            t = GR_MAT_ENTRY(B, i, j, sz);
+            if (NFLOAT_IS_ZERO(t))
+            {
+                B_bot[i * P + j] = WORD_MIN;
+                B_bits[i * P + j] = 0;
+            }
+            else
+            {
+                b = _nfloat_nbits(t, nlimbs);
+                B_bot[i * P + j] = NFLOAT_EXP(t) - b;
+                B_bits[i * P + j] = b;
+                B_max_bits = FLINT_MAX(B_max_bits, b);
+                B_density++;
+            }
+        }
+    }
+
+    A_density = A_density / (M * N);
+    B_density = B_density / (N * P);
+
+    /* Don't shift too far when creating integer block matrices. */
+    max_height = 1.25 * FLINT_MIN(prec, FLINT_MAX(A_max_bits, B_max_bits)) + 192;
+
+    /* FIXME: this condition is bogus */
+    if (A_density < 0.1 && B_density < 0.1 && max_height > 1024)
+    {
+        status = gr_mat_mul_classical(C, A, B, ctx);
+        goto cleanup;
+    }
+
+    block_start = 0;
+    while (block_start < N)
+    {
+        /* Find a run of columns of A and rows of B such that the
+           bottom exponents differ by at most max_height. */
+
+        block_end = block_start + 1;  /* index is exclusive block_end */
+
+        /* begin with this column of A and row of B */
+        for (i = 0; i < M; i++)
+        {
+            A_max[i] = A_min[i] = A_bot[i * N + block_start];
+            A_max[i] += (slong) A_bits[i * N + block_start];
+        }
+
+        for (i = 0; i < P; i++)
+        {
+            B_max[i] = B_min[i] = B_bot[block_start * P + i];
+            B_max[i] += (slong) B_bits[block_start * P + i];
+        }
+
+        while (block_end < N)
+        {
+            double size;
+
+            /* End block if memory would be excessive. */
+            /* Necessary? */
+            /* Should also do initial check above, if C alone is too large. */
+            size = (block_end - block_start) * M * (double) A_max_bits;
+            size += (block_end - block_start) * P * (double) B_max_bits;
+            size += (M * P) * (double) (A_max_bits + B_max_bits);
+            size /= 8.0;
+            if (size > 2e9)
+                goto blocks_built;
+
+            /* check if we can extend with column [block_end] of A */
+            for (i = 0; i < M; i++)
+            {
+                bot = A_bot[i * N + block_end];
+                /* zeros are irrelevant */
+                if (bot == WORD_MIN || A_max[i] == WORD_MIN)
+                    continue;
+                top = bot + (slong) A_bits[i * N + block_end];
+                /* jump will be too big */
+                if (top > A_min[i] + max_height || bot < A_max[i] - max_height)
+                    goto blocks_built;
+            }
+
+            /* check if we can extend with row [block_end] of B */
+            for (i = 0; i < P; i++)
+            {
+                bot = B_bot[block_end * P + i];
+                if (bot == WORD_MIN || B_max[i] == WORD_MIN)
+                    continue;
+                top = bot + (slong) B_bits[block_end * P + i];
+                if (top > B_min[i] + max_height || bot < B_max[i] - max_height)
+                    goto blocks_built;
+            }
+
+            /* second pass to update the extreme values */
+            for (i = 0; i < M; i++)
+            {
+                bot = A_bot[i * N + block_end];
+                top = bot + (slong) A_bits[i * N + block_end];
+                if (A_max[i] == WORD_MIN)
+                {
+                    A_max[i] = top;
+                    A_min[i] = bot;
+                }
+                else if (bot != WORD_MIN)
+                {
+                    if (bot < A_min[i]) A_min[i] = bot;
+                    if (top > A_max[i]) A_max[i] = top;
+                }
+            }
+
+            for (i = 0; i < P; i++)
+            {
+                bot = B_bot[block_end * P + i];
+                top = bot + (slong) B_bits[block_end * P + i];
+                if (B_max[i] == WORD_MIN)
+                {
+                    B_max[i] = top;
+                    B_min[i] = bot;
+                }
+                else if (bot != WORD_MIN)
+                {
+                    if (bot < B_min[i]) B_min[i] = bot;
+                    if (top > B_max[i]) B_max[i] = top;
+                }
+            }
+
+            block_end++;
+        }
+
+    blocks_built:
+        if (block_end - block_start < min_block_size)
+        {
+            block_end = FLINT_MIN(N, block_start + min_block_size);
+            status |= nfloat_mat_addmul_block_fallback(C, A, B, block_start, block_end, ctx);
+        }
+        else
+        {
+            status |= nfloat_mat_addmul_block_prescaled(C, A, B, block_start, block_end, A_min, B_min, ctx);
+        }
+
+        block_start = block_end;
+    }
+
+cleanup:
+    flint_free(A_bot);
+    flint_free(A_max);
+    flint_free(A_min);
+    flint_free(B_bot);
+    flint_free(B_max);
+    flint_free(B_min);
+    flint_free(A_bits);
+    flint_free(B_bits);
+
+    return status;
+}
+
+/* Minimum precision for using fixed-point arithmetic */
+#define NFLOAT_MAT_MUL_FIXED_CUTOFF 192
+
+/* first cutoff:  classical -> fixed_classical (used when the smallest dimension is >= this value)
+   second cutoff: fixed_classical -> waksman (same test); the table is indexed by prec / 64 */
+static const int nfloat_mat_mul_cutoff_tab[][2] = {
+    {0,  0},    /* prec = 0   */
+    {0,  0},    /* prec = 64  */
+    {0,  0},    /* prec = 128 */
+    {32, 32},   /* prec = 192 */
+    {8, 20},    /* prec = 256 */
+    {4, 15},    /* prec = 320 */
+    {3, 10},    /* prec = 384 */
+    {3, 10},    /* prec = 448 */
+    {3, 8},     /* prec = 512 */
+    {10, 10},   /* prec = 576 */
+    {4, 5},     /* prec = 640 */
+};
+
+/* {4, 4} from this point */
+#define NFLOAT_MAT_MUL_CUTOFF_4 704
+/* {3, 3} from this point */
+#define NFLOAT_MAT_MUL_CUTOFF_3 1600
+
+int
+nfloat_mat_mul(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+{
+    slong cutoff1, cutoff2, dim;
+    int use_waksman = 0;
+    slong prec;
+    slong max_extra_prec;
+
+    slong m = A->r;
+    slong n = A->c;
+    slong p = B->c;
+    /* The algorithm is chosen from the smallest of the three dimensions and the precision. */
+    dim = FLINT_MIN(n, FLINT_MIN(m, p));
+
+    if (dim <= 2 || NFLOAT_CTX_HAS_INF_NAN(ctx))
+        return gr_mat_mul_classical(C, A, B, ctx);
+
+    if (dim <= 80)
+    {
+        prec = NFLOAT_CTX_PREC(ctx);
+
+        if (prec < NFLOAT_MAT_MUL_FIXED_CUTOFF)
+            return gr_mat_mul_classical(C, A, B, ctx);
+        /* dimension cutoffs: constants at high precision, table lookup by prec / 64 otherwise */
+        if (prec >= NFLOAT_MAT_MUL_CUTOFF_3)
+            cutoff1 = cutoff2 = 3;
+        else if (prec >= NFLOAT_MAT_MUL_CUTOFF_4)
+            cutoff1 = cutoff2 = 4;
+        else
+        {
+            cutoff1 = nfloat_mat_mul_cutoff_tab[prec / 64][0];
+            cutoff2 = nfloat_mat_mul_cutoff_tab[prec / 64][1];
+        }
+
+        if (dim < cutoff1)
+            return gr_mat_mul_classical(C, A, B, ctx);
+        /* max_extra_prec is passed as max_extra_bits; beyond it the callee uses the classical product */
+        use_waksman = (dim >= cutoff2);
+        max_extra_prec = (prec < 768) ? 64 : prec / 4;
+
+        return _nfloat_mat_mul_fixed(C, A, B, use_waksman, max_extra_prec, ctx);
+    }
+    else
+    {
+        return nfloat_mat_mul_block(C, A, B, 70, ctx);
+    }
+}
diff --git a/src/nfloat/nfloat.c b/src/nfloat/nfloat.c
index ccf0293288..10cb533916 100644
--- a/src/nfloat/nfloat.c
+++ b/src/nfloat/nfloat.c
@@ -98,7 +98,7 @@ nfloat_randtest(nfloat_ptr res, flint_rand_t state, gr_ctx_t ctx)
     int status;
 
     arf_init(t);
-    arf_randtest(t, state, NFLOAT_CTX_PREC(ctx), 10);
+    arf_randtest(t, state, NFLOAT_CTX_PREC(ctx), n_randint(state, 2) ? 2 : 10);
     status = nfloat_set_arf(res, t, ctx);
     arf_clear(t);
     return status;
diff --git a/src/nfloat/test/main.c b/src/nfloat/test/main.c
index 0a129221f1..8520994f89 100644
--- a/src/nfloat/test/main.c
+++ b/src/nfloat/test/main.c
@@ -13,6 +13,7 @@
 
 #include "t-add_sub_n.c"
 #include "t-addmul_submul.c"
+#include "t-mat_mul.c"
 #include "t-nfloat.c"
 #include "t-nfloat_complex.c"
 
@@ -22,6 +23,7 @@ test_struct tests[] =
 {
     TEST_FUNCTION(add_sub_n),
     TEST_FUNCTION(addmul_submul),
+    TEST_FUNCTION(mat_mul),
     TEST_FUNCTION(nfloat),
     TEST_FUNCTION(nfloat_complex),
 };
diff --git a/src/nfloat/test/t-mat_mul.c b/src/nfloat/test/t-mat_mul.c
new file mode 100644
index 0000000000..4ac640931b
--- /dev/null
+++ b/src/nfloat/test/t-mat_mul.c
@@ -0,0 +1,67 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "test_helpers.h"
+#include "gr.h"
+#include "gr_mat.h"
+#include "nfloat.h"
+
+int
+nfloat_mat_mul_block1(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+{
+    return nfloat_mat_mul_block(C, A, B, 1, ctx);  /* min_block_size = 1 */
+}
+
+TEST_FUNCTION_START(mat_mul, state)
+{
+    gr_ctx_t ctx;
+    slong prec;
+    slong iter;
+    gr_ptr tol;
+    /* Each iteration picks a precision that is a multiple of FLINT_BITS and checks up to four multipliers. */
+    for (iter = 0; iter < 100 * flint_test_multiplier(); iter++)
+    {
+        if (n_randint(state, 5))
+            prec = FLINT_BITS * (1 + n_randint(state, 4));
+        else
+            prec = FLINT_BITS * (1 + n_randint(state, NFLOAT_MAX_LIMBS));
+
+        nfloat_ctx_init(ctx, prec, 0);
+        /* tol = 2^(6 - prec), the tolerance handed to every check below */
+        tol = gr_heap_init(ctx);
+        GR_MUST_SUCCEED(gr_one(tol, ctx));
+        GR_MUST_SUCCEED(gr_mul_2exp_si(tol, tol, -prec + 6, ctx));
+
+        gr_mat_test_approx_mul_pos_entrywise_accurate(
+            (gr_method_mat_binary_op) _nfloat_mat_mul_waksman,
+            tol, state, (prec <= 256) ? 10 : 1, 10, ctx);
+
+        gr_mat_test_approx_mul_pos_entrywise_accurate(
+            (gr_method_mat_binary_op) nfloat_mat_mul_block1,
+            tol, state, (prec <= 256) ? 10 : 1,
+                        (prec <= 256) ? 40 : 20, ctx);
+
+        gr_mat_test_approx_mul_pos_entrywise_accurate(
+            (gr_method_mat_binary_op) _nfloat_mat_mul_fixed_classical,
+            tol, state, (prec <= 256) ? 10 : 1,
+                        (prec <= 256) ? 40 : 20, ctx);
+        /* the top-level dispatcher is only exercised when n_randint(state, 4) returns 0 */
+        if (n_randint(state, 4) == 0)
+            gr_mat_test_approx_mul_pos_entrywise_accurate(
+                (gr_method_mat_binary_op) nfloat_mat_mul,
+                tol, state, 1, 120, ctx);
+
+        gr_heap_clear(tol, ctx);
+        gr_ctx_clear(ctx);
+    }
+
+    TEST_FUNCTION_END(state);
+}

From 6e69daa63e77f83f1eeea99ae35220dcc326ee76 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Sat, 13 Jul 2024 16:57:02 +0200
Subject: [PATCH 4/9] followup

---
 doc/source/nfloat.rst | 8 ++++++++
 src/nfloat/ctx.c      | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/doc/source/nfloat.rst b/doc/source/nfloat.rst
index 3f1d82e391..763d129f9a 100644
--- a/doc/source/nfloat.rst
+++ b/doc/source/nfloat.rst
@@ -314,6 +314,14 @@ code for reduced overhead.
 .. function:: int _nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
               int _nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
 
+Matrix functions
+-------------------------------------------------------------------------------
+
+.. function:: int _nfloat_mat_mul_fixed_classical(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+              int _nfloat_mat_mul_waksman(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+              int nfloat_mat_mul_block(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, slong min_block_size, gr_ctx_t ctx)
+              int nfloat_mat_mul(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+
 Internal functions
 -------------------------------------------------------------------------------
 
diff --git a/src/nfloat/ctx.c b/src/nfloat/ctx.c
index 250f15bf8d..6178b877cc 100644
--- a/src/nfloat/ctx.c
+++ b/src/nfloat/ctx.c
@@ -173,8 +173,8 @@ gr_method_tab_input _nfloat_methods_input[] =
 /*
     {GR_METHOD_POLY_MULLOW,     (gr_funcptr) nfloat_poly_mullow},
     {GR_METHOD_POLY_ROOTS_OTHER,(gr_funcptr) nfloat_poly_roots_other},
-    {GR_METHOD_MAT_MUL,         (gr_funcptr) nfloat_mat_mul},
 */
+    {GR_METHOD_MAT_MUL,         (gr_funcptr) nfloat_mat_mul},
     {GR_METHOD_MAT_DET,         (gr_funcptr) gr_mat_det_generic_field},
     {GR_METHOD_MAT_FIND_NONZERO_PIVOT,     (gr_funcptr) gr_mat_find_nonzero_pivot_large_abs},
 

From 61c0c1dfefe86521205cb2f0ed105e57b7be21ed Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Sun, 14 Jul 2024 15:21:44 +0200
Subject: [PATCH 5/9] gr_min, gr_max

---
 doc/source/gr.rst          |   4 +
 src/gr.h                   |   3 +
 src/gr/acb.c               |  43 +++++---
 src/gr/arb.c               | 197 +++++++++++++------------------------
 src/gr_generic/generic.c   |  30 ++++++
 src/python/flint_ctypes.py |  35 +++++++
 6 files changed, 169 insertions(+), 143 deletions(-)

diff --git a/doc/source/gr.rst b/doc/source/gr.rst
index c535e86b20..79992c8a4d 100644
--- a/doc/source/gr.rst
+++ b/doc/source/gr.rst
@@ -856,6 +856,10 @@ Ordering methods
     Wrappers of ``gr_cmp`` and ``gr_cmpabs`` returning truth values
     for the comparison operations ``<=``, ``<``, ``>=``, ``>``.
 
+.. function:: int gr_min(gr_ptr res, gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+              int gr_max(gr_ptr res, gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+
+    Minimum and maximum value.
 
 Enclosure and interval methods
 ........................................................................
diff --git a/src/gr.h b/src/gr.h
index 347ab28a8f..63cb0d7fcd 100644
--- a/src/gr.h
+++ b/src/gr.h
@@ -1125,6 +1125,9 @@ GR_INLINE WARN_UNUSED_RESULT truth_t gr_abs_lt(gr_srcptr x, gr_srcptr y, gr_ctx_
 GR_INLINE WARN_UNUSED_RESULT truth_t gr_abs_ge(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmpabs, cmp >= 0) }
 GR_INLINE WARN_UNUSED_RESULT truth_t gr_abs_gt(gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { __GR_CMP(gr_cmpabs, cmp > 0) }
 
+GR_INLINE WARN_UNUSED_RESULT int gr_min(gr_ptr res, gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { return GR_BINARY_OP(ctx, MIN)(res, x, y, ctx); }
+GR_INLINE WARN_UNUSED_RESULT int gr_max(gr_ptr res, gr_srcptr x, gr_srcptr y, gr_ctx_t ctx) { return GR_BINARY_OP(ctx, MAX)(res, x, y, ctx); }
+
 GR_INLINE WARN_UNUSED_RESULT int gr_gen(gr_ptr res, gr_ctx_t ctx) { return GR_CONSTANT_OP(ctx, GEN)(res, ctx); }
 GR_INLINE WARN_UNUSED_RESULT int gr_gens(gr_vec_t res, gr_ctx_t ctx) { return GR_VEC_CTX_OP(ctx, GENS)(res, ctx); }
 GR_INLINE WARN_UNUSED_RESULT int gr_gens_recursive(gr_vec_t res, gr_ctx_t ctx) { return GR_VEC_CTX_OP(ctx, GENS_RECURSIVE)(res, ctx); }
diff --git a/src/gr/acb.c b/src/gr/acb.c
index 3f7cb4e08a..0a14fbd78e 100644
--- a/src/gr/acb.c
+++ b/src/gr/acb.c
@@ -1004,6 +1004,32 @@ _gr_acb_cmpabs(int * res, const acb_t x, const acb_t y, const gr_ctx_t ctx)
     return _gr_acb_cmp(res, t, u, ctx);
 }
 
+int
+_gr_acb_min(acb_t res, const acb_t x, const acb_t y, const gr_ctx_t ctx)
+{
+    if (arb_is_zero(acb_imagref(x)) && arb_is_zero(acb_imagref(y)))  /* both imaginary parts pass arb_is_zero */
+    {
+        arb_min(acb_realref(res), acb_realref(x), acb_realref(y), ACB_CTX_PREC(ctx));
+        arb_zero(acb_imagref(res));
+        return GR_SUCCESS;
+    }
+    else
+        return GR_UNABLE;  /* otherwise no minimum is computed and res is left untouched */
+}
+
+int
+_gr_acb_max(acb_t res, const acb_t x, const acb_t y, const gr_ctx_t ctx)
+{
+    if (arb_is_zero(acb_imagref(x)) && arb_is_zero(acb_imagref(y)))  /* both imaginary parts pass arb_is_zero */
+    {
+        arb_max(acb_realref(res), acb_realref(x), acb_realref(y), ACB_CTX_PREC(ctx));
+        arb_zero(acb_imagref(res));
+        return GR_SUCCESS;
+    }
+    else
+        return GR_UNABLE;  /* otherwise no maximum is computed and res is left untouched */
+}
+
 int
 _gr_acb_pi(acb_t res, const gr_ctx_t ctx)
 {
@@ -1330,13 +1356,7 @@ _gr_acb_gamma_fmpq(acb_t res, const fmpq_t x, const gr_ctx_t ctx)
     }
 }
 
-
-int
-_gr_acb_rgamma(acb_t res, const acb_t x, const gr_ctx_t ctx)
-{
-    acb_rgamma(res, x, ACB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
+DEF_FUNC(rgamma)
 
 int
 _gr_acb_lgamma(acb_t res, const acb_t x, const gr_ctx_t ctx)
@@ -1642,12 +1662,7 @@ int _gr_acb_stieltjes(acb_t res, const fmpz_t n, const acb_t a, const gr_ctx_t c
     return acb_is_finite(res) ? GR_SUCCESS : GR_UNABLE;
 }
 
-int
-_gr_acb_dirichlet_eta(acb_t res, const acb_t x, const gr_ctx_t ctx)
-{
-    acb_dirichlet_eta(res, x, ACB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
+DEF_FUNC(dirichlet_eta)
 
 /* todo
 int
@@ -2220,6 +2235,8 @@ gr_method_tab_input _acb_methods_input[] =
     {GR_METHOD_ARG,             (gr_funcptr) _gr_acb_arg},
     {GR_METHOD_CMP,             (gr_funcptr) _gr_acb_cmp},
     {GR_METHOD_CMPABS,          (gr_funcptr) _gr_acb_cmpabs},
+    {GR_METHOD_MIN,             (gr_funcptr) _gr_acb_min},
+    {GR_METHOD_MAX,             (gr_funcptr) _gr_acb_max},
     {GR_METHOD_PI,              (gr_funcptr) _gr_acb_pi},
     {GR_METHOD_EXP,             (gr_funcptr) _gr_acb_exp},
     {GR_METHOD_EXPM1,           (gr_funcptr) _gr_acb_expm1},
diff --git a/src/gr/arb.c b/src/gr/arb.c
index 6f315dd685..cb9fb88f58 100644
--- a/src/gr/arb.c
+++ b/src/gr/arb.c
@@ -33,6 +33,55 @@ gr_arb_ctx;
 
 #define ARB_CTX_PREC(ring_ctx) (((gr_arb_ctx *)((ring_ctx)))->prec)
 
+#define DEF_FUNC(fname) \
+int \
+_gr_arb_ ## fname(arb_t res, const arb_t x, const gr_ctx_t ctx) \
+{ \
+    arb_ ## fname(res, x, ARB_CTX_PREC(ctx)); \
+    return GR_SUCCESS; \
+} \
+
+#define DEF_FUNC_NOPREC(fname) \
+int \
+_gr_arb_ ## fname(arb_t res, const arb_t x, const gr_ctx_t ctx) \
+{ \
+    arb_ ## fname(res, x); \
+    return GR_SUCCESS; \
+} \
+
+
+#define DEF_2FUNC(fname) \
+int \
+_gr_arb_ ## fname(arb_t res1, arb_t res2, const arb_t x, const gr_ctx_t ctx) \
+{ \
+    arb_ ## fname(res1, res2, x, ARB_CTX_PREC(ctx)); \
+    return GR_SUCCESS; \
+} \
+
+#define DEF_FUNC2(fname) \
+int \
+_gr_arb_ ## fname(arb_t res, const arb_t x, const arb_t y, const gr_ctx_t ctx) \
+{ \
+    arb_ ## fname(res, x, y, ARB_CTX_PREC(ctx)); \
+    return GR_SUCCESS; \
+} \
+
+#define DEF_FUNC_SING(fname) \
+int \
+_gr_arb_ ## fname(arb_t res, const arb_t x, const gr_ctx_t ctx) \
+{ \
+    arb_ ## fname(res, x, ARB_CTX_PREC(ctx)); \
+    return arb_is_finite(res) ? GR_SUCCESS : GR_UNABLE; \
+} \
+
+#define DEF_FUNC2_SING(fname) \
+int \
+_gr_arb_ ## fname(arb_t res, const arb_t x, const arb_t y, const gr_ctx_t ctx) \
+{ \
+    arb_ ## fname(res, x, y, ARB_CTX_PREC(ctx)); \
+    return arb_is_finite(res) ? GR_SUCCESS : GR_UNABLE; \
+} \
+
 int _gr_arb_ctx_set_real_prec(gr_ctx_t ctx, slong prec)
 {
     prec = FLINT_MAX(prec, 2);
@@ -407,26 +456,13 @@ _gr_arb_equal(const arb_t x, const arb_t y, const gr_ctx_t ctx)
     return T_FALSE;
 }
 
-int
-_gr_arb_set(arb_t res, const arb_t x, const gr_ctx_t ctx)
-{
-    arb_set(res, x);
-    return GR_SUCCESS;
-}
-
-int
-_gr_arb_neg(arb_t res, const arb_t x, const gr_ctx_t ctx)
-{
-    arb_neg(res, x);
-    return GR_SUCCESS;
-}
-
-int
-_gr_arb_add(arb_t res, const arb_t x, const arb_t y, const gr_ctx_t ctx)
-{
-    arb_add(res, x, y, ARB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
+DEF_FUNC_NOPREC(set)
+DEF_FUNC_NOPREC(neg)
+DEF_FUNC2(add)
+DEF_FUNC2(sub)
+DEF_FUNC2(addmul)
+DEF_FUNC2(submul)
+DEF_FUNC(sqr)
 
 int
 _gr_arb_add_si(arb_t res, const arb_t x, slong y, const gr_ctx_t ctx)
@@ -449,13 +485,6 @@ _gr_arb_add_fmpz(arb_t res, const arb_t x, const fmpz_t y, const gr_ctx_t ctx)
     return GR_SUCCESS;
 }
 
-int
-_gr_arb_sub(arb_t res, const arb_t x, const arb_t y, const gr_ctx_t ctx)
-{
-    arb_sub(res, x, y, ARB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
-
 int
 _gr_arb_sub_si(arb_t res, const arb_t x, slong y, const gr_ctx_t ctx)
 {
@@ -505,20 +534,6 @@ _gr_arb_mul_fmpz(arb_t res, const arb_t x, const fmpz_t y, const gr_ctx_t ctx)
     return GR_SUCCESS;
 }
 
-int
-_gr_arb_addmul(arb_t res, const arb_t x, const arb_t y, const gr_ctx_t ctx)
-{
-    arb_addmul(res, x, y, ARB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
-
-int
-_gr_arb_submul(arb_t res, const arb_t x, const arb_t y, const gr_ctx_t ctx)
-{
-    arb_submul(res, x, y, ARB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
-
 int
 _gr_arb_mul_two(arb_t res, const arb_t x, const gr_ctx_t ctx)
 {
@@ -526,13 +541,6 @@ _gr_arb_mul_two(arb_t res, const arb_t x, const gr_ctx_t ctx)
     return GR_SUCCESS;
 }
 
-int
-_gr_arb_sqr(arb_t res, const arb_t x, const gr_ctx_t ctx)
-{
-    arb_sqr(res, x, ARB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
-
 int
 _gr_arb_mul_2exp_si(arb_t res, const arb_t x, slong y, const gr_ctx_t ctx)
 {
@@ -810,40 +818,12 @@ _gr_arb_rsqrt(arb_t res, const arb_t x, const gr_ctx_t ctx)
     }
 }
 
-int
-_gr_arb_floor(arb_t res, const arb_t x, const gr_ctx_t ctx)
-{
-    arb_floor(res, x, ARB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
-
-int
-_gr_arb_ceil(arb_t res, const arb_t x, const gr_ctx_t ctx)
-{
-    arb_ceil(res, x, ARB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
-
-int
-_gr_arb_trunc(arb_t res, const arb_t x, const gr_ctx_t ctx)
-{
-    arb_trunc(res, x, ARB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
-
-int
-_gr_arb_nint(arb_t res, const arb_t x, const gr_ctx_t ctx)
-{
-    arb_nint(res, x, ARB_CTX_PREC(ctx));
-    return GR_SUCCESS;
-}
-
-int
-_gr_arb_abs(arb_t res, const arb_t x, const gr_ctx_t ctx)
-{
-    arb_abs(res, x);
-    return GR_SUCCESS;
-}
+DEF_FUNC(floor)
+DEF_FUNC(ceil)
+DEF_FUNC(trunc)
+DEF_FUNC(nint)
+DEF_FUNC_NOPREC(abs)
+DEF_FUNC_NOPREC(sgn)
 
 int
 _gr_arb_conj(arb_t res, const arb_t x, const gr_ctx_t ctx)
@@ -859,13 +839,6 @@ _gr_arb_im(arb_t res, const arb_t x, const gr_ctx_t ctx)
     return GR_SUCCESS;
 }
 
-int
-_gr_arb_sgn(arb_t res, const arb_t x, const gr_ctx_t ctx)
-{
-    arb_sgn(res, x);
-    return GR_SUCCESS;
-}
-
 int
 _gr_arb_arg(arb_t res, const arb_t x, const gr_ctx_t ctx)
 {
@@ -956,47 +929,6 @@ _gr_arb_glaisher(arb_t res, const gr_ctx_t ctx)
     return GR_SUCCESS;
 }
 
-#define DEF_FUNC(fname) \
-int \
-_gr_arb_ ## fname(arb_t res, const arb_t x, const gr_ctx_t ctx) \
-{ \
-    arb_ ## fname(res, x, ARB_CTX_PREC(ctx)); \
-    return GR_SUCCESS; \
-} \
-
-#define DEF_2FUNC(fname) \
-int \
-_gr_arb_ ## fname(arb_t res1, arb_t res2, const arb_t x, const gr_ctx_t ctx) \
-{ \
-    arb_ ## fname(res1, res2, x, ARB_CTX_PREC(ctx)); \
-    return GR_SUCCESS; \
-} \
-
-#define DEF_FUNC2(fname) \
-int \
-_gr_arb_ ## fname(arb_t res, const arb_t x, const arb_t y, const gr_ctx_t ctx) \
-{ \
-    arb_ ## fname(res, x, y, ARB_CTX_PREC(ctx)); \
-    return GR_SUCCESS; \
-} \
-
-#define DEF_FUNC_SING(fname) \
-int \
-_gr_arb_ ## fname(arb_t res, const arb_t x, const gr_ctx_t ctx) \
-{ \
-    arb_ ## fname(res, x, ARB_CTX_PREC(ctx)); \
-    return arb_is_finite(res) ? GR_SUCCESS : GR_UNABLE; \
-} \
-
-#define DEF_FUNC2_SING(fname) \
-int \
-_gr_arb_ ## fname(arb_t res, const arb_t x, const arb_t y, const gr_ctx_t ctx) \
-{ \
-    arb_ ## fname(res, x, y, ARB_CTX_PREC(ctx)); \
-    return arb_is_finite(res) ? GR_SUCCESS : GR_UNABLE; \
-} \
-
-
 DEF_FUNC(exp)
 DEF_FUNC(expm1)
 DEF_FUNC_SING(log1p)
@@ -1016,6 +948,9 @@ _gr_arb_log(arb_t res, const arb_t x, const gr_ctx_t ctx)
     return GR_UNABLE;
 }
 
+DEF_FUNC2(min)
+DEF_FUNC2(max)
+
 DEF_FUNC(sin)
 DEF_FUNC(cos)
 DEF_2FUNC(sin_cos)
@@ -1863,6 +1798,8 @@ gr_method_tab_input _arb_methods_input[] =
     {GR_METHOD_ARG,             (gr_funcptr) _gr_arb_arg},
     {GR_METHOD_CMP,             (gr_funcptr) _gr_arb_cmp},
     {GR_METHOD_CMPABS,          (gr_funcptr) _gr_arb_cmpabs},
+    {GR_METHOD_MIN,             (gr_funcptr) _gr_arb_min},
+    {GR_METHOD_MAX,             (gr_funcptr) _gr_arb_max},
     {GR_METHOD_I,               (gr_funcptr) gr_not_in_domain},
     {GR_METHOD_PI,              (gr_funcptr) _gr_arb_pi},
     {GR_METHOD_EULER,           (gr_funcptr) _gr_arb_euler},
diff --git a/src/gr_generic/generic.c b/src/gr_generic/generic.c
index 7b343c7828..0908526a1f 100644
--- a/src/gr_generic/generic.c
+++ b/src/gr_generic/generic.c
@@ -1317,6 +1317,34 @@ gr_generic_cmpabs_other(int * res, gr_srcptr x, gr_srcptr y, gr_ctx_t y_ctx, gr_
     return status;
 }
 
+/* Generic fallback: res = x if gr_cmp reports cmp <= 0, otherwise res = y. */
+int
+gr_generic_min(gr_ptr res, gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+{
+    int order;
+
+    /* any failure to compare is reported as GR_UNABLE */
+    if (gr_cmp(&order, x, y, ctx) != GR_SUCCESS)
+        return GR_UNABLE;
+
+    /* ties (order == 0) select x */
+    return gr_set(res, (order <= 0) ? x : y, ctx);
+}
+
+/* Generic fallback: res = x if gr_cmp reports cmp >= 0, otherwise res = y. */
+int
+gr_generic_max(gr_ptr res, gr_srcptr x, gr_srcptr y, gr_ctx_t ctx)
+{
+    int order;
+
+    /* any failure to compare is reported as GR_UNABLE */
+    if (gr_cmp(&order, x, y, ctx) != GR_SUCCESS)
+        return GR_UNABLE;
+
+    /* ties (order == 0) select x */
+    return gr_set(res, (order >= 0) ? x : y, ctx);
+}
+
 int
 gr_generic_bernoulli_ui(gr_ptr res, ulong n, gr_ctx_t ctx)
 {
@@ -2664,6 +2692,8 @@ const gr_method_tab_input _gr_generic_methods[] =
     {GR_METHOD_CMPABS,                  (gr_funcptr) gr_generic_cmpabs},
     {GR_METHOD_CMP_OTHER,               (gr_funcptr) gr_generic_cmp_other},
     {GR_METHOD_CMPABS_OTHER,            (gr_funcptr) gr_generic_cmpabs_other},
+    {GR_METHOD_MIN,                     (gr_funcptr) gr_generic_min},
+    {GR_METHOD_MAX,                     (gr_funcptr) gr_generic_max},
 
     {GR_METHOD_EXP,                     (gr_funcptr) gr_generic_exp},
     {GR_METHOD_EXPM1,                   (gr_funcptr) gr_generic_expm1},
diff --git a/src/python/flint_ctypes.py b/src/python/flint_ctypes.py
index d2641b32e3..7446eef26c 100644
--- a/src/python/flint_ctypes.py
+++ b/src/python/flint_ctypes.py
@@ -1196,6 +1196,41 @@ def csgn(ctx, x):
     def arg(ctx, x):
         return ctx._unary_op(x, libgr.gr_arg, "arg($x)")
 
+    def min(ctx, x, y):
+        """
+            >>> QQ.min(QQ(1)/3, QQ(1)/4)
+            1/4
+            >>> RR.min(3, RR.pi())
+            3.000000000000000
+            >>> RR.min(RR("11 +/- 1"), RR("12 +/- 3"))
+            [1e+1 +/- 2.01]
+            >>> CC.min(2, 3)
+            2.000000000000000
+            >>> CC.min(2, CC.i())
+            Traceback (most recent call last):
+              ...
+            FlintUnableError: failed to compute min(x, y) in {Complex numbers (acb, prec = 53)} for {x = 2.000000000000000}, {y = 1.000000000000000*I}
+
+        """
+        return ctx._binary_op(x, y, libgr.gr_min, "min($x, $y)")
+
+    def max(ctx, x, y):
+        """
+            >>> QQ.max(QQ(1)/3, QQ(1)/4)
+            1/3
+            >>> RR.max(3, RR.pi())
+            [3.141592653589793 +/- 5.61e-16]
+            >>> RR.max(RR("10 +/- 1"), RR("9 +/- 3"))
+            [1e+1 +/- 2.01]
+            >>> CC.max(2, 3)
+            3.000000000000000
+            >>> CC.max(2, CC.i())
+            Traceback (most recent call last):
+              ...
+            FlintUnableError: failed to compute max(x, y) in {Complex numbers (acb, prec = 53)} for {x = 2.000000000000000}, {y = 1.000000000000000*I}
+        """
+        return ctx._binary_op(x, y, libgr.gr_max, "max($x, $y)")
+
     def inf(ctx):
         """
         Positive infinity (for extended number sets which support it).

From 2adcb66989f63b0d536a3cd39b0cd954e57e0c83 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Sun, 14 Jul 2024 15:54:38 +0200
Subject: [PATCH 6/9] some gr_mat norms (1, inf, max, frobenius)

---
 doc/source/gr_mat.rst      |  18 ++++
 src/gr_mat.h               |   5 ++
 src/gr_mat/norm.c          | 167 +++++++++++++++++++++++++++++++++++++
 src/python/flint_ctypes.py |  54 ++++++++++++
 4 files changed, 244 insertions(+)
 create mode 100644 src/gr_mat/norm.c

diff --git a/doc/source/gr_mat.rst b/doc/source/gr_mat.rst
index 68580ef8d7..66a179d077 100644
--- a/doc/source/gr_mat.rst
+++ b/doc/source/gr_mat.rst
@@ -248,6 +248,24 @@ Entrywise operations
     in *mat1* paired with the corresponding entries in *mat2*.
     Returns ``T_FALSE`` if the matrix dimensions are not compatible.
 
+Norms
+-------------------------------------------------------------------------------
+
+.. function:: int gr_mat_norm_max(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx)
+
+    Max norm: `\max_{i,j} |a_{i,j}|`.
+
+.. function:: int gr_mat_norm_1(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx)
+
+    1-norm (largest absolute column sum): `\max_{1\le j \le n} \sum_{i=1}^m |a_{i,j}|`.
+
+.. function:: int gr_mat_norm_inf(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx)
+
+    Infinity-norm (largest absolute row sum): `\max_{1\le i \le m} \sum_{j=1}^n |a_{i,j}|`.
+
+.. function:: int gr_mat_norm_frobenius(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx)
+
+    Frobenius norm: `\sqrt{\sum_{i,j} |a_{i,j}|^2}`.
 
 Arithmetic
 -------------------------------------------------------------------------------
diff --git a/src/gr_mat.h b/src/gr_mat.h
index 4f5a15af1a..d8f3cd282d 100644
--- a/src/gr_mat.h
+++ b/src/gr_mat.h
@@ -290,6 +290,11 @@ WARN_UNUSED_RESULT int gr_mat_exp(gr_mat_t res, const gr_mat_t A, gr_ctx_t ctx);
 WARN_UNUSED_RESULT int gr_mat_log_jordan(gr_mat_t res, const gr_mat_t A, gr_ctx_t ctx);
 WARN_UNUSED_RESULT int gr_mat_log(gr_mat_t res, const gr_mat_t A, gr_ctx_t ctx);
 
+WARN_UNUSED_RESULT int gr_mat_norm_max(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx);
+WARN_UNUSED_RESULT int gr_mat_norm_1(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx);
+WARN_UNUSED_RESULT int gr_mat_norm_inf(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx);
+WARN_UNUSED_RESULT int gr_mat_norm_frobenius(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx);
+
 /* Test functions */
 
 void gr_mat_test_mul(gr_method_mat_binary_op mul_impl, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
diff --git a/src/gr_mat/norm.c b/src/gr_mat/norm.c
new file mode 100644
index 0000000000..c5cc7abdad
--- /dev/null
+++ b/src/gr_mat/norm.c
@@ -0,0 +1,167 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "gr.h"
+#include "gr_mat.h"
+
+/* todo: allow overloading the following methods (or at least use
+         vector functions)
+   todo: quick bound versions */
+
+/* Max norm: the largest gr_abs() of any entry; zero for an empty matrix. */
+int
+gr_mat_norm_max(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx)
+{
+    slong nrows = gr_mat_nrows(mat, ctx);
+    slong ncols = gr_mat_ncols(mat, ctx);
+    slong sz = ctx->sizeof_elem;
+    slong i, j;
+    int status = GR_SUCCESS;
+    gr_ptr t;
+
+    if (nrows == 0 || ncols == 0)
+        return gr_zero(res, ctx);
+
+    GR_TMP_INIT(t, ctx);
+
+    for (i = 0; i < nrows; i++)
+    {
+        for (j = 0; j < ncols; j++)
+        {
+            if (i != 0 || j != 0)
+            {
+                status |= gr_abs(t, GR_MAT_ENTRY(mat, i, j, sz), ctx);
+                status |= gr_max(res, res, t, ctx);
+            }
+            else  /* first entry seeds the running maximum */
+                status |= gr_abs(res, GR_MAT_ENTRY(mat, i, j, sz), ctx);
+        }
+    }
+
+    GR_TMP_CLEAR(t, ctx);
+
+    return status;
+}
+
+/* 1-norm: the largest column sum of gr_abs() values.
+   The result is zero for a matrix with no rows or no columns. */
+int
+gr_mat_norm_1(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx)
+{
+    slong nrows = gr_mat_nrows(mat, ctx);
+    slong ncols = gr_mat_ncols(mat, ctx);
+    slong sz = ctx->sizeof_elem;
+    slong i, j;
+    int status = GR_SUCCESS;
+    gr_ptr s, t;
+
+    if (nrows == 0 || ncols == 0)
+        return gr_zero(res, ctx);
+
+    GR_TMP_INIT2(s, t, ctx);
+
+    for (j = 0; j < ncols; j++)
+    {
+        /* s = sum of |a_{i,j}| down column j
+           (row 0 exists because the empty case returned above) */
+        status |= gr_abs(s, GR_MAT_ENTRY(mat, 0, j, sz), ctx);
+        for (i = 1; i < nrows; i++)
+        {
+            status |= gr_abs(t, GR_MAT_ENTRY(mat, i, j, sz), ctx);
+            status |= gr_add(s, s, t, ctx);
+        }
+
+        /* the first column sum becomes res; later ones are merged with gr_max */
+        if (j != 0)
+            status |= gr_max(res, res, s, ctx);
+        else
+            gr_swap(res, s, ctx);
+    }
+
+    GR_TMP_CLEAR2(s, t, ctx);
+
+    return status;
+}
+
+/* Infinity-norm: the largest row sum of gr_abs() values.
+   The result is zero for a matrix with no rows or no columns. */
+int
+gr_mat_norm_inf(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx)
+{
+    slong nrows = gr_mat_nrows(mat, ctx);
+    slong ncols = gr_mat_ncols(mat, ctx);
+    slong sz = ctx->sizeof_elem;
+    slong i, j;
+    int status = GR_SUCCESS;
+    gr_ptr s, t;
+
+    if (nrows == 0 || ncols == 0)
+        return gr_zero(res, ctx);
+
+    GR_TMP_INIT2(s, t, ctx);
+
+    for (i = 0; i < nrows; i++)
+    {
+        /* s = sum of |a_{i,j}| along row i
+           (column 0 exists because the empty case returned above) */
+        status |= gr_abs(s, GR_MAT_ENTRY(mat, i, 0, sz), ctx);
+        for (j = 1; j < ncols; j++)
+        {
+            status |= gr_abs(t, GR_MAT_ENTRY(mat, i, j, sz), ctx);
+            status |= gr_add(s, s, t, ctx);
+        }
+
+        /* the first row sum becomes res; later ones are merged with gr_max */
+        if (i != 0)
+            status |= gr_max(res, res, s, ctx);
+        else
+            gr_swap(res, s, ctx);
+    }
+
+    GR_TMP_CLEAR2(s, t, ctx);
+
+    return status;
+}
+
+/* Frobenius norm: gr_sqrt of the sum of squared gr_abs() values; zero if empty. */
+int
+gr_mat_norm_frobenius(gr_ptr res, const gr_mat_t mat, gr_ctx_t ctx)
+{
+    slong nrows = gr_mat_nrows(mat, ctx);
+    slong ncols = gr_mat_ncols(mat, ctx);
+    slong sz = ctx->sizeof_elem;
+    slong i, j;
+    int status = GR_SUCCESS;
+    gr_ptr t;
+
+    if (nrows == 0 || ncols == 0)
+        return gr_zero(res, ctx);
+
+    GR_TMP_INIT(t, ctx);
+
+    /* accumulate the sum of |a_{i,j}|^2 directly in res */
+    status |= gr_zero(res, ctx);
+    for (i = 0; i < nrows; i++)
+    {
+        for (j = 0; j < ncols; j++)
+        {
+            status |= gr_abs(t, GR_MAT_ENTRY(mat, i, j, sz), ctx);
+            status |= gr_sqr(t, t, ctx);
+            status |= gr_add(res, res, t, ctx);
+        }
+    }
+
+    status |= gr_sqrt(res, res, ctx);
+
+    GR_TMP_CLEAR(t, ctx);
+
+    return status;
+}
diff --git a/src/python/flint_ctypes.py b/src/python/flint_ctypes.py
index 7446eef26c..f12975d88c 100644
--- a/src/python/flint_ctypes.py
+++ b/src/python/flint_ctypes.py
@@ -5509,6 +5509,60 @@ def __setitem__(self, ij, v):
             if status & GR_DOMAIN: raise ValueError
         return x
 
+    def norm_max(self):
+        """
+            >>> Mat(RR)([[1,2,3],[4,5,6],[7,8,9]]).norm_max()
+            9.000000000000000
+        """
+        element_ring = self.parent()._element_ring
+        res = element_ring()
+        status = libgr.gr_mat_norm_max(res._ref, self._ref, element_ring._ref)
+        if status:
+            if status & GR_UNABLE: raise NotImplementedError
+            if status & GR_DOMAIN: raise ValueError
+        return res
+
+
+    def norm_1(self):
+        """
+            >>> Mat(RR)([[1,2,3],[4,5,6],[7,8,9]]).norm_1()
+            18.00000000000000
+        """
+        element_ring = self.parent()._element_ring
+        res = element_ring()
+        status = libgr.gr_mat_norm_1(res._ref, self._ref, element_ring._ref)
+        if status:
+            if status & GR_UNABLE: raise NotImplementedError
+            if status & GR_DOMAIN: raise ValueError
+        return res
+
+    def norm_inf(self):
+        """
+            >>> Mat(RR)([[1,2,3],[4,5,6],[7,8,9]]).norm_inf()
+            24.00000000000000
+        """
+        element_ring = self.parent()._element_ring
+        res = element_ring()
+        status = libgr.gr_mat_norm_inf(res._ref, self._ref, element_ring._ref)
+        if status:
+            if status & GR_UNABLE: raise NotImplementedError
+            if status & GR_DOMAIN: raise ValueError
+        return res
+
+    def norm_frobenius(self):
+        """
+            >>> Mat(RR)([[1,2,3],[4,5,6],[7,8,9]]).norm_frobenius()
+            [16.88194301613413 +/- 3.73e-15]
+        """
+        element_ring = self.parent()._element_ring
+        res = element_ring()
+        status = libgr.gr_mat_norm_frobenius(res._ref, self._ref, element_ring._ref)
+        if status:
+            if status & GR_UNABLE: raise NotImplementedError
+            if status & GR_DOMAIN: raise ValueError
+        return res
+
+
     def nullspace(self):
         """
         Right kernel (nullspace) of this matrix.

From 0cd1a025212b0814bdf821e042eca929557d3e4d Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Sun, 14 Jul 2024 16:12:43 +0200
Subject: [PATCH 7/9] more approx matrix mul test code

---
 doc/source/gr_mat.rst        |   7 +++
 src/gr_mat.h                 |   1 +
 src/gr_mat/test_approx_mul.c | 115 +++++++++++++++++++++++++++++++++++
 src/nfloat/test/t-mat_mul.c  |  36 +++++++++++
 4 files changed, 159 insertions(+)

diff --git a/doc/source/gr_mat.rst b/doc/source/gr_mat.rst
index 66a179d077..93d8d2b9fa 100644
--- a/doc/source/gr_mat.rst
+++ b/doc/source/gr_mat.rst
@@ -855,6 +855,13 @@ on each test iteration, otherwise the given ring is tested.
     Tests the given function ``solve_impl`` for correctness as an implementation
     of :func:`gr_mat_nonsingular_solve_tril` / :func:`gr_mat_nonsingular_solve_triu`.
 
+.. function:: void gr_mat_test_approx_mul_max_norm(gr_method_mat_binary_op mul_impl, gr_srcptr rel_tol, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx)
+
+    Tests the given implementation of matrix multiplication for accuracy
+    over an approximate numerical ring by checking that
+    `|C-AB| \le |A||B| rel\_tol` holds in the max norm,
+    using classical multiplication for reference.
+
 .. function:: void gr_mat_test_approx_mul_pos_entrywise_accurate(gr_method_mat_binary_op mul_impl, gr_srcptr rel_tol, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx)
 
     Tests the given implementation of matrix multiplication for accuracy
diff --git a/src/gr_mat.h b/src/gr_mat.h
index d8f3cd282d..68b36147f1 100644
--- a/src/gr_mat.h
+++ b/src/gr_mat.h
@@ -302,6 +302,7 @@ void gr_mat_test_lu(gr_method_mat_lu_op lu_impl, flint_rand_t state, slong iters
 void gr_mat_test_det(gr_method_mat_unary_op_get_scalar det_impl, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
 void gr_mat_test_nonsingular_solve_tril(gr_method_mat_binary_op_with_flag solve_impl, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
 void gr_mat_test_nonsingular_solve_triu(gr_method_mat_binary_op_with_flag solve_impl, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
+void gr_mat_test_approx_mul_max_norm(gr_method_mat_binary_op mul_impl, gr_srcptr rel_tol, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
 void gr_mat_test_approx_mul_pos_entrywise_accurate(gr_method_mat_binary_op mul_impl, gr_srcptr rel_tol, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx);
 
 #ifdef __cplusplus
diff --git a/src/gr_mat/test_approx_mul.c b/src/gr_mat/test_approx_mul.c
index a491254a04..e3c3c69424 100644
--- a/src/gr_mat/test_approx_mul.c
+++ b/src/gr_mat/test_approx_mul.c
@@ -12,6 +12,121 @@
 #include "gr.h"
 #include "gr_mat.h"
 
+/* Checks that max_norm(C - A*B) <= max_norm(A) * max_norm(B) * rel_tol, where C comes from
+   mul_impl and A*B from gr_mat_mul_classical. A NULL ctx uses a random context per iteration. */
+void gr_mat_test_approx_mul_max_norm(gr_method_mat_binary_op mul_impl, gr_srcptr rel_tol, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx)
+{
+    slong iter;
+    gr_ctx_ptr given_ctx = ctx;
+
+    for (iter = 0; iter < iters; iter++)
+    {
+        gr_mat_t A, B, C, D, ERR;
+        gr_ptr err, amag, bmag, tol;
+        slong a, b, c;
+        int status = GR_SUCCESS;
+        gr_ctx_t my_ctx;
+        gr_ctx_ptr ctx;
+
+        if (given_ctx == NULL)
+        {
+            gr_ctx_init_random(my_ctx, state);
+            ctx = my_ctx;
+        }
+        else
+            ctx = given_ctx;
+
+        if (n_randint(state, 4) == 0)
+            a = b = c = n_randint(state, maxn);
+        else
+        {
+            a = n_randint(state, maxn);
+            b = n_randint(state, maxn);
+            c = n_randint(state, maxn);
+        }
+
+        gr_mat_init(A, a, b, ctx);
+        gr_mat_init(B, b, c, ctx);
+        gr_mat_init(C, a, c, ctx);
+        gr_mat_init(D, a, c, ctx);
+        gr_mat_init(ERR, a, c, ctx);
+        err = gr_heap_init(ctx);
+        amag = gr_heap_init(ctx);
+        bmag = gr_heap_init(ctx);
+        tol = gr_heap_init(ctx);
+
+        status |= gr_mat_randtest(A, state, ctx);
+        status |= gr_mat_randtest(B, state, ctx);
+        status |= gr_mat_entrywise_unary_op(A, (gr_method_unary_op) gr_abs, A, ctx);
+        status |= gr_mat_entrywise_unary_op(B, (gr_method_unary_op) gr_abs, B, ctx);
+
+        status |= gr_mat_randtest(C, state, ctx);
+        status |= gr_mat_randtest(D, state, ctx);
+
+        /* Randomly exercise aliasing of the output with one or both inputs. */
+        if (b == c && n_randint(state, 2))
+        {
+            status |= gr_mat_set(C, A, ctx);
+            status |= mul_impl(C, C, B, ctx);
+        }
+        else if (a == b && n_randint(state, 2))
+        {
+            status |= gr_mat_set(C, B, ctx);
+            status |= mul_impl(C, A, C, ctx);
+        }
+        else if (a == b && b == c && n_randint(state, 2))
+        {
+            status |= gr_mat_set(B, A, ctx);
+            status |= mul_impl(C, A, A, ctx);
+        }
+        else if (a == b && b == c && n_randint(state, 2))
+        {
+            status |= gr_mat_set(B, A, ctx);
+            status |= gr_mat_set(C, A, ctx);
+            status |= mul_impl(C, C, C, ctx);
+        }
+        else
+        {
+            status |= mul_impl(C, A, B, ctx);
+        }
+
+        status |= gr_mat_mul_classical(D, A, B, ctx);
+        status |= gr_mat_sub(ERR, C, D, ctx);
+        status |= gr_mat_norm_max(err, ERR, ctx);
+        status |= gr_mat_norm_max(amag, A, ctx);
+        status |= gr_mat_norm_max(bmag, B, ctx);
+        status |= gr_mul(tol, amag, bmag, ctx);
+        status |= gr_mul(tol, tol, rel_tol, ctx);
+
+        if (status == GR_SUCCESS && gr_le(err, tol, ctx) == T_FALSE)
+        {
+            flint_printf("FAIL:\n");
+            gr_ctx_println(ctx);
+            flint_printf("A:\n"); gr_mat_print(A, ctx); flint_printf("\n\n");
+            flint_printf("B:\n"); gr_mat_print(B, ctx); flint_printf("\n\n");
+            flint_printf("C:\n"); gr_mat_print(C, ctx); flint_printf("\n\n");
+            flint_printf("D:\n"); gr_mat_print(D, ctx); flint_printf("\n\n");
+            flint_printf("ERR:\n"); gr_mat_print(ERR, ctx); flint_printf("\n\n");
+            flint_printf("err:\n"); gr_println(err, ctx); flint_printf("\n\n");
+            flint_printf("tol:\n"); gr_println(tol, ctx); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        gr_mat_clear(A, ctx);
+        gr_mat_clear(B, ctx);
+        gr_mat_clear(C, ctx);
+        gr_mat_clear(D, ctx);
+        gr_mat_clear(ERR, ctx);
+        gr_heap_clear(err, ctx);
+        gr_heap_clear(amag, ctx);
+        gr_heap_clear(bmag, ctx);
+        gr_heap_clear(tol, ctx);
+
+        if (given_ctx == NULL)
+            gr_ctx_clear(ctx);
+    }
+}
+
 void gr_mat_test_approx_mul_pos_entrywise_accurate(gr_method_mat_binary_op mul_impl, gr_srcptr rel_tol, flint_rand_t state, slong iters, slong maxn, gr_ctx_t ctx)
 {
     slong iter;
diff --git a/src/nfloat/test/t-mat_mul.c b/src/nfloat/test/t-mat_mul.c
index 4ac640931b..abaab56e65 100644
--- a/src/nfloat/test/t-mat_mul.c
+++ b/src/nfloat/test/t-mat_mul.c
@@ -27,6 +27,42 @@ TEST_FUNCTION_START(mat_mul, state)
     slong iter;
     gr_ptr tol;
 
+    for (iter = 0; iter < 10 * flint_test_multiplier(); iter++)
+    {
+        if (n_randint(state, 5))
+            prec = FLINT_BITS * (1 + n_randint(state, 4));
+        else
+            prec = FLINT_BITS * (1 + n_randint(state, NFLOAT_MAX_LIMBS));
+
+        nfloat_ctx_init(ctx, prec, 0);
+
+        tol = gr_heap_init(ctx);
+        GR_MUST_SUCCEED(gr_one(tol, ctx));
+        GR_MUST_SUCCEED(gr_mul_2exp_si(tol, tol, -prec + 2, ctx));
+
+        gr_mat_test_approx_mul_max_norm(
+            (gr_method_mat_binary_op) _nfloat_mat_mul_waksman,
+            tol, state, (prec <= 256) ? 10 : 1, 10, ctx);
+
+        gr_mat_test_approx_mul_max_norm(
+            (gr_method_mat_binary_op) nfloat_mat_mul_block1,
+            tol, state, (prec <= 256) ? 10 : 1,
+                        (prec <= 256) ? 40 : 20, ctx);
+
+        gr_mat_test_approx_mul_max_norm(
+            (gr_method_mat_binary_op) _nfloat_mat_mul_fixed_classical,
+            tol, state, (prec <= 256) ? 10 : 1,
+                        (prec <= 256) ? 40 : 20, ctx);
+
+        if (n_randint(state, 4) == 0)
+            gr_mat_test_approx_mul_max_norm(
+                (gr_method_mat_binary_op) nfloat_mat_mul,
+                tol, state, 1, 120, ctx);
+
+        gr_heap_clear(tol, ctx);
+        gr_ctx_clear(ctx);
+    }
+
     for (iter = 0; iter < 100 * flint_test_multiplier(); iter++)
     {
         if (n_randint(state, 5))

From 79b7c89544d086b20059d2be40e73d0b8cef0738 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Mon, 15 Jul 2024 06:16:03 +0200
Subject: [PATCH 8/9] small perf fix; renaming

---
 doc/source/nfloat.rst       | 6 ++++--
 src/nfloat.h                | 4 ++--
 src/nfloat/mat_mul.c        | 6 +++---
 src/nfloat/test/t-mat_mul.c | 8 ++++----
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/doc/source/nfloat.rst b/doc/source/nfloat.rst
index 763d129f9a..d708a16c6a 100644
--- a/doc/source/nfloat.rst
+++ b/doc/source/nfloat.rst
@@ -317,11 +317,13 @@ code for reduced overhead.
 Matrix functions
 -------------------------------------------------------------------------------
 
-.. function:: int _nfloat_mat_mul_fixed_classical(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
-              int _nfloat_mat_mul_waksman(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+.. function:: int nfloat_mat_mul_fixed_classical(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+              int nfloat_mat_mul_waksman(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
               int nfloat_mat_mul_block(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, slong min_block_size, gr_ctx_t ctx)
               int nfloat_mat_mul(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
 
+    Different implementations of matrix multiplication.
+
 Internal functions
 -------------------------------------------------------------------------------
 
diff --git a/src/nfloat.h b/src/nfloat.h
index 4a5d61497a..78b764a213 100644
--- a/src/nfloat.h
+++ b/src/nfloat.h
@@ -453,8 +453,8 @@ int _nfloat_vec_submul_scalar(nfloat_ptr res, nfloat_srcptr x, slong len, nfloat
 int _nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx);
 int _nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx);
 
-int _nfloat_mat_mul_fixed_classical(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx);
-int _nfloat_mat_mul_waksman(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx);
+int nfloat_mat_mul_fixed_classical(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx);
+int nfloat_mat_mul_waksman(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx);
 int nfloat_mat_mul_block(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, slong min_block_size, gr_ctx_t ctx);
 int nfloat_mat_mul(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx);
 
diff --git a/src/nfloat/mat_mul.c b/src/nfloat/mat_mul.c
index b943a4c33d..57253ed8ed 100644
--- a/src/nfloat/mat_mul.c
+++ b/src/nfloat/mat_mul.c
@@ -396,7 +396,7 @@ _nfloat_mat_mul_fixed(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, int waksma
 
     extra_bits = Adelta + Bdelta + pad_top + pad_bot;
 
-    if (extra_bits > max_extra_bits)
+    if (extra_bits >= max_extra_bits)
         return gr_mat_mul_classical(C, A, B, ctx);
 
     Aexp = Amax + pad_top;
@@ -408,13 +408,13 @@ _nfloat_mat_mul_fixed(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, int waksma
 }
 
 int
-_nfloat_mat_mul_fixed_classical(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+nfloat_mat_mul_fixed_classical(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
 {
     return _nfloat_mat_mul_fixed(C, A, B, 0, 100000, ctx);
 }
 
 int
-_nfloat_mat_mul_waksman(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
+nfloat_mat_mul_waksman(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, gr_ctx_t ctx)
 {
     return _nfloat_mat_mul_fixed(C, A, B, 1, 100000, ctx);
 }
diff --git a/src/nfloat/test/t-mat_mul.c b/src/nfloat/test/t-mat_mul.c
index abaab56e65..dad6ad625c 100644
--- a/src/nfloat/test/t-mat_mul.c
+++ b/src/nfloat/test/t-mat_mul.c
@@ -41,7 +41,7 @@ TEST_FUNCTION_START(mat_mul, state)
         GR_MUST_SUCCEED(gr_mul_2exp_si(tol, tol, -prec + 2, ctx));
 
         gr_mat_test_approx_mul_max_norm(
-            (gr_method_mat_binary_op) _nfloat_mat_mul_waksman,
+            (gr_method_mat_binary_op) nfloat_mat_mul_waksman,
             tol, state, (prec <= 256) ? 10 : 1, 10, ctx);
 
         gr_mat_test_approx_mul_max_norm(
@@ -50,7 +50,7 @@ TEST_FUNCTION_START(mat_mul, state)
                         (prec <= 256) ? 40 : 20, ctx);
 
         gr_mat_test_approx_mul_max_norm(
-            (gr_method_mat_binary_op) _nfloat_mat_mul_fixed_classical,
+            (gr_method_mat_binary_op) nfloat_mat_mul_fixed_classical,
             tol, state, (prec <= 256) ? 10 : 1,
                         (prec <= 256) ? 40 : 20, ctx);
 
@@ -77,7 +77,7 @@ TEST_FUNCTION_START(mat_mul, state)
         GR_MUST_SUCCEED(gr_mul_2exp_si(tol, tol, -prec + 6, ctx));
 
         gr_mat_test_approx_mul_pos_entrywise_accurate(
-            (gr_method_mat_binary_op) _nfloat_mat_mul_waksman,
+            (gr_method_mat_binary_op) nfloat_mat_mul_waksman,
             tol, state, (prec <= 256) ? 10 : 1, 10, ctx);
 
         gr_mat_test_approx_mul_pos_entrywise_accurate(
@@ -86,7 +86,7 @@ TEST_FUNCTION_START(mat_mul, state)
                         (prec <= 256) ? 40 : 20, ctx);
 
         gr_mat_test_approx_mul_pos_entrywise_accurate(
-            (gr_method_mat_binary_op) _nfloat_mat_mul_fixed_classical,
+            (gr_method_mat_binary_op) nfloat_mat_mul_fixed_classical,
             tol, state, (prec <= 256) ? 10 : 1,
                         (prec <= 256) ? 40 : 20, ctx);
 

From b2aeff61418f6ee8e4f2eaf4e71da31260ff8960 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Mon, 15 Jul 2024 07:38:21 +0200
Subject: [PATCH 9/9] bump threshold

---
 src/nfloat/mat_mul.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/nfloat/mat_mul.c b/src/nfloat/mat_mul.c
index 57253ed8ed..169722a3ec 100644
--- a/src/nfloat/mat_mul.c
+++ b/src/nfloat/mat_mul.c
@@ -854,7 +854,12 @@ nfloat_mat_mul_block(gr_mat_t C, const gr_mat_t A, const gr_mat_t B, slong min_b
 }
 
 /* Minimum precision for using fixed-point arithmetic */
-#define NFLOAT_MAT_MUL_FIXED_CUTOFF 192
+
+/* TODO: for *unsigned* matrices, there is a speedup already for
+   prec = 192. Consider inlining fixed-point additions/subtractions for
+   4 and 5 limbs to extend this to general matrices. */
+/* #define NFLOAT_MAT_MUL_FIXED_CUTOFF 192 */
+#define NFLOAT_MAT_MUL_FIXED_CUTOFF 320
 
 /* first cutoff:  classical -> fixed_classical
    second cutoff: fixed_classical -> waksman */