From 65a58a7ee31bd772c7de61427e08f8ee6cc9d258 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Mon, 22 Jan 2024 16:02:40 +0100
Subject: [PATCH 1/8] fmt AUTHORS

---
 AUTHORS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/AUTHORS b/AUTHORS
index 6ef3b793e2..297c0f9f44 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -455,7 +455,7 @@ Mathieu Gouttenoire
 
   Primality testing for Gaussian integers.
 
-  github math-gout
+  github   math-gout
 
 Michael Abshoff
 

From 3d1f801b739c5c42bc881365d316bd87aa94feae Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Wed, 24 Jan 2024 23:40:11 +0100
Subject: [PATCH 2/8] faster fmpz dot products (WIP)

---
 src/fmpq_poly/exp_series.c                |   9 +-
 src/fmpq_poly/sin_cos_series.c            |   1 +
 src/fmpz_mat/mul.c                        |   2 +-
 src/fmpz_mat/mul_classical.c              |  65 +++-
 src/fmpz_mod_mat/mul_classical_threaded.c |  11 +
 src/fmpz_poly/inv_series.c                |   9 +-
 src/fmpz_poly/mul_classical.c             |  34 +-
 src/fmpz_poly/mullow_classical.c          |  39 ++-
 src/fmpz_poly/sqr_classical.c             |  39 ++-
 src/fmpz_poly/sqrlow_classical.c          |  43 ++-
 src/fmpz_vec.h                            |  12 +-
 src/fmpz_vec/dot.c                        | 382 +++++++++++++++++++++-
 src/gr/fmpz.c                             |  65 +---
 src/gr/fmpz_mod.c                         |  49 +--
 14 files changed, 554 insertions(+), 206 deletions(-)

diff --git a/src/fmpq_poly/exp_series.c b/src/fmpq_poly/exp_series.c
index 88f8b3d142..6d4b62a85a 100644
--- a/src/fmpq_poly/exp_series.c
+++ b/src/fmpq_poly/exp_series.c
@@ -40,7 +40,7 @@ _fmpq_poly_exp_series_basecase_deriv(fmpz * B, fmpz_t Bden,
     const fmpz * Aprime, const fmpz_t Aden, slong Alen, slong n)
 {
     fmpz_t t, u;
-    slong j, k;
+    slong k;
 
     Alen = FLINT_MIN(Alen, n);
 
@@ -55,11 +55,8 @@ _fmpq_poly_exp_series_basecase_deriv(fmpz * B, fmpz_t Bden,
 
     for (k = 1; k < n; k++)
     {
-        fmpz_mul(t, Aprime, B + k - 1);
-
-        for (j = 2; j < FLINT_MIN(Alen, k + 1); j++)
-            fmpz_addmul(t, Aprime + j - 1, B + k - j);
-
+        slong l = FLINT_MIN(Alen - 1, k);
+        _fmpz_vec_dot_general(t, NULL, 0, Aprime, B + k - l, 1, l);
         fmpz_mul_ui(u, Aden, k);
         fmpz_divexact(B + k, t, u);
     }
diff --git a/src/fmpq_poly/sin_cos_series.c b/src/fmpq_poly/sin_cos_series.c
index ceace1fec0..796783eee2 100644
--- a/src/fmpq_poly/sin_cos_series.c
+++ b/src/fmpq_poly/sin_cos_series.c
@@ -61,6 +61,7 @@ _fmpq_poly_sin_cos_series_basecase_can(fmpz * S, fmpz_t Sden,
         fmpz_zero(t);
         fmpz_zero(u);
 
+        /* todo: precompute A[j] * j, use dot products */
         for (j = 1; j < FLINT_MIN(Alen, k + 1); j++)
         {
             fmpz_mul_ui(v, A + j, j);
diff --git a/src/fmpz_mat/mul.c b/src/fmpz_mat/mul.c
index a2cf467ac7..bbdfa2afb5 100644
--- a/src/fmpz_mat/mul.c
+++ b/src/fmpz_mat/mul.c
@@ -279,6 +279,6 @@ fmpz_mat_mul(fmpz_mat_t C, const fmpz_mat_t A, const fmpz_mat_t B)
         else if (abits >= 500 && bbits >= 500 && dim >= 8)  /* tuning param */
             fmpz_mat_mul_strassen(C, A, B);
         else
-            fmpz_mat_mul_classical_inline(C, A, B);
+            fmpz_mat_mul_classical(C, A, B);
     }
 }
diff --git a/src/fmpz_mat/mul_classical.c b/src/fmpz_mat/mul_classical.c
index 299ac130d3..af6572091a 100644
--- a/src/fmpz_mat/mul_classical.c
+++ b/src/fmpz_mat/mul_classical.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2010,2011 Fredrik Johansson
+    Copyright (C) 2010, 2011, 2024 Fredrik Johansson
 
     This file is part of FLINT.
 
@@ -10,38 +10,71 @@
 */
 
 #include "fmpz.h"
+#include "fmpz_vec.h"
 #include "fmpz_mat.h"
 
 void
 fmpz_mat_mul_classical(fmpz_mat_t C, const fmpz_mat_t A, const fmpz_mat_t B)
 {
-    slong ar, bc, br;
-    slong i, j, k;
+    slong ar, br, bc, i, j;
 
-    ar = A->r;
-    br = B->r;
-    bc = B->c;
+    ar = fmpz_mat_nrows(A);
+    br = fmpz_mat_nrows(B);
+    bc = fmpz_mat_ncols(B);
 
-    if (br == 0)
+    if (ar == 0 || br == 0 || bc == 0)
     {
         fmpz_mat_zero(C);
         return;
     }
 
-    for (i = 0; i < ar; i++)
+    if (br == 1)
     {
-        for (j = 0; j < bc; j++)
+        for (i = 0; i < ar; i++)
         {
-            fmpz_mul(fmpz_mat_entry(C, i, j),
-                     fmpz_mat_entry(A, i, 0),
-                     fmpz_mat_entry(B, 0, j));
+            for (j = 0; j < bc; j++)
+            {
+                fmpz_mul(fmpz_mat_entry(C, i, j),
+                                 fmpz_mat_entry(A, i, 0),
+                                 fmpz_mat_entry(B, 0, j));
+            }
+        }
+    }
+    else if (br == 2)
+    {
+        for (i = 0; i < ar; i++)
+        {
+            for (j = 0; j < bc; j++)
+            {
+                fmpz_fmma(fmpz_mat_entry(C, i, j),
+                                 fmpz_mat_entry(A, i, 0),
+                                 fmpz_mat_entry(B, 0, j),
+                                 fmpz_mat_entry(A, i, 1),
+                                 fmpz_mat_entry(B, 1, j));
+            }
+        }
+    }
+    else
+    {
+        fmpz * tmp;
+        TMP_INIT;
+
+        TMP_START;
+        tmp = TMP_ALLOC(sizeof(fmpz) * br * bc);
 
-            for (k = 1; k < br; k++)
+        for (i = 0; i < br; i++)
+            for (j = 0; j < bc; j++)
+                tmp[j * br + i] = *fmpz_mat_entry(B, i, j);
+
+        for (i = 0; i < ar; i++)
+        {
+            for (j = 0; j < bc; j++)
             {
-                fmpz_addmul(fmpz_mat_entry(C, i, j),
-                            fmpz_mat_entry(A, i, k),
-                            fmpz_mat_entry(B, k, j));
+                _fmpz_vec_dot_general(fmpz_mat_entry(C, i, j),
+                    NULL, 0, fmpz_mat_entry(A, i, 0), tmp + j * br, 0, br);
             }
         }
+
+        TMP_END;
     }
 }
diff --git a/src/fmpz_mod_mat/mul_classical_threaded.c b/src/fmpz_mod_mat/mul_classical_threaded.c
index dcab679382..c2679b9036 100644
--- a/src/fmpz_mod_mat/mul_classical_threaded.c
+++ b/src/fmpz_mod_mat/mul_classical_threaded.c
@@ -24,6 +24,17 @@ with op = 1, computes D = C + A*B
 with op = -1, computes D = C - A*B
 */
 
+static inline void /* c = sum of vec1[i] * vec2[i][offset] for 0 <= i < len */
+_fmpz_vec_dot_ptr(fmpz_t c, const fmpz * vec1, fmpz ** const vec2,
+                                                       slong offset, slong len)
+{
+    slong i;
+
+    fmpz_zero(c); /* start from zero, so len <= 0 leaves c = 0 */
+    for (i = 0; i < len; i++)
+        fmpz_addmul(c, vec1 + i, vec2[i] + offset); /* entry `offset` of the array vec2[i] points to */
+}
+
 static inline void
 _fmpz_mod_mat_addmul_basic_op(fmpz ** D, fmpz ** const C, fmpz ** const A,
                fmpz ** const B, slong m, slong k, slong n, int op, fmpz_t p)
diff --git a/src/fmpz_poly/inv_series.c b/src/fmpz_poly/inv_series.c
index e9affd9771..eae17bf875 100644
--- a/src/fmpz_poly/inv_series.c
+++ b/src/fmpz_poly/inv_series.c
@@ -220,13 +220,8 @@ _fmpz_poly_inv_series_basecase(fmpz * Qinv, const fmpz * Q, slong Qlen, slong n)
             }
             else
             {
-                fmpz_mul(Qinv + i, Q + 1, Qinv + i - 1);
-
-                for (j = 2; j < FLINT_MIN(i + 1, Qlen); j++)
-                    fmpz_addmul(Qinv + i, Q + j, Qinv + i - j);
-
-                if (neg)
-                    fmpz_neg(Qinv + i, Qinv + i);
+                slong l = FLINT_MIN(i, Qlen - 1);
+                _fmpz_vec_dot_general(Qinv + i, NULL, neg, Q + 1, Qinv + i - l, 1, l);
             }
         }
 
diff --git a/src/fmpz_poly/mul_classical.c b/src/fmpz_poly/mul_classical.c
index 844ddc135d..71d5e36fc1 100644
--- a/src/fmpz_poly/mul_classical.c
+++ b/src/fmpz_poly/mul_classical.c
@@ -1,5 +1,6 @@
 /*
     Copyright (C) 2008, 2009 William Hart
+    Copyright (C) 2024 Fredrik Johansson
 
     This file is part of FLINT.
 
@@ -18,26 +19,37 @@ void
 _fmpz_poly_mul_classical(fmpz * res, const fmpz * poly1,
                          slong len1, const fmpz * poly2, slong len2)
 {
-    if (len1 == 1 && len2 == 1) /* Special case if the length of both inputs is 1 */
+    slong i, top1, top2;
+
+    if (len1 == 1 && len2 == 1)
     {
         fmpz_mul(res, poly1, poly2);
+        return;
     }
-    else                        /* Ordinary case */
+
+    if (len1 == 1)
     {
-        slong i;
+        _fmpz_vec_scalar_mul_fmpz(res, poly2, len2, poly1);
+        return;
+    }
 
-        /* Set res[i] = poly1[i]*poly2[0] */
+    if (len2 == 1)
+    {
         _fmpz_vec_scalar_mul_fmpz(res, poly1, len1, poly2);
+        return;
+    }
 
-        /* Set res[i+len1-1] = in1[len1-1]*in2[i] */
-        _fmpz_vec_scalar_mul_fmpz(res + len1, poly2 + 1, len2 - 1,
-                                  poly1 + len1 - 1);
+    fmpz_mul(res, poly1, poly2);
 
-        /* out[i+j] += in1[i]*in2[j] */
-        for (i = 0; i < len1 - 1; i++)
-            _fmpz_vec_scalar_addmul_fmpz(res + i + 1, poly2 + 1, len2 - 1,
-                                         poly1 + i);
+    for (i = 1; i < len1 + len2 - 2; i++)
+    {
+        top1 = FLINT_MIN(len1 - 1, i);
+        top2 = FLINT_MIN(len2 - 1, i);
+
+        _fmpz_vec_dot_general(res + i, NULL, 0, poly1 + i - top2, poly2 + i - top1, 1, top1 + top2 - i + 1);
     }
+
+    fmpz_mul(res + len1 + len2 - 2, poly1 + len1 - 1, poly2 + len2 - 1);
 }
 
 void
diff --git a/src/fmpz_poly/mullow_classical.c b/src/fmpz_poly/mullow_classical.c
index 94553b1bc1..bcfa94e3e3 100644
--- a/src/fmpz_poly/mullow_classical.c
+++ b/src/fmpz_poly/mullow_classical.c
@@ -1,5 +1,6 @@
 /*
     Copyright (C) 2008, 2009 William Hart
+    Copyright (C) 2024 Fredrik Johansson
 
     This file is part of FLINT.
 
@@ -20,27 +21,37 @@ void
 _fmpz_poly_mullow_classical(fmpz * res, const fmpz * poly1, slong len1,
                                         const fmpz * poly2, slong len2, slong n)
 {
-    if ((len1 == 1 && len2 == 1) || n == 1) /* Special case if the length of output is 1 */
+    slong i, top1, top2;
+
+    len1 = FLINT_MIN(len1, n);
+    len2 = FLINT_MIN(len2, n);
+
+    if (n == 1)
     {
         fmpz_mul(res, poly1, poly2);
+        return;
     }
-    else                        /* Ordinary case */
+
+    if (len1 == 1)
     {
-        slong i;
+        _fmpz_vec_scalar_mul_fmpz(res, poly2, n, poly1);
+        return;
+    }
 
-        /* Set res[i] = poly1[i]*poly2[0] */
-        _fmpz_vec_scalar_mul_fmpz(res, poly1, FLINT_MIN(len1, n), poly2);
+    if (len2 == 1)
+    {
+        _fmpz_vec_scalar_mul_fmpz(res, poly1, n, poly2);
+        return;
+    }
 
-        /* Set res[i+len1-1] = in1[len1-1]*in2[i] */
-        if (n > len1)
-            _fmpz_vec_scalar_mul_fmpz(res + len1, poly2 + 1, n - len1,
-                                      poly1 + len1 - 1);
+    fmpz_mul(res, poly1, poly2);
+
+    for (i = 1; i < n; i++)
+    {
+        top1 = FLINT_MIN(len1 - 1, i);
+        top2 = FLINT_MIN(len2 - 1, i);
 
-        /* out[i+j] += in1[i]*in2[j] */
-        for (i = 0; i < FLINT_MIN(len1, n) - 1; i++)
-            _fmpz_vec_scalar_addmul_fmpz(res + i + 1, poly2 + 1,
-                                         FLINT_MIN(len2, n - i) - 1,
-                                         poly1 + i);
+        _fmpz_vec_dot_general(res + i, NULL, 0, poly1 + i - top2, poly2 + i - top1, 1, top1 + top2 - i + 1);
     }
 }
 
diff --git a/src/fmpz_poly/sqr_classical.c b/src/fmpz_poly/sqr_classical.c
index d9b112b6ae..2352dfd719 100644
--- a/src/fmpz_poly/sqr_classical.c
+++ b/src/fmpz_poly/sqr_classical.c
@@ -1,6 +1,7 @@
 /*
     Copyright (C) 2008, 2009 William Hart
     Copyright (C) 2011 Sebastian Pancratz
+    Copyright (C) 2024 Fredrik Johansson
 
     This file is part of FLINT.
 
@@ -15,29 +16,39 @@
 #include "fmpz_poly.h"
 
 /* Assumes len > 0. */
-void _fmpz_poly_sqr_classical(fmpz *rop, const fmpz *op, slong len)
+void _fmpz_poly_sqr_classical(fmpz *res, const fmpz *op, slong len)
 {
-    if (len == 1)  /* Special case */
+    slong i, start, stop;
+
+    if (len == 1)
     {
-        fmpz_mul(rop, op, op);
+        fmpz_mul(res, op, op);
+        return;
     }
-    else   /* Ordinary case */
-    {
-        slong i;
 
-        _fmpz_vec_scalar_mul_fmpz(rop, op, len, op);
+    fmpz_mul(res, op, op);
+    fmpz_mul(res + 1, op, op + 1);
+    fmpz_mul_2exp(res + 1, res + 1, 1);
 
-        _fmpz_vec_scalar_mul_fmpz(rop + len, op + 1, len - 1, op + len - 1);
+    for (i = 2; i < 2 * len - 3; i++)
+    {
+        start = FLINT_MAX(0, i - len + 1);
+        stop = FLINT_MIN(len - 1, (i + 1) / 2 - 1);
 
-        for (i = 1; i < len - 1; i++)
-            _fmpz_vec_scalar_addmul_fmpz(rop + i + 1, op + 1, i - 1, op + i);
+        _fmpz_vec_dot_general(res + i, NULL, 0, op + start, op + i - stop, 1, stop - start + 1);
+        fmpz_mul_2exp(res + i, res + i, 1);
 
-        for (i = 1; i < 2 * len - 2; i++)
-            fmpz_mul_ui(rop + i, rop + i, 2);
+        if (i % 2 == 0 && i / 2 < len)
+            fmpz_addmul(res + i, op + i / 2, op + i / 2);
+    }
 
-        for (i = 1; i < len - 1; i++)
-            fmpz_addmul(rop + 2 * i, op + i, op + i);
+    if (len > 2)
+    {
+        fmpz_mul(res + 2 * len - 3, op + len - 1, op + len - 2);
+        fmpz_mul_2exp(res + 2 * len - 3, res + 2 * len - 3, 1);
     }
+
+    fmpz_mul(res + 2 * len - 2, op + len - 1, op + len - 1);
 }
 
 void fmpz_poly_sqr_classical(fmpz_poly_t rop, const fmpz_poly_t op)
diff --git a/src/fmpz_poly/sqrlow_classical.c b/src/fmpz_poly/sqrlow_classical.c
index adf9f1afef..c55390fa02 100644
--- a/src/fmpz_poly/sqrlow_classical.c
+++ b/src/fmpz_poly/sqrlow_classical.c
@@ -1,5 +1,6 @@
 /*
     Copyright (C) 2011 Sebastian Pancratz
+    Copyright (C) 2024 Fredrik Johansson
 
     This file is part of FLINT.
 
@@ -16,30 +17,42 @@
 /*
     Assumes len > 0 and 0 < n <= 2 * len - 1.
  */
-void _fmpz_poly_sqrlow_classical(fmpz *rop, const fmpz *op, slong len, slong n)
+void _fmpz_poly_sqrlow_classical(fmpz *res, const fmpz *op, slong len, slong n)
 {
-    if (len == 1 || n == 1)  /* Special case */
+    slong i, start, stop;
+
+    len = FLINT_MIN(len, n);
+
+    if (n == 1)
     {
-        fmpz_mul(rop, op, op);
+        fmpz_mul(res, op, op);
+        return;
     }
-    else   /* Ordinary case */
-    {
-        slong i;
 
-        _fmpz_vec_scalar_mul_fmpz(rop, op, FLINT_MIN(len, n), op);
+    fmpz_mul(res, op, op);
+    fmpz_mul(res + 1, op, op + 1);
+    fmpz_mul_2exp(res + 1, res + 1, 1);
 
-        _fmpz_vec_scalar_mul_fmpz(rop + len, op + 1, n - len, op + len - 1);
+    for (i = 2; i < FLINT_MIN(n, 2 * len - 3); i++)
+    {
+        start = FLINT_MAX(0, i - len + 1);
+        stop = FLINT_MIN(len - 1, (i + 1) / 2 - 1);
 
-        for (i = 1; i < len - 1; i++)
-            _fmpz_vec_scalar_addmul_fmpz(rop + i + 1,
-                op + 1, FLINT_MIN(i - 1, n - (i + 1)), op + i);
+        _fmpz_vec_dot_general(res + i, NULL, 0, op + start, op + i - stop, 1, stop - start + 1);
+        fmpz_mul_2exp(res + i, res + i, 1);
 
-        for (i = 1; i < FLINT_MIN(2 * len - 2, n); i++)
-            fmpz_mul_ui(rop + i, rop + i, 2);
+        if (i % 2 == 0 && i / 2 < len)
+            fmpz_addmul(res + i, op + i / 2, op + i / 2);
+    }
 
-        for (i = 1; i < FLINT_MIN(len - 1, (n + 1) / 2); i++)
-            fmpz_addmul(rop + 2 * i, op + i, op + i);
+    if (len > 2 && n >= 2 * len - 2)
+    {
+        fmpz_mul(res + 2 * len - 3, op + len - 1, op + len - 2);
+        fmpz_mul_2exp(res + 2 * len - 3, res + 2 * len - 3, 1);
     }
+
+    if (n >= 2 * len - 1)
+        fmpz_mul(res + 2 * len - 2, op + len - 1, op + len - 1);
 }
 
 void
diff --git a/src/fmpz_vec.h b/src/fmpz_vec.h
index 8250567429..4606e43c60 100644
--- a/src/fmpz_vec.h
+++ b/src/fmpz_vec.h
@@ -256,10 +256,16 @@ void _fmpz_vec_lcm(fmpz_t res, const fmpz * vec, slong len);
 
 /*  Dot product  *************************************************************/
 
-void _fmpz_vec_dot(fmpz_t res, const fmpz * vec1, const fmpz * vec2, slong len2);
+void _fmpz_vec_dot_general_naive(fmpz_t res, const fmpz_t initial, int subtract,
+    const fmpz * a, const fmpz * b, int reverse, slong len);
+void _fmpz_vec_dot_general(fmpz_t res, const fmpz_t initial, int subtract,
+    const fmpz * a, const fmpz * b, int reverse, slong len);
 
-void _fmpz_vec_dot_ptr(fmpz_t c, const fmpz * vec1,
-		                 fmpz ** const vec2, slong offset, slong len);
+FMPZ_VEC_INLINE /* res = vec1[0]*vec2[0] + ... + vec1[len2-1]*vec2[len2-1] */
+void _fmpz_vec_dot(fmpz_t res, const fmpz * vec1, const fmpz * vec2, slong len2)
+{
+    _fmpz_vec_dot_general(res, NULL, 0, vec1, vec2, 0, len2); /* no initial value, no subtraction, vec2 not reversed */
+}
 
 #ifdef __cplusplus
 }
diff --git a/src/fmpz_vec/dot.c b/src/fmpz_vec/dot.c
index 5c0e1ca75a..6ddbb23ce7 100644
--- a/src/fmpz_vec/dot.c
+++ b/src/fmpz_vec/dot.c
@@ -1,7 +1,5 @@
 /*
-    Copyright (C) 2010, 2020 William Hart
-    Copyright (C) 2010, 2011 Fredrik Johansson
-    Copyright (C) 2014 Abhinav Baid
+    Copyright (C) 2024 Fredrik Johansson
 
     This file is part of FLINT.
 
@@ -11,26 +9,388 @@
     (at your option) any later version.  See <https://www.gnu.org/licenses/>.
 */
 
+#include "mpn_extras.h"
 #include "fmpz.h"
+#include "fmpz_extras.h"
 #include "fmpz_vec.h"
 
 void
-_fmpz_vec_dot(fmpz_t res, const fmpz * vec1, const fmpz * vec2, slong len2)
+_fmpz_vec_dot_general_naive(fmpz_t res, const fmpz_t initial,
+    int subtract, const fmpz * a, const fmpz * b, int reverse, slong len)
 {
     slong i;
 
-    fmpz_zero(res);
-    for (i = 0; i < len2; i++)
-        fmpz_addmul(res, vec1 + i, vec2 + i);
+    if (initial == NULL)
+    {
+        fmpz_mul(res, a, reverse ? b + len - 1 : b);
+
+        if (subtract)
+        {
+            fmpz_neg(res, res);
+            for (i = 1; i < len; i++)
+                fmpz_submul(res, a + i, reverse ? b + len - 1 - i : b + i);
+        }
+        else
+        {
+            for (i = 1; i < len; i++)
+                fmpz_addmul(res, a + i, reverse ? b + len - 1 - i : b + i);
+        }
+    }
+    else
+    {
+        if (res != initial)
+            fmpz_set(res, initial);
+
+        if (subtract)
+            for(i = 0; i < len; i++)
+                fmpz_submul(res, a + i, reverse ? b + len - 1 - i : b + i);
+        else
+            for(i = 0; i < len; i++)
+                fmpz_addmul(res, a + i, reverse ? b + len - 1 - i : b + i);
+    }
+}
+
+#define INITIAL_ALLOC 64
+
+#define FMPZ_GET_MPN(xptr, xn, xneg, xtmplimb, x) /* view fmpz value x as limbs (xptr, xn) plus sign flag xneg */ \
+    if (!COEFF_IS_MPZ(x)) /* x itself holds the value: use its absolute value as a single limb */ \
+    { \
+        (xtmplimb) = FLINT_ABS(x); \
+        (xptr) = &(xtmplimb); /* xptr aliases the caller-provided scratch limb */ \
+        (xn) = 1; \
+        (xneg) = ((x) < 0); \
+    } \
+    else /* mpz-backed: xptr points at the mpz's own limb array, nothing is copied */ \
+    { \
+        __mpz_struct * __z1 = COEFF_TO_PTR(x); \
+        (xptr) = __z1->_mp_d; \
+        (xn) = FLINT_ABS(__z1->_mp_size); \
+        (xneg) = (__z1->_mp_size < 0); \
+    } /* NOTE: expands to a bare if/else, not do { } while (0); use only as a full statement inside braces */
+
+/* (s,sn) = (a,an) + (b,bn). Allows an == 0 but not bn == 0. s and b must be mp_ptr lvalues (they may be swapped). */
+#define MPN_ADD(s, sn, a, an, b, bn) \
+    do { \
+        if ((an) == 0) /* nothing to add to: take over b's buffer instead of copying it */ \
+        { \
+            FLINT_SWAP(mp_ptr, s, b); \
+            (sn) = (bn); \
+        } \
+        else if ((an) >= (bn)) /* mpn_add is called with the longer operand first */ \
+        { \
+            mp_limb_t __cy; \
+            (s)[(an)] = __cy = mpn_add((s), (a), (an), (b), (bn)); /* carry limb is stored even when zero */ \
+            (sn) = (an) + (__cy != 0); \
+        } \
+        else \
+        { \
+            mp_limb_t __cy; \
+            (s)[(bn)] = __cy = mpn_add((s), (b), (bn), (a), (an)); /* carry limb is stored even when zero */ \
+            (sn) = (bn) + (__cy != 0); \
+        } \
+    } while (0)
+
+/* (s,sn) = (s,sn) + (a,an) * b for one limb b. Allows sn == 0 but not an == 0; s[max(sn,an)] is always written. */
+#define MPN_ADDMUL_1(s, sn, a, an, b) \
+    do { \
+        mp_limb_t __cy; \
+        if ((sn) >= (an)) /* s covers all of a: multiply-add, then ripple the carry through the rest of s */ \
+        { \
+            FLINT_ASSERT((an) != 0); \
+            __cy = mpn_addmul_1((s), (a), (an), (b)); \
+            if ((sn) > (an)) \
+                __cy = mpn_add_1((s) + (an), (s) + (an), (sn) - (an), __cy); \
+            (s)[(sn)] = __cy; \
+            (sn) += (__cy != 0); \
+        } \
+        else /* a is longer: its limbs above sn are multiplied straight into s, only the low sn limbs are accumulated */ \
+        { \
+            (s)[(an)] = mpn_mul_1((s) + (sn), (a) + (sn), (an) - (sn), (b)); \
+            if ((sn) != 0) \
+            { \
+                __cy = mpn_addmul_1((s), (a), (sn), (b)); \
+                (s)[(an)] += mpn_add_1((s) + (sn), (s) + (sn), (an) - (sn), __cy); \
+            } \
+            (sn) = (an) + ((s)[(an)] != 0); \
+        } \
+    } while (0) /* no trailing line continuation: the definition must end on this line */
+
+
+FLINT_STATIC_NOINLINE /* Set res to the magnitude (x, xn), negated when neg is nonzero. */
+void _fmpz_set_mpn(fmpz_t res, mp_srcptr x, mp_size_t xn, int neg)
+{
+    if (xn == 0 || (xn == 1 && x[0] <= COEFF_MAX)) /* small enough to store directly in *res */
+    {
+        _fmpz_demote(res);
+        *res = (xn == 0) ? 0 : (neg ? -(slong) x[0] : x[0]); /* an empty operand is zero; x[0] is not read for it */
+    }
+    else
+    {
+        fmpz_set_mpn_large(res, x, xn, neg);
+    }
+}
 
 void
-_fmpz_vec_dot_ptr(fmpz_t c, const fmpz * vec1, fmpz ** const vec2,
-                                                       slong offset, slong len)
+_fmpz_vec_dot_general(fmpz_t res, const fmpz_t initial, int subtract,
+            const fmpz * a, const fmpz * b, int reverse, slong len)
 {
+    mp_limb_t tmp1[INITIAL_ALLOC + 2];
+    mp_limb_t tmp2[INITIAL_ALLOC + 2];
+    mp_limb_t tmp3[INITIAL_ALLOC + 2];
+    mp_size_t alloc = INITIAL_ALLOC;
+    mp_size_t new_alloc;
+
+    /* We maintain separate sums for small terms, large positive terms,
+       and large negative terms, the idea being to have fewer
+       adjustments in the main loop in exchange for some added
+       complexity combining things in the end. Should profile
+       alternative strategies. */
+    mp_limb_t s0 = 0, s1 = 0, s2 = 0;
+    mp_ptr neg = tmp1;
+    mp_ptr pos = tmp2;
+    mp_size_t posn = 0, negn = 0;
+
+    /* Temporary space for products. */
+    mp_ptr t = tmp3;
+    mp_size_t tn;
+
+    mp_ptr tmp_heap = NULL;
+
     slong i;
 
-    fmpz_zero(c);
+    /* Short path for len <= 1: at most one product is needed, so the limb
+       accumulators below are skipped. res may alias initial. */
+    if (len <= 1)
+    {
+        if (initial == NULL)
+        {
+            if (len == 1)
+            {
+                fmpz_mul(res, a, b);
+                if (subtract)
+                    fmpz_neg(res, res);
+            }
+            else
+                fmpz_zero(res);
+        }
+        else
+        {
+            if (res != initial)
+                fmpz_set(res, initial);
+            /* A single term pairs a[0] with b[0] whether or not reverse is
+               set, so reverse needs no handling here. */
+            if (len == 1)
+            {
+                if (subtract)
+                    fmpz_submul(res, a, b);
+                else
+                    fmpz_addmul(res, a, b);
+            }
+        }
+        return;
+    }
+
+    if (initial != NULL)
+    {
+        fmpz ca;
+        mp_limb_t atmp;
+        mp_srcptr ap;
+        mp_size_t an;
+        int aneg;
+
+        ca = *initial;
+        FMPZ_GET_MPN(ap, an, aneg, atmp, ca);
+
+        if (an <= 2)
+        {
+            s0 = ap[0];
+            if (an == 2)
+                s1 = ap[1];
+
+            if (aneg ^ subtract)
+                sub_dddmmmsss(s2, s1, s0, 0, 0, 0, 0, s1, s0);
+        }
+        else
+        {
+            if (an > INITIAL_ALLOC)
+            {
+                new_alloc = an + 4;
+
+                tmp_heap = flint_malloc(3 * (new_alloc + 2) * sizeof(mp_limb_t));
+
+                t = tmp_heap;
+                pos = t + (new_alloc + 2);
+                neg = pos + (new_alloc + 2);
+
+                alloc = new_alloc;
+            }
+
+            if (aneg ^ subtract)
+            {
+                flint_mpn_copyi(neg, ap, an);
+                negn = an;
+            }
+            else
+            {
+                flint_mpn_copyi(pos, ap, an);
+                posn = an;
+            }
+        }
+    }
+
     for (i = 0; i < len; i++)
-        fmpz_addmul(c, vec1 + i, vec2[i] + offset);
+    {
+        fmpz ca, cb;
+        mp_limb_t atmp, btmp;
+        mp_srcptr ap, bp;
+        mp_size_t an, bn;
+        mp_limb_t cy;
+        int aneg, bneg;
+
+        ca = a[i];
+        if (ca == 0)
+            continue;
+
+        cb = reverse ? b[len - 1 - i] : b[i];
+        if (cb == 0)
+            continue;
+
+        if (!COEFF_IS_MPZ(ca) && !COEFF_IS_MPZ(cb))
+        {
+            mp_limb_t hi, lo;
+            smul_ppmm(hi, lo, ca, cb);
+            add_sssaaaaaa(s2, s1, s0, s2, s1, s0, FLINT_SIGN_EXT(hi), hi, lo);
+            continue;
+        }
+
+        FMPZ_GET_MPN(ap, an, aneg, atmp, ca);
+        FMPZ_GET_MPN(bp, bn, bneg, btmp, cb);
+        tn = an + bn;
+
+        if (tn > alloc)
+        {
+            mp_ptr p1, p2, p3;
+
+            new_alloc = FLINT_MAX(3 * alloc / 2, tn + 4);
+
+            p1 = flint_malloc(3 * (new_alloc + 2) * sizeof(mp_limb_t));
+            p2 = p1 + (new_alloc + 2);
+            p3 = p2 + (new_alloc + 2);
+
+            flint_mpn_copyi(p2, pos, posn);
+            flint_mpn_copyi(p3, neg, negn);
+            t = p1;
+            pos = p2;
+            neg = p3;
+
+            FLINT_SWAP(mp_ptr, tmp_heap, p1);
+
+            if (p1 != NULL)
+                flint_free(p1);
+
+            alloc = new_alloc;
+        }
+
+        if (an < bn)
+        {
+            FLINT_SWAP(mp_srcptr, ap, bp);
+            FLINT_SWAP(mp_size_t, an, bn);
+        }
+
+        if (bn == 1)
+        {
+            mp_limb_t b0 = bp[0];
+
+            if (aneg ^ bneg)
+                MPN_ADDMUL_1(neg, negn, ap, an, b0);
+            else
+                MPN_ADDMUL_1(pos, posn, ap, an, b0);
+            continue;
+        }
+
+        if (ap == bp && an == bn)
+        {
+            flint_mpn_sqr(t, ap, an);
+            cy = t[tn - 1];
+        }
+        else
+        {
+            cy = flint_mpn_mul(t, ap, an, bp, bn);
+        }
+
+        tn -= (cy == 0);
+
+        if (aneg ^ bneg)
+            MPN_ADD(neg, negn, neg, negn, t, tn);
+        else
+            MPN_ADD(pos, posn, pos, posn, t, tn);
+    }
+
+    /* There are only small terms. */
+    if (posn == 0 && negn == 0)
+    {
+        if (subtract)
+            sub_dddmmmsss(s2, s1, s0, 0, 0, 0, s2, s1, s0);
+
+        fmpz_set_signed_uiuiui(res, s2, s1, s0);
+        return;
+    }
+
+    /* Add small terms to large ones. */
+    if (((slong) s2 >= WORD(0)))
+    {
+        t[2] = s2;
+        t[1] = s1;
+        t[0] = s0;
+        MPN_ADD(pos, posn, pos, posn, t, 3);
+    }
+    else
+    {
+        sub_dddmmmsss(t[2], t[1], t[0], 0, 0, 0, s2, s1, s0);
+        MPN_ADD(neg, negn, neg, negn, t, 3);
+    }
+
+    MPN_NORM(pos, posn);
+    MPN_NORM(neg, negn);
+
+    if (negn == 0)
+    {
+        _fmpz_set_mpn(res, pos, posn, 0 ^ subtract);
+    }
+    else if (posn == 0)
+    {
+        _fmpz_set_mpn(res, neg, negn, 1 ^ subtract);
+    }
+    else
+    {
+        /* Do subtraction */
+        int tneg = 0;
+
+        if (posn > negn)
+        {
+            tn = posn;
+        }
+        else if (negn > posn)
+        {
+            tn = negn;
+            tneg = 1;
+        }
+        else if (posn != 0)
+        {
+            tn = posn;
+            if (mpn_cmp(pos, neg, tn) < 0)
+                tneg = 1;
+        }
+
+        if (tneg)
+            mpn_sub(t, neg, negn, pos, posn);
+        else
+            mpn_sub(t, pos, posn, neg, negn);
+
+        MPN_NORM(t, tn);
+        _fmpz_set_mpn(res, t, tn, tneg ^ subtract);
+    }
+
+    if (tmp_heap != NULL)
+        flint_free(tmp_heap);
 }
diff --git a/src/gr/fmpz.c b/src/gr/fmpz.c
index 80ec5dbb1c..d24743ce57 100644
--- a/src/gr/fmpz.c
+++ b/src/gr/fmpz.c
@@ -15,6 +15,7 @@
 #include "fmpz_factor.h"
 #include "fmpz_poly.h"
 #include "fmpz_poly_factor.h"
+#include "fmpz_vec.h"
 #include "fmpz_mat.h"
 #include "fmpq.h"
 #include "gr.h"
@@ -917,74 +918,14 @@ _gr_fmpz_vec_sum(fmpz_t res, const fmpz * vec, slong len, gr_ctx_t ctx)
 int
 _gr_fmpz_vec_dot(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * vec1, const fmpz * vec2, slong len, gr_ctx_t ctx)
 {
-    slong i;
-
-    if (len <= 0)
-    {
-        if (initial == NULL)
-            fmpz_zero(res);
-        else
-            fmpz_set(res, initial);
-        return GR_SUCCESS;
-    }
-
-    if (initial == NULL)
-    {
-        fmpz_mul(res, vec1, vec2);
-    }
-    else
-    {
-        if (subtract)
-            fmpz_neg(res, initial);
-        else
-            fmpz_set(res, initial);
-
-        fmpz_addmul(res, vec1, vec2);
-    }
-
-    for (i = 1; i < len; i++)
-        fmpz_addmul(res, vec1 + i, vec2 + i);
-
-    if (subtract)
-        fmpz_neg(res, res);
-
+    _fmpz_vec_dot_general(res, initial, subtract, vec1, vec2, 0, len);
     return GR_SUCCESS;
 }
 
 int
 _gr_fmpz_vec_dot_rev(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * vec1, const fmpz * vec2, slong len, gr_ctx_t ctx)
 {
-    slong i;
-
-    if (len <= 0)
-    {
-        if (initial == NULL)
-            fmpz_zero(res);
-        else
-            fmpz_set(res, initial);
-        return GR_SUCCESS;
-    }
-
-    if (initial == NULL)
-    {
-        fmpz_mul(res, vec1, vec2 + len - 1);
-    }
-    else
-    {
-        if (subtract)
-            fmpz_neg(res, initial);
-        else
-            fmpz_set(res, initial);
-
-        fmpz_addmul(res, vec1, vec2 + len - 1);
-    }
-
-    for (i = 1; i < len; i++)
-        fmpz_addmul(res, vec1 + i, vec2 + len - 1 - i);
-
-    if (subtract)
-        fmpz_neg(res, res);
-
+    _fmpz_vec_dot_general(res, initial, subtract, vec1, vec2, 1, len);
     return GR_SUCCESS;
 }
 
diff --git a/src/gr/fmpz_mod.c b/src/gr/fmpz_mod.c
index 99101bc6d6..58e3b97e3a 100644
--- a/src/gr/fmpz_mod.c
+++ b/src/gr/fmpz_mod.c
@@ -11,6 +11,7 @@
 
 #include "fmpz.h"
 #include "fmpz_factor.h"
+#include "fmpz_vec.h"
 #include "fmpz_mod.h"
 #include "fmpz_mod_mat.h"
 #include "fmpz_mod_poly.h"
@@ -403,8 +404,6 @@ _gr_fmpz_mod_pow_fmpz(fmpz_t res, const fmpz_t x, const fmpz_t exp, const gr_ctx
 int
 _gr_fmpz_mod_vec_dot(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * vec1, const fmpz * vec2, slong len, gr_ctx_t ctx)
 {
-    slong i;
-
     if (len <= 0)
     {
         if (initial == NULL)
@@ -414,28 +413,8 @@ _gr_fmpz_mod_vec_dot(fmpz_t res, const fmpz_t initial, int subtract, const fmpz
         return GR_SUCCESS;
     }
 
-    if (initial == NULL)
-    {
-        fmpz_mul(res, vec1, vec2);
-    }
-    else
-    {
-        if (subtract)
-            fmpz_neg(res, initial);
-        else
-            fmpz_set(res, initial);
-
-        fmpz_addmul(res, vec1, vec2);
-    }
-
-    for (i = 1; i < len; i++)
-        fmpz_addmul(res, vec1 + i, vec2 + i);
-
-    if (subtract)
-        fmpz_neg(res, res);
-
+    _fmpz_vec_dot_general(res, initial, subtract, vec1, vec2, 0, len);
     fmpz_mod_set_fmpz(res, res, FMPZ_MOD_CTX(ctx));
-
     return GR_SUCCESS;
 }
 
@@ -443,8 +422,6 @@ _gr_fmpz_mod_vec_dot(fmpz_t res, const fmpz_t initial, int subtract, const fmpz
 int
 _gr_fmpz_mod_vec_dot_rev(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * vec1, const fmpz * vec2, slong len, gr_ctx_t ctx)
 {
-    slong i;
-
     if (len <= 0)
     {
         if (initial == NULL)
@@ -454,28 +431,8 @@ _gr_fmpz_mod_vec_dot_rev(fmpz_t res, const fmpz_t initial, int subtract, const f
         return GR_SUCCESS;
     }
 
-    if (initial == NULL)
-    {
-        fmpz_mul(res, vec1, vec2 + len - 1);
-    }
-    else
-    {
-        if (subtract)
-            fmpz_neg(res, initial);
-        else
-            fmpz_set(res, initial);
-
-        fmpz_addmul(res, vec1, vec2 + len - 1);
-    }
-
-    for (i = 1; i < len; i++)
-        fmpz_addmul(res, vec1 + i, vec2 + len - 1 - i);
-
-    if (subtract)
-        fmpz_neg(res, res);
-
+    _fmpz_vec_dot_general(res, initial, subtract, vec1, vec2, 1, len);
     fmpz_mod_set_fmpz(res, res, FMPZ_MOD_CTX(ctx));
-
     return GR_SUCCESS;
 }
 

From ad0a17061adefbaaac0635961010547ccb4e7e90 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Wed, 24 Jan 2024 23:53:21 +0100
Subject: [PATCH 3/8] test code

---
 src/fmpz_vec/dot.c                |  6 +++
 src/fmpz_vec/test/main.c          |  2 +
 src/fmpz_vec/test/t-dot_general.c | 82 +++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+)
 create mode 100644 src/fmpz_vec/test/t-dot_general.c

diff --git a/src/fmpz_vec/dot.c b/src/fmpz_vec/dot.c
index 6ddbb23ce7..3c1f86dbfc 100644
--- a/src/fmpz_vec/dot.c
+++ b/src/fmpz_vec/dot.c
@@ -22,6 +22,12 @@ _fmpz_vec_dot_general_naive(fmpz_t res, const fmpz_t initial,
 
     if (initial == NULL)
     {
+        if (len == 0)
+        {
+            fmpz_zero(res);
+            return;
+        }
+
         fmpz_mul(res, a, reverse ? b + len - 1 : b);
 
         if (subtract)
diff --git a/src/fmpz_vec/test/main.c b/src/fmpz_vec/test/main.c
index 9a4681abc3..3fb5d04c70 100644
--- a/src/fmpz_vec/test/main.c
+++ b/src/fmpz_vec/test/main.c
@@ -17,6 +17,7 @@
 #include "t-add.c"
 #include "t-content.c"
 #include "t-dot.c"
+#include "t-dot_general.c"
 #include "t-get_d_vec_2exp.c"
 #include "t-get_set_fft.c"
 #include "t-get_set_nmod_vec.c"
@@ -61,6 +62,7 @@ test_struct tests[] =
     TEST_FUNCTION(fmpz_vec_add),
     TEST_FUNCTION(fmpz_vec_content),
     TEST_FUNCTION(fmpz_vec_dot),
+    TEST_FUNCTION(fmpz_vec_dot_general),
     TEST_FUNCTION(fmpz_vec_get_d_vec_2exp),
     TEST_FUNCTION(fmpz_vec_get_set_fft),
     TEST_FUNCTION(fmpz_vec_get_set_nmod_vec),
diff --git a/src/fmpz_vec/test/t-dot_general.c b/src/fmpz_vec/test/t-dot_general.c
new file mode 100644
index 0000000000..d78baef966
--- /dev/null
+++ b/src/fmpz_vec/test/t-dot_general.c
@@ -0,0 +1,82 @@
+/*
+    Copyright (C) 2024 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "test_helpers.h"
+#include "fmpz.h"
+#include "fmpz_vec.h"
+
+TEST_FUNCTION_START(fmpz_vec_dot_general, state)
+{
+    int iter;
+
+    for (iter = 0; iter < 100000 * flint_test_multiplier(); iter++)
+    {
+        fmpz * a, * b;
+        fmpz_t s, t, c;
+        slong n, bits1, bits2;
+        int alias, negate, initial, reverse;
+
+        initial = n_randint(state, 2);
+        alias = n_randint(state, 2);
+        negate = n_randint(state, 2);
+        reverse = n_randint(state, 2);
+        n = n_randint(state, 8);  /* vector length; presumably 0 <= n < 8, so the empty case is covered (confirm n_randint's range) */
+
+        if (n_randint(state, 30) == 0)  /* occasionally switch to much larger operands for a and c */
+            bits1 = 2 + n_randint(state, 20000);
+        else
+            bits1 = 2 + n_randint(state, 1000);
+
+        bits2 = 2 + n_randint(state, 1000);
+
+        a = _fmpz_vec_init(n);
+        b = _fmpz_vec_init(n);
+        fmpz_init(s);
+        fmpz_init(t);
+        fmpz_init(c);
+
+        fmpz_randtest(c, state, bits1);
+        _fmpz_vec_randtest(a, state, n, bits1);
+        _fmpz_vec_randtest(b, state, n, bits2);
+
+        if (initial && alias)  /* output variable doubles as the initial value */
+        {
+            fmpz_set(s, c);
+            fmpz_set(t, c);
+            _fmpz_vec_dot_general(s, s, negate, a, b, reverse, n);
+            _fmpz_vec_dot_general_naive(t, t, negate, a, b, reverse, n);
+        }
+        else  /* separate output; the initial value is c, or NULL when initial == 0 */
+        {
+            _fmpz_vec_dot_general(s, initial ? c : NULL, negate, a, b, reverse, n);
+            _fmpz_vec_dot_general_naive(t, initial ? c : NULL, negate, a, b, reverse, n);
+        }
+
+        if (!fmpz_equal(s, t) || !_fmpz_is_canonical(s))  /* must agree with the naive reference and be canonical */
+        {
+            flint_printf("negate = %d, initial = %d, reverse = %d, alias = %d\n", negate, initial, reverse, alias);
+            flint_printf("c = %{fmpz}\n\n", c);
+            flint_printf("a = %{fmpz*}\n\n", a, n);
+            flint_printf("b = %{fmpz*}\n\n", b, n);
+            flint_printf("s = %{fmpz}\n\n", s);
+            flint_printf("t = %{fmpz}\n\n", t);
+            flint_abort();
+        }
+
+        fmpz_clear(s);
+        fmpz_clear(t);
+        fmpz_clear(c);
+        _fmpz_vec_clear(a, n);
+        _fmpz_vec_clear(b, n);
+    }
+
+    TEST_FUNCTION_END(state);
+}

From 47d9f1d88c2310871b0deede7ef13dd18a2f1928 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 25 Jan 2024 11:48:20 +0100
Subject: [PATCH 4/8] use fmpz dot products in more cases

---
 src/arith/stirling1.c                       |  9 +--
 src/fmpq_poly/power_sums.c                  | 22 ++----
 src/fmpq_poly/power_sums_to_poly.c          | 13 ++--
 src/fmpq_poly/revert_series_lagrange_fast.c | 13 +---
 src/fmpz_mat/charpoly.c                     | 77 ++-------------------
 src/fmpz_poly/2norm.c                       |  6 +-
 src/fmpz_poly/div_series_basecase.c         |  7 +-
 src/fmpz_poly/gcd_modular.c                 | 11 ++-
 src/fmpz_poly/power_sums_naive.c            | 18 ++---
 src/fmpz_poly/power_sums_to_poly.c          | 17 ++---
 src/fmpz_poly/resultant_modular.c           |  6 +-
 src/nmod_poly/power_sums.c                  |  1 +
 src/nmod_poly/power_sums_to_poly.c          |  1 +
 13 files changed, 53 insertions(+), 148 deletions(-)

diff --git a/src/arith/stirling1.c b/src/arith/stirling1.c
index c389724fda..4d9058bbca 100644
--- a/src/arith/stirling1.c
+++ b/src/arith/stirling1.c
@@ -16,18 +16,15 @@
 #include "arith.h"
 
 /* compute single coefficient in polynomial product */
-static void
+FLINT_FORCE_INLINE void
 _fmpz_poly_mulmid_single(fmpz_t res, const fmpz * poly1, slong len1, const fmpz * poly2, slong len2, slong i)
 {
-    slong j, top1, top2;
+    slong top1, top2;  /* largest index into poly1 / poly2 that can contribute to coefficient i */
 
     top1 = FLINT_MIN(len1 - 1, i);
     top2 = FLINT_MIN(len2 - 1, i);
 
-    fmpz_mul(res, poly1 + i - top2, poly2 + top2);
-
-    for (j = 1; j < top1 + top2 - i + 1; j++)
-        fmpz_addmul(res, poly1 + i - top2 + j, poly2 + top2 - j);
+    _fmpz_vec_dot_general(res, NULL, 0, poly1 + i - top2, poly2 + i - top1, 1, top1 + top2 - i + 1);  /* reversed second operand: term j is poly1[i - top2 + j] * poly2[top2 - j] */
 }
 
 #define MAX_BASECASE 16
diff --git a/src/fmpq_poly/power_sums.c b/src/fmpq_poly/power_sums.c
index 2eac351e65..d808656b76 100644
--- a/src/fmpq_poly/power_sums.c
+++ b/src/fmpq_poly/power_sums.c
@@ -9,6 +9,7 @@
     (at your option) any later version.  See <https://www.gnu.org/licenses/>.
 */
 
+#include "fmpz_vec.h"
 #include "fmpz_poly.h"
 #include "fmpq.h"
 #include "fmpq_poly.h"
@@ -61,29 +62,20 @@ _fmpq_poly_power_sums(fmpz * res, fmpz_t rden, const fmpz * poly, slong len,
 
         for (k = 1; k < FLINT_MIN(n, len); k++)
         {
-            fmpz_mul_ui(res + k, poly + len - 1 - k, k);
+            fmpz_mul_si(res + k, poly + len - 1 - k, -k);
             fmpz_mul(res + k, res + k, rden);
-
-            for (i = 1; i < k - 1; i++)
-                fmpz_mul(res + i, res + i, poly + len - 1);
-            for (i = 1; i < k; i++)
-                fmpz_addmul(res + k, poly + len - 1 - k + i, res + i);
-            fmpz_neg(res + k, res + k);
+            _fmpz_vec_scalar_mul_fmpz(res + 1, res + 1, k - 2, poly + len - 1);
+            _fmpz_vec_dot_general(res + k, res + k, 1, poly + len - 1 - k + 1, res + 1, 0, k - 1);
             fmpz_mul(rden, rden, poly + len - 1);
         }
 
         for (k = len; k < n; k++)
         {
-            fmpz_zero(res + k);
-            for (i = k - len + 1; i < k - 1; i++)
-                fmpz_mul(res + i, res + i, poly + len - 1);
-            for (i = k - len + 1; i < k; i++)
-                fmpz_addmul(res + k, poly + len - 1 - k + i, res + i);
-            fmpz_neg(res + k, res + k);
+            _fmpz_vec_scalar_mul_fmpz(res + k - len + 1, res + k - len + 1, len - 2, poly + len - 1);
+            _fmpz_vec_dot_general(res + k, NULL, 1, poly, res + k - len + 1, 0, len - 1);
         }
 
-        for (i = n - len + 1; i < n - 1; i++)
-            fmpz_mul(res + i, res + i, poly + len - 1);
+        _fmpz_vec_scalar_mul_fmpz(res + n - len + 1, res + n - len + 1, len - 2, poly + len - 1);
         fmpz_one(rden);
 
         for (i = n - len; i > 0; i--)
diff --git a/src/fmpq_poly/power_sums_to_poly.c b/src/fmpq_poly/power_sums_to_poly.c
index dbea185cd9..b9f82a0dda 100644
--- a/src/fmpq_poly/power_sums_to_poly.c
+++ b/src/fmpq_poly/power_sums_to_poly.c
@@ -11,6 +11,7 @@
 
 #include "ulong_extras.h"
 #include "fmpz.h"
+#include "fmpz_vec.h"
 #include "fmpz_poly.h"
 #include "fmpq_poly.h"
 
@@ -30,12 +31,16 @@ _fmpq_poly_power_sums_to_poly(fmpz * res, const fmpz * poly, const fmpz_t den,
     fmpz_one(f);
     for (k = 1; k <= d; k++)
     {
-        if(k < len)
+        if (k < len)
+        {
 			fmpz_mul(res + d - k, poly + k, f);
+            _fmpz_vec_dot_general(res + d - k, res + d - k, 0, res + d - k + 1, poly + 1, 0, k - 1);
+
+        }
 		else
-			fmpz_zero(res + d - k);
-        for (i = 1; i < FLINT_MIN(k, len); i++)
-            fmpz_addmul(res + d - k, res + d - k + i, poly + i);
+        {
+            _fmpz_vec_dot_general(res + d - k, NULL, 0, res + d - k + 1, poly + 1, 0, len - 1);
+        }
 
         a = n_gcd(FLINT_ABS(fmpz_fdiv_ui(res + d - k, k)), k);
         fmpz_divexact_ui(res + d - k, res + d - k, a);
diff --git a/src/fmpq_poly/revert_series_lagrange_fast.c b/src/fmpq_poly/revert_series_lagrange_fast.c
index 5117f1cef7..d78af92bea 100644
--- a/src/fmpq_poly/revert_series_lagrange_fast.c
+++ b/src/fmpq_poly/revert_series_lagrange_fast.c
@@ -44,9 +44,9 @@ void
 _fmpq_poly_revert_series_lagrange_fast(fmpz * Qinv, fmpz_t den,
                         const fmpz * Q, const fmpz_t Qden, slong Qlen, slong n)
 {
-    slong i, j, k, m;
+    slong i, j, m;
     fmpz *R, *Rden, *S, *T, *dens, *tmp;
-    fmpz_t Sden, Tden, t;
+    fmpz_t Sden, Tden;
 
     if (Qlen <= 2)
     {
@@ -65,7 +65,6 @@ _fmpq_poly_revert_series_lagrange_fast(fmpz * Qinv, fmpz_t den,
 
     m = n_sqrt(n);
 
-    fmpz_init(t);
     dens = _fmpz_vec_init(n);
     R = _fmpz_vec_init((n - 1) * m);
     S = _fmpz_vec_init(n - 1);
@@ -103,12 +102,7 @@ _fmpq_poly_revert_series_lagrange_fast(fmpz * Qinv, fmpz_t den,
 
         for (j = 1; j < m && i + j < n; j++)
         {
-            fmpz_mul(t, S + 0, Ri(j) + i + j - 1);
-
-            for (k = 1; k <= i + j - 1; k++)
-                fmpz_addmul(t, S + k, Ri(j) + i + j - 1 - k);
-
-            fmpz_set(Qinv + i + j, t);
+            _fmpz_vec_dot_general(Qinv + i + j, NULL, 0, S, Ri(j), 1, i + j);
             fmpz_mul(dens + i + j, Sden, Rdeni(j));
             fmpz_mul_ui(dens + i + j, dens + i + j, i + j);
         }
@@ -126,7 +120,6 @@ _fmpq_poly_revert_series_lagrange_fast(fmpz * Qinv, fmpz_t den,
     _set_vec(Qinv, den, Qinv, dens, n);
     _fmpq_poly_canonicalise(Qinv, den, n);
 
-    fmpz_clear(t);
     _fmpz_vec_clear(dens, n);
     _fmpz_vec_clear(R, (n - 1) * m);
     _fmpz_vec_clear(S, n - 1);
diff --git a/src/fmpz_mat/charpoly.c b/src/fmpz_mat/charpoly.c
index 66c3664b4e..c43d5763bc 100644
--- a/src/fmpz_mat/charpoly.c
+++ b/src/fmpz_mat/charpoly.c
@@ -17,6 +17,8 @@
 #include "fmpz_vec.h"
 #include "fmpz_mat.h"
 #include "fmpz_poly.h"
+#include "gr.h"
+#include "gr_mat.h"
 
 /*
     Assumes that \code{mat} is an $n \times n$ matrix and sets \code{(cp,n+1)}
@@ -27,78 +29,9 @@
 
 void _fmpz_mat_charpoly_berkowitz(fmpz *cp, const fmpz_mat_t mat)
 {
-    const slong n = mat->r;
-
-    if (n == 0)
-    {
-        fmpz_one(cp);
-    }
-    else if (n == 1)
-    {
-        fmpz_neg(cp + 0, fmpz_mat_entry(mat, 0, 0));
-        fmpz_one(cp + 1);
-    }
-    else
-    {
-        slong i, j, k, t;
-        fmpz *a, *A, *s;
-
-        a = _fmpz_vec_init(n * n);
-        A = a + (n - 1) * n;
-
-        _fmpz_vec_zero(cp, n + 1);
-        fmpz_neg(cp + 0, fmpz_mat_entry(mat, 0, 0));
-
-        for (t = 1; t < n; t++)
-        {
-            for (i = 0; i <= t; i++)
-            {
-                fmpz_set(a + 0 * n + i, fmpz_mat_entry(mat, i, t));
-            }
-
-            fmpz_set(A + 0, fmpz_mat_entry(mat, t, t));
-
-            for (k = 1; k < t; k++)
-            {
-                for (i = 0; i <= t; i++)
-                {
-                    s = a + k * n + i;
-                    fmpz_zero(s);
-                    for (j = 0; j <= t; j++)
-                    {
-                        fmpz_addmul(s, fmpz_mat_entry(mat, i, j), a + (k - 1) * n + j);
-                    }
-                }
-                fmpz_set(A + k, a + k * n + t);
-            }
-
-            fmpz_zero(A + t);
-            for (j = 0; j <= t; j++)
-            {
-                fmpz_addmul(A + t, fmpz_mat_entry(mat, t, j), a + (t - 1) * n + j);
-            }
-
-            for (k = 0; k <= t; k++)
-            {
-                for (j = 0; j < k; j++)
-                {
-                    fmpz_submul(cp + k, A + j, cp + (k - j - 1));
-                }
-                fmpz_sub(cp + k, cp + k, A + k);
-            }
-        }
-
-        /* Shift all coefficients up by one */
-        for (i = n; i > 0; i--)
-        {
-            fmpz_swap(cp + i, cp + (i - 1));
-        }
-        fmpz_one(cp + 0);
-
-        _fmpz_poly_reverse(cp, cp, n + 1, n + 1);
-
-        _fmpz_vec_clear(a, n * n);
-    }
+    gr_ctx_t ctx;  /* the hand-written Berkowitz loop is replaced by the generic gr_mat routine called below */
+    gr_ctx_init_fmpz(ctx);  /* NOTE(review): ctx is never passed to gr_ctx_clear here; confirm the fmpz context holds nothing to free */
+    GR_MUST_SUCCEED(_gr_mat_charpoly_berkowitz(cp, (const gr_mat_struct *) mat, ctx));  /* NOTE(review): the cast assumes fmpz_mat_t and gr_mat_struct share a layout; confirm */
 }
 
 void fmpz_mat_charpoly_berkowitz(fmpz_poly_t cp, const fmpz_mat_t mat)
diff --git a/src/fmpz_poly/2norm.c b/src/fmpz_poly/2norm.c
index 2ec047c296..e0fc9dabe7 100644
--- a/src/fmpz_poly/2norm.c
+++ b/src/fmpz_poly/2norm.c
@@ -10,15 +10,13 @@
 */
 
 #include "fmpz.h"
+#include "fmpz_vec.h"
 #include "fmpz_poly.h"
 
 void
 _fmpz_poly_2norm(fmpz_t res, const fmpz * poly, slong len)
 {
-    slong i;
-    fmpz_zero(res);
-    for (i = 0; i < len; i++)
-        fmpz_addmul(res, poly + i, poly + i);
+    _fmpz_vec_dot(res, poly, poly, len);  /* replaces the explicit loop summing poly[i]^2 for 0 <= i < len */
     fmpz_sqrt(res, res);
 }
 
diff --git a/src/fmpz_poly/div_series_basecase.c b/src/fmpz_poly/div_series_basecase.c
index b412f0ba88..479852d459 100644
--- a/src/fmpz_poly/div_series_basecase.c
+++ b/src/fmpz_poly/div_series_basecase.c
@@ -180,10 +180,9 @@ _fmpz_poly_div_series_basecase(fmpz * Q, const fmpz * A, slong Alen,
             }
             else
             {
-                fmpz_mul(Q + i, B + 1, Q + i - 1);
-
-                for (j = 2; j < FLINT_MIN(i + 1, Blen); j++)
-                    fmpz_addmul(Q + i, B + j, Q + i - j);
+                slong l = FLINT_MIN(i, Blen - 1);
+                /* todo: merge final subtraction */
+                _fmpz_vec_dot_general(Q + i, NULL, 0, B + 1, Q + i - l, 1, l);
             }
 
             if (i < Alen)
diff --git a/src/fmpz_poly/gcd_modular.c b/src/fmpz_poly/gcd_modular.c
index fc6390e33f..c6dfb47d95 100644
--- a/src/fmpz_poly/gcd_modular.c
+++ b/src/fmpz_poly/gcd_modular.c
@@ -67,15 +67,12 @@ void _fmpz_poly_gcd_modular(fmpz * res, const fmpz * poly1, slong len1,
 
     if (len1 < 64 && len2 < 64) /* compute the squares of the 2-norms */
     {
-        fmpz_set_ui(l, 0);
-        for (i = 0; i < len1; i++)
-            fmpz_addmul(l, A + i, A + i);
+        _fmpz_vec_dot(l, A, A, len1);
         nb1 = fmpz_bits(l);
-        fmpz_set_ui(l, 0);
-        for (i = 0; i < len2; i++)
-            fmpz_addmul(l, B + i, B + i);
+        _fmpz_vec_dot(l, B, B, len2);
         nb2 = fmpz_bits(l);
-    } else /* approximate to save time */
+    }
+    else /* approximate to save time */
     {
         nb1 = 2*bits1 + FLINT_BIT_COUNT(len1);
         nb2 = 2*bits2 + FLINT_BIT_COUNT(len2);
diff --git a/src/fmpz_poly/power_sums_naive.c b/src/fmpz_poly/power_sums_naive.c
index 16a6c5ccaa..a1fee2d1e1 100644
--- a/src/fmpz_poly/power_sums_naive.c
+++ b/src/fmpz_poly/power_sums_naive.c
@@ -10,28 +10,24 @@
 */
 
 #include "fmpz.h"
+#include "fmpz_vec.h"
 #include "fmpz_poly.h"
 
 void
 _fmpz_poly_power_sums_naive(fmpz * res, const fmpz * poly, slong len, slong n)
 {
-    slong i, k;
+    slong k;
 
     fmpz_set_ui(res, len - 1);
+
     for (k = 1; k < FLINT_MIN(n, len); k++)
     {
-        fmpz_mul_ui(res + k, poly + len - 1 - k, k);
-        for (i = 1; i < k; i++)
-            fmpz_addmul(res + k, poly + len - 1 - k + i, res + i);
-        fmpz_neg(res + k, res + k);
+        fmpz_mul_si(res + k, poly + len - 1 - k, -k);  /* seed with -k * poly[len-1-k]; the negation formerly applied at the end is folded in here */
+        _fmpz_vec_dot_general(res + k, res + k, 1, poly + len - 1 - k + 1, res + 1, 0, k - 1);  /* subtracting form: res[k] -= sum_{i=1}^{k-1} poly[len-1-k+i] * res[i] */
     }
+
     for (k = len; k < n; k++)
-    {
-        fmpz_zero(res + k);
-        for (i = k - len + 1; i < k; i++)
-            fmpz_addmul(res + k, poly + len - 1 - k + i, res + i);
-        fmpz_neg(res + k, res + k);
-    }
+        _fmpz_vec_dot_general(res + k, NULL, 1, poly, res + k - len + 1, 0, len - 1);  /* no initial term: res[k] = -sum_{i=k-len+1}^{k-1} poly[len-1-k+i] * res[i] */
 }
 
 void
diff --git a/src/fmpz_poly/power_sums_to_poly.c b/src/fmpz_poly/power_sums_to_poly.c
index df00b67ea4..b1cfeb17a0 100644
--- a/src/fmpz_poly/power_sums_to_poly.c
+++ b/src/fmpz_poly/power_sums_to_poly.c
@@ -10,30 +10,25 @@
 */
 
 #include "fmpz.h"
+#include "fmpz_vec.h"
 #include "fmpz_poly.h"
 
 void
 _fmpz_poly_power_sums_to_poly(fmpz * res, const fmpz * poly, slong len)
 {
-    slong i, k;
+    slong k;
     slong d = fmpz_get_ui(poly);
 
     fmpz_one(res + d);
     for (k = 1; k < FLINT_MIN(d + 1, len); k++)
     {
-        fmpz_set(res + d - k, poly + k);
-        for (i = 1; i < k; i++)
-            fmpz_addmul(res + d - k, res + d - k + i, poly + i);
-        fmpz_divexact_si(res + d - k, res + d - k, k);
-        fmpz_neg(res + d - k, res + d - k);
+        _fmpz_vec_dot_general(res + d - k, poly + k, 0, res + d - k + 1, poly + 1, 0, k - 1);  /* poly[k] + sum_{i=1}^{k-1} res[d-k+i] * poly[i] */
+        fmpz_divexact_si(res + d - k, res + d - k, -k);  /* exact division by -k merges the former division by k and negation */
     }
     for (k = len; k <= d; k++)
     {
-        fmpz_zero(res + d - k);
-        for (i = 1; i < len; i++)
-            fmpz_addmul(res + d - k, res + d - k + i, poly + i);
-        fmpz_divexact_si(res + d - k, res + d - k, k);
-        fmpz_neg(res + d - k, res + d - k);
+        _fmpz_vec_dot_general(res + d - k, NULL, 0, res + d - k + 1, poly + 1, 0, len - 1);  /* sum_{i=1}^{len-1} res[d-k+i] * poly[i] */
+        fmpz_divexact_si(res + d - k, res + d - k, -k);  /* as above: divide by k and negate in one step */
     }
 }
 
diff --git a/src/fmpz_poly/resultant_modular.c b/src/fmpz_poly/resultant_modular.c
index 3ed5bf4f6c..8ea6c4c6e9 100644
--- a/src/fmpz_poly/resultant_modular.c
+++ b/src/fmpz_poly/resultant_modular.c
@@ -68,10 +68,8 @@ void _fmpz_poly_resultant_modular(fmpz_t res, const fmpz * poly1, slong len1,
         fmpz_init(b1);
         fmpz_init(b2);
 
-        for (i = 0; i < len1; i++)
-            fmpz_addmul(b1, A + i, A + i);
-        for (i = 0; i < len2; i++)
-            fmpz_addmul(b2, B + i, B + i);
+        _fmpz_vec_dot(b1, A, A, len1);
+        _fmpz_vec_dot(b2, B, B, len2);
 
         fmpz_pow_ui(b1, b1, len2 - 1);
         fmpz_pow_ui(b2, b2, len1 - 1);
diff --git a/src/nmod_poly/power_sums.c b/src/nmod_poly/power_sums.c
index af187d0e2d..78625f504d 100644
--- a/src/nmod_poly/power_sums.c
+++ b/src/nmod_poly/power_sums.c
@@ -78,6 +78,7 @@ nmod_poly_power_sums(nmod_poly_t res, const nmod_poly_t poly, slong n)
     }
 }
 
+/* todo: should use dot products */
 void
 _nmod_poly_power_sums_naive(mp_ptr res, mp_srcptr poly, slong len, slong n,
                             nmod_t mod)
diff --git a/src/nmod_poly/power_sums_to_poly.c b/src/nmod_poly/power_sums_to_poly.c
index ab351b89b3..1281b55dec 100644
--- a/src/nmod_poly/power_sums_to_poly.c
+++ b/src/nmod_poly/power_sums_to_poly.c
@@ -56,6 +56,7 @@ nmod_poly_power_sums_to_poly(nmod_poly_t res, const nmod_poly_t Q)
     }
 }
 
+/* todo: should use dot products */
 void
 _nmod_poly_power_sums_to_poly_naive(mp_ptr res, mp_srcptr poly, slong len,
                                     nmod_t mod)

From 825c3601d6cd5f7e3f25fe3065d346d67f7179c6 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 25 Jan 2024 11:57:16 +0100
Subject: [PATCH 5/8] deprecate fmpz_mat_mul_classical_inline

---
 src/fmpz_mat.h                      |   4 +-
 src/fmpz_mat/mul_classical_inline.c | 118 ----------------------------
 2 files changed, 2 insertions(+), 120 deletions(-)
 delete mode 100644 src/fmpz_mat/mul_classical_inline.c

diff --git a/src/fmpz_mat.h b/src/fmpz_mat.h
index e379e74c55..64414ddb1c 100644
--- a/src/fmpz_mat.h
+++ b/src/fmpz_mat.h
@@ -174,8 +174,8 @@ void fmpz_mat_mul_classical(fmpz_mat_t C, const fmpz_mat_t A,
 
 void fmpz_mat_mul_strassen(fmpz_mat_t C, const fmpz_mat_t A, const fmpz_mat_t B);
 
-void fmpz_mat_mul_classical_inline(fmpz_mat_t C, const fmpz_mat_t A,
-    const fmpz_mat_t B);
+/* Deprecated */
+#define fmpz_mat_mul_classical_inline fmpz_mat_mul_classical
 
 void _fmpz_mat_mul_fft(fmpz_mat_t C,
                                     const fmpz_mat_t A, slong abits,
diff --git a/src/fmpz_mat/mul_classical_inline.c b/src/fmpz_mat/mul_classical_inline.c
deleted file mode 100644
index 1626b54d8a..0000000000
--- a/src/fmpz_mat/mul_classical_inline.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
-    Copyright (C) 2011 Fredrik Johansson
-
-    This file is part of FLINT.
-
-    FLINT is free software: you can redistribute it and/or modify it under
-    the terms of the GNU Lesser General Public License (LGPL) as published
-    by the Free Software Foundation; either version 2.1 of the License, or
-    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
-*/
-
-#include "gmpcompat.h"
-#include "fmpz.h"
-#include "fmpz_mat.h"
-
-void
-fmpz_mat_mul_classical_inline(fmpz_mat_t C, const fmpz_mat_t A,
-                                                const fmpz_mat_t B)
-{
-    slong ar, bc, br;
-    slong i, j, k;
-
-    fmpz a, b;
-    mpz_t t;
-
-    mp_limb_t au, bu;
-    mp_limb_t pos[3];
-    mp_limb_t neg[3];
-
-    ar = A->r;
-    br = B->r;
-    bc = B->c;
-
-    mpz_init(t);
-
-    for (i = 0; i < ar; i++)
-    {
-        for (j = 0; j < bc; j++)
-        {
-            flint_mpz_set_ui(t, UWORD(0));
-
-            pos[2] = pos[1] = pos[0] = neg[2] = neg[1] = neg[0] = UWORD(0);
-
-            for (k = 0; k < br; k++)
-            {
-                a = A->rows[i][k];
-                b = B->rows[k][j];
-
-                if (a == 0 || b == 0)
-                    continue;
-
-                if (!COEFF_IS_MPZ(a))   /* a is small */
-                {
-                    if (!COEFF_IS_MPZ(b))  /* both are small */
-                    {
-                        au = FLINT_ABS(a);
-                        bu = FLINT_ABS(b);
-
-                        umul_ppmm(au, bu, au, bu);
-
-                        if ((a ^ b) >= WORD(0))
-                            add_sssaaaaaa(pos[2], pos[1], pos[0],
-                                          pos[2], pos[1], pos[0], 0, au, bu);
-                        else
-                            add_sssaaaaaa(neg[2], neg[1], neg[0],
-                                          neg[2], neg[1], neg[0], 0, au, bu);
-                    }
-                    else
-                    {
-                        if (a >= 0)
-                            flint_mpz_addmul_ui(t, COEFF_TO_PTR(b), a);
-                        else
-                            flint_mpz_submul_ui(t, COEFF_TO_PTR(b), -a);
-                    }
-                }
-                else if (!COEFF_IS_MPZ(b))  /* b is small */
-                {
-                    if (b >= 0)
-                        flint_mpz_addmul_ui(t, COEFF_TO_PTR(a), b);
-                    else
-                        flint_mpz_submul_ui(t, COEFF_TO_PTR(a), -b);
-                }
-                else
-                {
-                    mpz_addmul(t, COEFF_TO_PTR(a), COEFF_TO_PTR(b));
-                }
-            }
-
-            if (mpz_sgn(t) != 0 || pos[2] || neg[2] || pos[1] || neg[1])
-            {
-                __mpz_struct r;
-
-                r._mp_size = pos[2] ? 3 : (pos[1] ? 2 : pos[0] != 0);
-                r._mp_alloc = r._mp_size;
-                r._mp_d = pos;
-
-                mpz_add(t, t, &r);
-
-                r._mp_size = neg[2] ? 3 : (neg[1] ? 2 : neg[0] != 0);
-                r._mp_alloc = r._mp_size;
-                r._mp_d = neg;
-
-                mpz_sub(t, t, &r);
-
-                fmpz_set_mpz(fmpz_mat_entry(C, i, j), t);
-            }
-            else
-            {
-                if (neg[0] > pos[0])
-                    fmpz_neg_ui(fmpz_mat_entry(C, i, j), neg[0] - pos[0]);
-                else
-                    fmpz_set_ui(fmpz_mat_entry(C, i, j), pos[0] - neg[0]);
-            }
-        }
-    }
-
-    mpz_clear(t);
-}

From 048a7b6c77c1db906b88c7223ca170f7627af1bd Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 25 Jan 2024 13:20:48 +0100
Subject: [PATCH 6/8] properly deprecate fmpz_mat_mul_classical_inline

---
 src/fmpz_mat.h                           |  3 +--
 src/fmpz_mat/profile/p-mul.c             | 17 ++++-------------
 src/fmpz_mat/profile/p-mul_double_word.c |  2 +-
 src/fmpz_mat/profile/p-mul_multi_mod.c   |  2 +-
 src/fmpz_mat/test/t-mul.c                |  6 +++---
 src/fmpz_mat/test/t-mul_blas.c           |  4 ++--
 src/fmpz_mat/test/t-mul_classical.c      |  9 ++++++++-
 src/fmpz_mat/test/t-mul_double_word.c    |  2 +-
 src/fmpz_mat/test/t-mul_fft.c            |  2 +-
 src/fmpz_mat/test/t-mul_multi_mod.c      |  4 ++--
 src/fmpz_mat/test/t-mul_small.c          |  2 +-
 11 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/src/fmpz_mat.h b/src/fmpz_mat.h
index 64414ddb1c..f7009d2c92 100644
--- a/src/fmpz_mat.h
+++ b/src/fmpz_mat.h
@@ -174,8 +174,7 @@ void fmpz_mat_mul_classical(fmpz_mat_t C, const fmpz_mat_t A,
 
 void fmpz_mat_mul_strassen(fmpz_mat_t C, const fmpz_mat_t A, const fmpz_mat_t B);
 
-/* Deprecated */
-#define fmpz_mat_mul_classical_inline fmpz_mat_mul_classical
+#define fmpz_mat_mul_classical_inline _Pragma("GCC error \"'fmpz_mat_mul_classical_inline' is deprecated. Use 'fmpz_mat_mul_classical' instead.\"")
 
 void _fmpz_mat_mul_fft(fmpz_mat_t C,
                                     const fmpz_mat_t A, slong abits,
diff --git a/src/fmpz_mat/profile/p-mul.c b/src/fmpz_mat/profile/p-mul.c
index 3c1240e302..e999601a37 100644
--- a/src/fmpz_mat/profile/p-mul.c
+++ b/src/fmpz_mat/profile/p-mul.c
@@ -50,9 +50,6 @@ void sample(void * arg, ulong count)
     else if (algorithm == 1)
         for (i = 0; i < count; i++)
             fmpz_mat_mul_classical(C, A, B);
-    else if (algorithm == 2)
-        for (i = 0; i < count; i++)
-            fmpz_mat_mul_classical_inline(C, A, B);
     else if (algorithm == 3)
         for (i = 0; i < count; i++)
             fmpz_mat_mul_multi_mod(C, A, B);
@@ -71,7 +68,7 @@ void sample(void * arg, ulong count)
 
 int main(void)
 {
-    double min_default, min_classical, min_inline, min_multi_mod, min_strassen, max;
+    double min_default, min_classical, min_multi_mod, min_strassen, max;
     mat_mul_t params;
     slong bits, dim;
 
@@ -93,28 +90,22 @@ int main(void)
             params.algorithm = 1;
             prof_repeat(&min_classical, &max, sample, &params);
 
-            params.algorithm = 2;
-            prof_repeat(&min_inline, &max, sample, &params);
-
             params.algorithm = 3;
             prof_repeat(&min_multi_mod, &max, sample, &params);
 
             params.algorithm = 4;
             prof_repeat(&min_strassen, &max, sample, &params);
 
-            flint_printf("dim = %wd default/classical/inline/multi_mod/strassen %.2f %.2f %.2f %.2f %.2f (us)\n",
-                dim, min_default, min_classical, min_inline, min_multi_mod, min_strassen);
+            flint_printf("dim = %wd default/classical/multi_mod/strassen %.2f %.2f %.2f %.2f (us)\n",
+                dim, min_default, min_classical, min_multi_mod, min_strassen);
 
             if (min_multi_mod < 0.6*min_default)
                 flint_printf("BAD!\n");
 
-            if (min_inline < 0.6*min_default)
-                flint_printf("BAD!\n");
-
             if (min_strassen < 0.7*min_default)
                 flint_printf("BAD!\n");
 
-            if (min_multi_mod < 0.7*min_inline)
+            if (min_multi_mod < 0.7*min_classical)
                 break;
         }
     }
diff --git a/src/fmpz_mat/profile/p-mul_double_word.c b/src/fmpz_mat/profile/p-mul_double_word.c
index 2631b056af..38c11f354e 100644
--- a/src/fmpz_mat/profile/p-mul_double_word.c
+++ b/src/fmpz_mat/profile/p-mul_double_word.c
@@ -71,7 +71,7 @@ int main(void)
 
             if (dim < 150)
             {
-                fmpz_mat_mul_classical_inline(D, A, B);
+                fmpz_mat_mul_classical(D, A, B);
 
                 if (!fmpz_mat_equal(D, E))
                 {
diff --git a/src/fmpz_mat/profile/p-mul_multi_mod.c b/src/fmpz_mat/profile/p-mul_multi_mod.c
index b6cf3167c9..135a4d1e4a 100644
--- a/src/fmpz_mat/profile/p-mul_multi_mod.c
+++ b/src/fmpz_mat/profile/p-mul_multi_mod.c
@@ -61,7 +61,7 @@ int main(void)
 
             if (dim < 150)
             {
-                fmpz_mat_mul_classical_inline(D, A, B);
+                fmpz_mat_mul_classical(D, A, B);
 
                 if (!fmpz_mat_equal(D, E))
                 {
diff --git a/src/fmpz_mat/test/t-mul.c b/src/fmpz_mat/test/t-mul.c
index ba71570382..2456038115 100644
--- a/src/fmpz_mat/test/t-mul.c
+++ b/src/fmpz_mat/test/t-mul.c
@@ -55,7 +55,7 @@ TEST_FUNCTION_START(fmpz_mat_mul, state)
         fmpz_mat_randtest(C, state, n_randint(state, 200) + 1);
 
         fmpz_mat_mul(C, A, B);
-        fmpz_mat_mul_classical_inline(D, A, B);
+        fmpz_mat_mul_classical(D, A, B);
 
         if (!fmpz_mat_equal(C, D))
         {
@@ -169,8 +169,8 @@ TEST_FUNCTION_START(fmpz_mat_mul, state)
         if (!fmpz_mat_equal(A, B))
         {
             flint_printf("FAIL: window aliasing failed\n");
-	    fmpz_mat_print(A); flint_printf("\n\n");
-	    fmpz_mat_print(B); flint_printf("\n\n");
+            fmpz_mat_print(A); flint_printf("\n\n");
+            fmpz_mat_print(B); flint_printf("\n\n");
             fflush(stdout);
             flint_abort();
         }
diff --git a/src/fmpz_mat/test/t-mul_blas.c b/src/fmpz_mat/test/t-mul_blas.c
index cb856966c6..d06b2a5c26 100644
--- a/src/fmpz_mat/test/t-mul_blas.c
+++ b/src/fmpz_mat/test/t-mul_blas.c
@@ -44,7 +44,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_blas, state)
         /* Make sure noise in the output is ok */
         fmpz_mat_randtest(C, state, n_randint(state, 200) + 1);
 
-        fmpz_mat_mul_classical_inline(C, A, B);
+        fmpz_mat_mul_classical(C, A, B);
         if (fmpz_mat_mul_blas(D, A, B))
         {
             if (!fmpz_mat_equal(C, D))
@@ -92,7 +92,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_blas, state)
         fmpz_mat_randtest(C, state, n_randint(state, 200) + 1);
         fmpz_mat_randtest(D, state, n_randint(state, 200) + 1);
 
-        fmpz_mat_mul_classical_inline(C, A, B);
+        fmpz_mat_mul_classical(C, A, B);
         if (fmpz_mat_mul_blas(D, A, B))
         {
             if (!fmpz_mat_equal(C, D))
diff --git a/src/fmpz_mat/test/t-mul_classical.c b/src/fmpz_mat/test/t-mul_classical.c
index 679ce9494e..e28bbab32c 100644
--- a/src/fmpz_mat/test/t-mul_classical.c
+++ b/src/fmpz_mat/test/t-mul_classical.c
@@ -19,6 +19,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_classical, state)
     for (i = 0; i < 100 * flint_test_multiplier(); i++)
     {
         slong m, n, k;
+        slong row, col, h;  /* indices for the reference product; named so they do not shadow the iteration counter i */
 
         m = n_randint(state, 50);
         n = n_randint(state, 50);
@@ -36,7 +37,13 @@ TEST_FUNCTION_START(fmpz_mat_mul_classical, state)
         fmpz_mat_randtest(C, state, n_randint(state, 200) + 1);
 
         fmpz_mat_mul_classical(C, A, B);
+
+        for (row = 0; row < D->r; row++)  /* schoolbook reference product accumulated into D; NOTE(review): assumes D is still all-zero from its init */
+            for (col = 0; col < D->c; col++)
+                for (h = 0; h < B->r; h++)
+                    fmpz_addmul(fmpz_mat_entry(D, row, col),
+                        fmpz_mat_entry(A, row, h),
+                        fmpz_mat_entry(B, h, col));
 
         if (!fmpz_mat_equal(C, D))
         {
diff --git a/src/fmpz_mat/test/t-mul_double_word.c b/src/fmpz_mat/test/t-mul_double_word.c
index 0d63157a11..93d3f9b355 100644
--- a/src/fmpz_mat/test/t-mul_double_word.c
+++ b/src/fmpz_mat/test/t-mul_double_word.c
@@ -87,7 +87,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_double_word, state)
         fmpz_mat_randtest(C, state, n_randint(state, 200) + 1);
 
         _fmpz_mat_mul_double_word(C, A, B);
-        fmpz_mat_mul_classical_inline(D, A, B);
+        fmpz_mat_mul_classical(D, A, B);
 
         if (!fmpz_mat_equal(C, D))
         {
diff --git a/src/fmpz_mat/test/t-mul_fft.c b/src/fmpz_mat/test/t-mul_fft.c
index dc60e0ac89..56fc9fabdf 100644
--- a/src/fmpz_mat/test/t-mul_fft.c
+++ b/src/fmpz_mat/test/t-mul_fft.c
@@ -38,7 +38,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_fft, state)
         fmpz_mat_randtest(C, state, n_randint(state, 2000) + 1);
 
         fmpz_mat_mul_fft(C, A, B);
-        fmpz_mat_mul_classical_inline(D, A, B);
+        fmpz_mat_mul_classical(D, A, B);
 
         if (!fmpz_mat_equal(C, D))
         {
diff --git a/src/fmpz_mat/test/t-mul_multi_mod.c b/src/fmpz_mat/test/t-mul_multi_mod.c
index 07df123128..24ecf6098d 100644
--- a/src/fmpz_mat/test/t-mul_multi_mod.c
+++ b/src/fmpz_mat/test/t-mul_multi_mod.c
@@ -36,7 +36,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_multi_mod, state)
         /* Make sure noise in the output is ok */
         fmpz_mat_randtest(C, state, n_randint(state, 200) + 1);
 
-        fmpz_mat_mul_classical_inline(C, A, B);
+        fmpz_mat_mul_classical(C, A, B);
         fmpz_mat_mul_multi_mod(D, A, B);
 
         if (!fmpz_mat_equal(C, D))
@@ -71,7 +71,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_multi_mod, state)
         /* Make sure noise in the output is ok */
         fmpz_mat_randtest(C, state, n_randint(state, 200) + 1);
 
-        fmpz_mat_mul_classical_inline(C, A, B);
+        fmpz_mat_mul_classical(C, A, B);
         fmpz_mat_mul_multi_mod(D, A, B);
 
         if (!fmpz_mat_equal(C, D))
diff --git a/src/fmpz_mat/test/t-mul_small.c b/src/fmpz_mat/test/t-mul_small.c
index 1db309bc00..2bda60b825 100644
--- a/src/fmpz_mat/test/t-mul_small.c
+++ b/src/fmpz_mat/test/t-mul_small.c
@@ -47,7 +47,7 @@ TEST_FUNCTION_START(fmpz_mat_mul_small, state)
         fmpz_mat_randtest(D, state, n_randint(state, 200) + 1);
 
         _fmpz_mat_mul_small(C, A, B);
-        fmpz_mat_mul_classical_inline(D, A, B);
+        fmpz_mat_mul_classical(D, A, B);
 
         if (!fmpz_mat_equal(C, D))
         {

From 6c0afe742cd7c51f72942e5071c65af11cbe502d Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 25 Jan 2024 13:30:27 +0100
Subject: [PATCH 7/8] documentation

---
 doc/source/fmpz_vec.rst | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/doc/source/fmpz_vec.rst b/doc/source/fmpz_vec.rst
index 23957dcc3f..1c7f3d8c78 100644
--- a/doc/source/fmpz_vec.rst
+++ b/doc/source/fmpz_vec.rst
@@ -408,13 +408,22 @@ Gaussian content
 Dot product
 --------------------------------------------------------------------------------
 
+.. function:: void _fmpz_vec_dot_general_naive(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * a, const fmpz * b, int reverse, slong len)
+              void _fmpz_vec_dot_general(fmpz_t res, const fmpz_t initial, int subtract, const fmpz * a, const fmpz * b, int reverse, slong len)
+
+    Computes the dot product of the vectors *a* and *b*, setting
+    *res* to `s + (-1)^{subtract} \sum_{i=0}^{len-1} a_i b_i`.
+    The initial term *s* is given by *initial*; it is optional and can be
+    omitted by passing *NULL* (equivalently, `s = 0`).
+    The parameter *subtract* must be 0 or 1.
+    If the *reverse* flag is 1, the second vector is read in reverse order, i.e. `a_i` is multiplied by `b_{len-1-i}`.
+
+    Aliasing is allowed between ``res`` and ``initial`` but not
+    between ``res`` and the entries of ``a`` and ``b``.
+
+    The *naive* version is used for testing purposes.
 
 .. function:: void _fmpz_vec_dot(fmpz_t res, const fmpz * vec1, const fmpz * vec2, slong len2)
 
     Sets ``res`` to the dot product of ``(vec1, len2)`` and
     ``(vec2, len2)``.
-
-.. function:: void _fmpz_vec_dot_ptr(fmpz_t res, const fmpz * vec1, fmpz ** const vec2, slong offset, slong len)
-
-    Sets ``res`` to the dot product of ``len`` values at ``vec1`` and the
-    ``len`` values ``vec2[i] + offset`` for `0 \leq i < len`.

From 5f283d7fd57e8668a6e6875ae40eb9c527de6bfd Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 25 Jan 2024 14:18:06 +0100
Subject: [PATCH 8/8] re-enable special case

---
 src/fmpz_vec/dot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fmpz_vec/dot.c b/src/fmpz_vec/dot.c
index 3c1f86dbfc..548c71bf2a 100644
--- a/src/fmpz_vec/dot.c
+++ b/src/fmpz_vec/dot.c
@@ -164,7 +164,7 @@ _fmpz_vec_dot_general(fmpz_t res, const fmpz_t initial, int subtract,
 
     slong i;
 
-    if (len <= 1 && initial == NULL)
+    if (len <= 1)
     {
         if (initial == NULL)
         {