ks-multiply.c

/* Polynomial multiplication using GMP's integer multiplication code

Copyright 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Dave Newman,
Paul Zimmermann, Alexander Kruppa.

This file is part of the ECM Library.

The ECM Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The ECM Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the ECM Library; see the file COPYING.LIB.  If not, see
http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#include <stdlib.h>
#include "ecm-gmp.h" /* for MPZ_REALLOC and MPN_COPY */
#include "ecm-impl.h"

#if defined(HAVE___GMPN_MULMOD_BNM1) && defined(HAVE___GMPN_MULMOD_BNM1_NEXT_SIZE)
#define FFT_WRAP /* use the wrap-around trick */
#endif

/* Copy at r+i*s the content of A[i*stride] for 0 <= i < l
   Assume all A[i*stride] are non-negative, and their size is <= s.
 */
static void
pack (mp_ptr r, mpz_t *A, mp_size_t l, mp_size_t stride, mp_size_t s)
{
  mp_size_t i, j, m;

  for (i = 0, j = 0; i < l; i++, j += stride, r += s)
    {
      m = SIZ(A[j]);
      ASSERT((0 <= m) && (m <= s));
      if (m)
        MPN_COPY (r, PTR(A[j]), m);
      if (m < s)
        MPN_ZERO (r + m, s - m);
    }
}

/* put in R[i*stride] for 0 <= i < l the content of {t+i*s, s} */
void
unpack (mpz_t *R, mp_size_t stride, mp_ptr t, mp_size_t l, mp_size_t s)
{
  mp_size_t i, j, size_tmp;
  mp_ptr r_ptr;

  for (i = 0, j = 0; i < l; i++, t += s, j += stride)
    {
      size_tmp = s;
      MPN_NORMALIZE(t, size_tmp); /* compute the actual size */
      r_ptr = MPZ_REALLOC (R[j], size_tmp);
      if (size_tmp)
        MPN_COPY (r_ptr, t, size_tmp);
      SIZ(R[j]) = size_tmp;
    }
}

/* R <- A * B where A = A[0] + A[1]*x + ... + A[n-1]*x^(n-1), idem for B */
void
list_mul_n_basecase (listz_t R, listz_t A, listz_t B, unsigned int n)
{
  unsigned int i, j;

  if (n == 1)
    {
      mpz_mul (R[0], A[0], B[0]);
      return;
    }

  for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
      {
        if (i == 0 || j == n - 1)
          mpz_mul (R[i+j], A[i], B[j]);
        else
          mpz_addmul (R[i+j], A[i], B[j]);
      }
}

static void
list_mul_n_kara2 (listz_t R, listz_t A, listz_t B)
{
  mpz_add (R[0], A[0], A[1]);
  mpz_add (R[2], B[0], B[1]);
  mpz_mul (R[1], R[0], R[2]);
  mpz_mul (R[0], A[0], B[0]);
  mpz_mul (R[2], A[1], B[1]);
  mpz_sub (R[1], R[1], R[0]);
  mpz_sub (R[1], R[1], R[2]);
}

/* R[0..4] <- A[0..2] * B[0..2] in 7 multiplies */
static void
list_mul_n_kara3 (listz_t R, listz_t A, listz_t B, listz_t T)
{
  mpz_add (T[0], A[0], A[2]);
  mpz_add (R[0], B[0], B[2]);
  mpz_mul (R[2], T[0], R[0]); /* (A0+A2)*(B0+B2) */
  mpz_mul (R[3], T[0], B[1]); /* (A0+A2)*B1 */
  mpz_mul (R[4], A[1], R[0]); /* A1*(B0+B2) */
  mpz_add (R[3], R[3], R[4]); /* (A0+A2)*B1+A1*(B0+B2) */
  list_mul_n_kara2 (T, A, B);
  mpz_sub (R[2], R[2], T[0]); /* A0*A2+A2*B0+A2*B2 */
  mpz_sub (R[3], R[3], T[1]); /* A2*B1+A1*B2 */
  mpz_add (R[2], R[2], T[2]); /* A0*A2+A2*B0+A2*B2+A1*B1 */
  mpz_swap (R[0], T[0]);      /* A0*B0 */
  mpz_swap (R[1], T[1]);      /* A0*B1+A1*B0 */
  mpz_mul (R[4], A[2], B[2]); /* A2*B2 */
  mpz_sub (R[2], R[2], R[4]); /* A0*A2+A2*B0+A1*B1 */
}

/* Assume n >= 2. T is a scratch space of enough entries. */
static void
list_mul_n_karatsuba_aux (listz_t R, listz_t A, listz_t B, unsigned int n,
                          listz_t T)
{
  unsigned int h, l;

  if (n == 1)
    {
      list_mul_n_basecase (R, A, B, n);
      return;
    }

  if (n == 2)
    {
      list_mul_n_kara2 (R, A, B);
      return;
    }

  if (n == 3)
    {
      list_mul_n_kara3 (R, A, B, T);
      return;
    }

  h = n / 2;
  l = n - h;
  list_add (R, A, A + l, h);
  list_add (R + l, B, B + l, h);
  if (h < l)
    {
      mpz_set (R[h], A[h]);
      mpz_set (R[l + h], B[h]);
    }
  list_mul_n_karatsuba_aux (T, R, R + l, l, T + 2 * l - 1);
  list_mul_n_karatsuba_aux (R, A, B, l, T + 2 * l - 1);
  /* {R,2l-1} = Al * Bl */
  list_mul_n_karatsuba_aux (R + 2 * l, A + l, B + l, h, T + 2 * l - 1);
  /* {R+2l,2h-1} = Ah * Bh */
  /* T will contain Al*Bh+Ah*Bl, it thus suffices to compute its low n-1
     coefficients */
  list_sub (T, T, R, n - 1);
  list_sub (T, T, R + 2 * l, 2 * h - 1);
  mpz_set_ui (R[2 * l - 1], 0);
  list_add (R + l, R + l, T, n - 1);
}

static unsigned int
list_mul_n_mem (unsigned int n)
{
  if (n == 1)
    return 0;
  else
    {
      unsigned int k = (n + 1) / 2;
      return 2 * k - 1 + list_mul_n_mem (k);
    }
}

void
list_mul_n_karatsuba (listz_t R, listz_t A, listz_t B, unsigned int n)
{
  listz_t T;
  unsigned int s;

  s = list_mul_n_mem (n);
  T = init_list (s);
  list_mul_n_karatsuba_aux (R, A, B, n, T);
  clear_list (T, s);
}

/* Classical one-point Kronecker-Schoenhage substitution.
   Notes:
    - this code aligns the coeffs at limb boundaries - if instead we aligned
      at byte boundaries then we could save up to 3*n bytes,
      but tests have shown this doesn't give any significant speed increase,
      even for large degree polynomials.
     - this code requires that all coefficients A[] and B[] are nonnegative. */
void
list_mul_n_KS1 (listz_t R, listz_t A, listz_t B, unsigned int l)
{
  unsigned long i;
  mp_size_t s, t = 0, size_t0;
  mp_ptr t0_ptr, t1_ptr, t2_ptr;

  /* compute the largest bit-size t of the A[i] and B[i] */
  for (i = 0; i < l; i++)
    {
      if ((s = mpz_sizeinbase (A[i], 2)) > t)
        t = s;
      if ((s = mpz_sizeinbase (B[i], 2)) > t)
        t = s;
    }
  /* For n > 0, s = sizeinbase (n, 2)     ==> n < 2^s. 
     For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
     Hence all A[i], B[i] < 2^t */
  
  /* Each coeff of A(x)*B(x) < l * 2^(2*t), so max number of bits in a 
     coeff of the product will be 2 * t + ceil(log_2(l)) */
  s = 2 * t;
  for (i = l; i > 1; s++, i = (i + 1) >> 1);
  
  /* work out the corresponding number of limbs */
  s = 1 + (s - 1) / GMP_NUMB_BITS;

  size_t0 = s * l;

  /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
  t0_ptr = (mp_ptr) malloc (4 * size_t0 * sizeof (mp_limb_t));
  if (t0_ptr == NULL)
    {
      outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
      exit (1);
    }
  t1_ptr = t0_ptr + size_t0;
  t2_ptr = t1_ptr + size_t0;
    
  pack (t0_ptr, A, l, 1, s);
  pack (t1_ptr, B, l, 1, s);

  mpn_mul_n (t2_ptr, t0_ptr, t1_ptr, size_t0);

  unpack (R, 1, t2_ptr, 2 * l - 1, s);

  free (t0_ptr);
}

/* Two-point Kronecker substitition.
   Reference: Algorithm 2 from "Faster polynomial multiplication via multipoint
   Kronecker substitution", David Harvey, Journal of Symbolic Computation,
   number 44 (2009), pages 1502-1510.
   Assume n >= 2.
   Notes:
    - this code aligns the coeffs at limb boundaries - if instead we aligned
      at byte boundaries then we could save up to 3*n bytes,
      but tests have shown this doesn't give any significant speed increase,
      even for large degree polynomials.
     - this code requires that all coefficients A[] and B[] are nonnegative.
*/
void
list_mul_n_KS2 (listz_t R, listz_t A, listz_t B, unsigned int n)
{
  unsigned long i;
  mp_size_t s, s2, t = 0, l, h, ns2;
  mp_ptr tmp, A0, A1, B0, B1, C0, C1;
  int sA, sB;

  ASSERT_ALWAYS (n >= 2);

  /* compute the largest bit-size t of the A[i] and B[i] */
  for (i = 0; i < n; i++)
    {
      if ((s = mpz_sizeinbase (A[i], 2)) > t)
        t = s;
      if ((s = mpz_sizeinbase (B[i], 2)) > t)
        t = s;
    }
  /* For n > 0, s = sizeinbase (n, 2)     ==> n < 2^s. 
     For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s.
     Hence all A[i], B[i] < 2^t */
  
  /* Each coeff of A(x)*B(x) < n * 2^(2*t), so max number of bits in a 
     coeff of the product will be 2 * t + ceil(log_2(n)) */
  s = 2 * t;
  for (i = n; i > 1; s++, i = (i + 1) >> 1);
  
  /* work out the corresponding number of limbs */
  s = 1 + (s - 1) / GMP_NUMB_BITS;

  /* ensure s is even */
  s = s + (s & 1);
  s2 = s >> 1;
  ns2 = n * s2;

  l = n / 2;
  h = n - l;

  /* allocate a single buffer to save malloc/MPN_ZERO/free calls */
  tmp = (mp_ptr) malloc (8 * ns2 * sizeof (mp_limb_t));
  if (tmp == NULL)
    {
      outputf (OUTPUT_ERROR, "Out of memory in list_mult_n()\n");
      exit (1);
    }

  A0 = tmp;
  A1 = A0 + ns2;
  B0 = A1 + ns2;
  B1 = B0 + ns2;
  C0 = B1 + ns2;
  C1 = C0 + 2 * ns2;

  pack (A0, A, h, 2, s); /* A0 = Aeven(S) where S = 2^(s*GMP_NUMB_BITS) */
  /* A0 has in fact only n * s2 significant limbs:
     if n=2h, h*s = n*s2
     if n=2h-1, the last chunk from A0 has at most s2 limbs */
  MPN_ZERO(B0, s2);
  pack (B0 + s2, A + 1, l, 2, s);
  /* for the same reason as above, we have at most l*s-s2 significant limbs
     at B0+s2, thus at most l*s <= n*s2 at B0 */
  if ((sA = mpn_cmp (A0, B0, ns2)) >= 0)
    mpn_sub_n (A1, A0, B0, ns2);
  else
    mpn_sub_n (A1, B0, A0, ns2);
  mpn_add_n (A0, A0, B0, ns2);
  /* now A0 is X+ with the notations of Algorithm, A1 is sA*X- */

  pack (B0, B, h, 2, s);
  MPN_ZERO(C0, s2);
  pack (C0 + s2, B + 1, l, 2, s);
  if ((sB = mpn_cmp (B0, C0, ns2)) >= 0)
    mpn_sub_n (B1, B0, C0, ns2);
  else
    mpn_sub_n (B1, C0, B0, ns2);
  mpn_add_n (B0, B0, C0, ns2);
  /* B0 is Y+, B1 is sB*Y- with the notations of Algorithm 2 */

  mpn_mul_n (C0, A0, B0, ns2); /* C0 is Z+ = X+ * Y+ */
  mpn_mul_n (C1, A1, B1, ns2); /* C1 is sA * sB * Z- */

  if (sA * sB >= 0)
    {
      mpn_add_n (A0, C0, C1, 2 * ns2);
      mpn_sub_n (B0, C0, C1, 2 * ns2);
    }
  else
    {
      mpn_sub_n (A0, C0, C1, 2 * ns2);
      mpn_add_n (B0, C0, C1, 2 * ns2);
    }
  mpn_rshift (A0, A0, 4 * ns2, 1); /* global division by 2 */

  /* If A[] and B[] have n coefficients, the product has 2n-1 coefficients.
     The even part has n coefficients and the odd part n-1 coefficients */
  unpack (R, 2, A0, n, s);
  unpack (R + 1, 2, B0 + s2, n - 1, s);

  free (tmp);
}

/* Puts in R[0..2n-2] the product of A[0..n-1] and B[0..n-1], seen as
   polynomials.
*/
void
list_mult_n (listz_t R, listz_t A, listz_t B, unsigned int n)
{
  int T[TUNE_LIST_MUL_N_MAX_SIZE] = LIST_MUL_TABLE, best;

  /* See tune_list_mul_n() in tune.c:
     0 : list_mul_n_basecase
     2 : list_mul_n_KS1
     3 : list_mul_n_KS2 */
  best = (n < TUNE_LIST_MUL_N_MAX_SIZE) ? T[n] : 3;

  if (best == 0)
    list_mul_n_basecase (R, A, B, n);
  else if (best == 1)
    list_mul_n_karatsuba (R, A, B, n);
  else if (best == 2)
    list_mul_n_KS1 (R, A, B, n);
  else
    list_mul_n_KS2 (R, A, B, n);
}

/* Given a[0..m] and c[0..l], puts in b[0..n] the coefficients
   of degree m to n+m of rev(a)*c, i.e.
   b[0] = a[0]*c[0] + ... + a[i]*c[i] with i = min(m, l)
   ...
   b[k] = a[0]*c[k] + ... + a[i]*c[i+k] with i = min(m, l-k)
   ...
   b[n] = a[0]*c[n] + ... + a[i]*c[i+n] with i = min(m, l-n) [=l-n].

   If rev=0, consider a instead of rev(a).

   Assumes n <= l.

   Return non-zero if an error occurred.

   low(b) is the coefficients of degree 0 to m-1 of a*c (or rev(a)*c)
   mid(b) is the coefficients of degree m to m+n of a*c
   high(b) is the coefficients of degree m+n+1 to m+l+1 of a*c
*/

int
TMulKS (listz_t b, unsigned int n, listz_t a, unsigned int m,
        listz_t c, unsigned int l, mpz_t modulus, int rev)
{
  unsigned long i, s = 0, t;
  mp_ptr ap, bp, cp;
  mp_size_t an, bn, cn;
  int ret = 0; /* default return value */
#ifdef DEBUG
  long st = cputime ();
  fprintf (ECM_STDOUT, "n=%u m=%u l=%u bits=%u n*bits=%u: ", n, m, l,
	   mpz_sizeinbase (modulus, 2), n * mpz_sizeinbase (modulus, 2));
#endif

  ASSERT (n <= l); /* otherwise the upper coefficients of b are 0 */
  if (l > n + m)
    l = n + m; /* otherwise, c has too many coeffs */

  /* make coefficients a[] and c[] non-negative and compute max #bits */
  for (i = 0; i <= m; i++)
    {
      if (mpz_sgn (a[i]) < 0)
        mpz_mod (a[i], a[i], modulus);
      if ((t = mpz_sizeinbase (a[i], 2)) > s)
        s = t;
    }
  for (i = 0; i <= l; i++)
    {
      if (mpz_sgn (c[i]) < 0)
        mpz_mod (c[i], c[i], modulus);
      if ((t = mpz_sizeinbase (c[i], 2)) > s)
        s = t;
    }

#ifdef FFT_WRAP
  s ++; /* need one extra bit to prevent carry of low(b) + high(b) */
#endif

  /* max coeff has 2*s+ceil(log2(min(m+1,l+1))) bits,
   i.e. 2*s + 1 + floor(log2(min(m,l))) */
  for (s = 2 * s, i = (m < l) ? m : l; i; s++, i >>= 1);

  /* corresponding number of limbs */
  s = 1 + (s - 1) / GMP_NUMB_BITS;

  an = (m + 1) * s;
  cn = (l + 1) * s;
  bn = an + cn;

  /* a[0..m] needs (m+1) * s limbs */
  ap = (mp_ptr) malloc (an * sizeof (mp_limb_t));
  if (ap == NULL)
    {
      ret = 1;
      goto TMulKS_end;
    }
  cp = (mp_ptr) malloc (cn * sizeof (mp_limb_t));
  if (cp == NULL)
    {
      ret = 1;
      goto TMulKS_free_ap;
    }

  MPN_ZERO (ap, an);
  MPN_ZERO (cp, cn);

  /* a is reverted */
  for (i = 0; i <= m; i++)
    if (SIZ(a[i]))
      MPN_COPY (ap + ((rev) ? (m - i) : i) * s, PTR(a[i]), SIZ(a[i]));
  for (i = 0; i <= l; i++)
    if (SIZ(c[i]))
      MPN_COPY (cp + i * s, PTR(c[i]), SIZ(c[i]));

#ifdef FFT_WRAP
  /* the product rev(a) * c has m+l+1 coefficients.
     We throw away the first m and the last l-n <= m.
     If we compute mod (m+n+1) * s limbs, we are ok */
  bn = mpn_mulmod_bnm1_next_size ((m + n + 1) * s);
  
  bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
  if (bp == NULL)
    {
      ret = 1;
      goto TMulKS_free_cp;
    }
  {
    mp_ptr tp;
    tp = (mp_ptr) malloc ((2 * bn + 4) * sizeof (mp_limb_t));
    if (tp == NULL)
      {
        ret = 1;
        goto TMulKS_free_cp;
      }
    /* mpn_mulmod_bnm1 requires that the first operand is larger */
    if (an >= cn)
      mpn_mulmod_bnm1 (bp, bn, ap, an, cp, cn, tp);
    else
      mpn_mulmod_bnm1 (bp, bn, cp, cn, ap, an, tp);
    free (tp);
  }
#else /* FFT_WRAP is not defined */
  bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
  if (bp == NULL)
    {
      ret = 1;
      goto TMulKS_free_cp;
    }
  if (an >= cn)
    mpn_mul (bp, ap, an, cp, cn);
  else
    mpn_mul (bp, cp, cn, ap, an);
#endif

  /* recover coefficients of degree m to n+m of product in b[0..n] */
  bp += m * s;
  for (i = 0; i <= n; i++)
    {
      t = s;
      MPN_NORMALIZE(bp, t);
      MPZ_REALLOC (b[i], (mp_size_t) t);
      if (t)
        MPN_COPY (PTR(b[i]), bp, t);
      SIZ(b[i]) = t;
      bp += s;
    }
  bp -= (m + n + 1) * s;

  free (bp);
 TMulKS_free_cp:
  free (cp);
 TMulKS_free_ap:
  free (ap);

#ifdef DEBUG
  fprintf (ECM_STDOUT, "%ldms\n", elltime (st, cputime ()));
#endif
  
 TMulKS_end:
  return ret;
}

unsigned int
ks_wrapmul_m (unsigned int m0, unsigned int k, mpz_t n)
{
#ifdef FFT_WRAP
  mp_size_t t, s;
  unsigned long i, m;

  t = mpz_sizeinbase (n, 2);
  s = t * 2 + 1;
  for (i = k - 1; i; s++, i >>= 1);
  s = 1 + (s - 1) / GMP_NUMB_BITS;
  i = mpn_mulmod_bnm1_next_size (m0 * s);
  while (i % s)
    i = mpn_mulmod_bnm1_next_size (i + 1);
  m = i / s;
  return m;
#else
  return ~ (unsigned int) 0;
#endif
}

/* multiply in R[] A[0]+A[1]*x+...+A[k-1]*x^(k-1)
                by B[0]+B[1]*x+...+B[l-1]*x^(l-1) modulo n,
   wrapping around coefficients of the product up from degree m >= m0.
   Assumes k >= l.
   R is assumed to have 2*m0-3+list_mul_mem(m0-1) allocated cells.
   Return m (or 0 if an error occurred).
*/
unsigned int
ks_wrapmul (listz_t R, unsigned int m0,
            listz_t A, unsigned int k,
            listz_t B, unsigned int l,
	    mpz_t n)
{
#ifndef FFT_WRAP
  ASSERT_ALWAYS(0); /* ks_wrapmul should not be called in that case */
  return 0;
#else
  unsigned long i, m, t;
  mp_size_t s, size_t0, size_t1, size_tmp;
  mp_ptr t0_ptr, t1_ptr, t2_ptr, r_ptr, tp;

  ASSERT(k >= l);

  t = mpz_sizeinbase (n, 2);
  for (i = 0; i < k; i++)
    if (mpz_sgn (A[i]) < 0 || mpz_sizeinbase (A[i], 2) > t)
      mpz_mod (A[i], A[i], n);
  for (i = 0; i < l; i++)
    if (mpz_sgn (B[i]) < 0 || mpz_sizeinbase (B[i], 2) > t)
      mpz_mod (B[i], B[i], n);
  
  s = t * 2 + 1; /* one extra sign bit */
  for (i = k - 1; i; s++, i >>= 1);
  
  s = 1 + (s - 1) / GMP_NUMB_BITS;

  size_t0 = s * k;
  size_t1 = s * l;

  /* allocate one double-buffer to save malloc/MPN_ZERO/free calls */
  t0_ptr = (mp_ptr) malloc (size_t0 * sizeof (mp_limb_t));
  if (t0_ptr == NULL)
    return 0;
  t1_ptr = (mp_ptr) malloc (size_t1 * sizeof (mp_limb_t));
  if (t1_ptr == NULL)
    {
      free (t0_ptr);
      return 0;
    }
    
  MPN_ZERO (t0_ptr, size_t0);
  MPN_ZERO (t1_ptr, size_t1);

  for (i = 0; i < k; i++)
    if (SIZ(A[i]))
      MPN_COPY (t0_ptr + i * s, PTR(A[i]), SIZ(A[i]));
  for (i = 0; i < l; i++)
    if (SIZ(B[i]))
      MPN_COPY (t1_ptr + i * s, PTR(B[i]), SIZ(B[i]));

  i = mpn_mulmod_bnm1_next_size (m0 * s);
  /* the following loop ensures we don't cut in the middle of a
     coefficient */
  while (i % s)
    i = mpn_mulmod_bnm1_next_size (i + 1);
  ASSERT(i % s == 0);
  m = i / s;
  ASSERT(m <= 2 * m0 - 3 + list_mul_mem (m0 - 1));

  t2_ptr = (mp_ptr) malloc ((i + 1) * sizeof (mp_limb_t));
  if (t2_ptr == NULL)
    {
      free (t0_ptr);
      free (t1_ptr);
      return 0;
    }

  {
    mp_ptr tp = malloc ((2 * i + 4) * sizeof (mp_limb_t));
    if (tp == NULL)
      {
        free (t0_ptr);
        free (t1_ptr);
        return 0;
      }
    mpn_mulmod_bnm1 (t2_ptr, i, t0_ptr, size_t0, t1_ptr, size_t1, tp);
    if ((mp_size_t) i > size_t0 + size_t1)
      MPN_ZERO(t2_ptr + size_t0 + size_t1, i - (size_t0 + size_t1));
    free (tp);
  }

  for (t = 0, tp = t2_ptr; t < m; t++, tp += s)
    {
      size_tmp = s;
      MPN_NORMALIZE(tp, size_tmp);
      r_ptr = MPZ_REALLOC (R[t], size_tmp);
      if (size_tmp)
        MPN_COPY (r_ptr, tp, size_tmp);
      SIZ(R[t]) = size_tmp;
    }

  free (t0_ptr);
  free (t1_ptr);
  free (t2_ptr);
  
  return m;
#endif /* FFT_WRAP */
}