diff --git a/src/fp/relic_fp_smb.c b/src/fp/relic_fp_smb.c index 785e173c8..0ff9d5266 100644 --- a/src/fp/relic_fp_smb.c +++ b/src/fp/relic_fp_smb.c @@ -37,6 +37,129 @@ /* Private definitions */ /*============================================================================*/ +#if FP_SMB == BINAR || !defined(STRIP) +/** + * Approach heavily inpired in the blst implementation of the algorithm. + */ +static dig_t porninstep(dis_t m[4],const dig_t f[2], const dig_t g[2], + dig_t k, size_t s) { + dig_t limbx, ai = 1, bi = 0, ci = 0, di = 1; + dig_t g_lo = g[0], g_hi = g[1], f_lo = f[0], f_hi = f[1]; + dig_t t_lo, t_hi, odd, borrow, xorm; + + /* Unrolling twice gives some small speedup. */ + for (size_t i = 0; i < s; i+=2) { + odd = 0 - (g_lo & 1); + + /* g_ -= f_ if g_ is odd */ + t_lo = g_lo, t_hi = g_hi; + + borrow = 0; + limbx = g_lo - (f_lo & odd); + borrow = (g_lo < limbx); + g_lo = limbx; + + limbx = g_hi - (f_hi & odd); + xorm = limbx - borrow; + borrow = -((g_hi < limbx) || (borrow && !limbx)); + g_hi = xorm; + + k += ((t_lo & f_lo) >> 1) & borrow; + + /* negate g_-f_ if it borrowed */ + g_lo ^= borrow; + g_hi ^= borrow; + limbx = g_lo + (borrow & 1); + g_hi += (g_lo < limbx); + g_lo = limbx; + + /* f_=g_ if g_-f_ borrowed */ + f_lo = ((t_lo ^ f_lo) & borrow) ^ f_lo; + f_hi = ((t_hi ^ f_hi) & borrow) ^ f_hi; + + /* exchange ai and ci if g_-f_ borrowed */ + xorm = (ai ^ ci) & borrow; + ai ^= xorm; + ci ^= xorm; + + /* exchange bi and di if g_-f_ borrowed */ + xorm = (bi ^ di) & borrow; + bi ^= xorm; + di ^= xorm; + + /* subtract if g_ was odd */ + ai -= ci & odd; + bi -= di & odd; + + ci <<= 1; + di <<= 1; + g_lo >>= 1; + g_lo |= g_hi << (RLC_DIG - 1); + g_hi >>= 1; + + k += (f_lo + 2) >> 2; + + odd = 0 - (g_lo & 1); + + /* g_ -= f_ if g_ is odd */ + t_lo = g_lo, t_hi = g_hi; + + borrow = 0; + limbx = g_lo - (f_lo & odd); + borrow = (g_lo < limbx); + g_lo = limbx; + + limbx = g_hi - (f_hi & odd); + xorm = limbx - borrow; + borrow = -((g_hi < limbx) || (borrow && !limbx)); + g_hi = xorm; + + k += ((t_lo & f_lo) >> 1) & borrow; + + /* negate g_-f_ if it borrowed */ + g_lo ^= borrow; + g_hi ^= borrow; + limbx = g_lo + (borrow & 1); + g_hi += (g_lo < limbx); + g_lo = limbx; + + /* f_=g_ if g_-f_ borrowed */ + f_lo = ((t_lo ^ f_lo) & borrow) ^ f_lo; + f_hi = ((t_hi ^ f_hi) & borrow) ^ f_hi; + + /* exchange ai and ci if g_-f_ borrowed */ + xorm = (ai ^ ci) & borrow; + ai ^= xorm; + ci ^= xorm; + + /* exchange bi and di if g_-f_ borrowed */ + xorm = (bi ^ di) & borrow; + bi ^= xorm; + di ^= xorm; + + /* subtract if g_ was odd */ + ai -= ci & odd; + bi -= di & odd; + + ci <<= 1; + di <<= 1; + g_lo >>= 1; + g_lo |= g_hi << (RLC_DIG - 1); + g_hi >>= 1; + + k += (f_lo + 2) >> 2; + } + + m[0] = ai; + m[1] = bi; + m[2] = ci; + m[3] = di; + + return k; +} + +#endif + #if FP_SMB == JMPDS || !defined(STRIP) static dis_t jumpdivstep(dis_t m[4], dig_t *k, dis_t delta, dis_t y, dis_t x, @@ -147,198 +270,91 @@ int fp_smb_basic(const fp_t a) { #if FP_SMB == BINAR || !defined(STRIP) -static inline dig_t is_zero(dig_t l) { - l = ~l & (l - 1); - return (l >> (RLC_DIG - 1)); -} - -static dig_t lshift_2(dig_t hi, dig_t lo, size_t l) { - size_t r = RLC_DIG - l; - dig_t mask = 0 - (is_zero(l) ^ 1); - return (hi << (l & (RLC_DIG - 1))) | ((lo & mask) >> (r & (RLC_DIG - 1))); -} - -static void ab_approximation_n(dig_t a_[2], const dig_t a[], - dig_t b_[2], const dig_t b[]) { - dig_t a_hi, a_lo, b_hi, b_lo, mask; - size_t i; - - i = RLC_FP_DIGS - 1; - a_hi = a[i], a_lo = a[i - 1]; - b_hi = b[i], b_lo = b[i - 1]; - for (int j = i - 1; j >= 0; j--) { - mask = 0 - is_zero(a_hi | b_hi); - a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi; - b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi; - a_lo = ((a[j] ^ a_lo) & mask) ^ a_lo; - b_lo = ((b[j] ^ b_lo) & mask) ^ b_lo; - } - i = RLC_DIG - util_bits_dig(a_hi | b_hi); - /* |i| can be RLC_DIG if all a[2..]|b[2..] were zeros */ - - a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i); - b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i); -} - -static dig_t smul_n_shift_n(dig_t ret[], const dig_t a[], dig_t *f_, - const dig_t b[], dig_t *g_) { - dig_t a_[RLC_FP_DIGS + 1], b_[RLC_FP_DIGS + 1], f, g, neg, carry, hi; - size_t i; - - /* |a|*|f_| */ - f = *f_; - neg = 0 - RLC_SIGN(f); - f = (f ^ neg) - neg; /* ensure |f| is positive */ - bn_negs_low(a_, a, -neg, RLC_FP_DIGS); - hi = fp_mul1_low(a_, a_, f); - a_[RLC_FP_DIGS] = hi - (f & neg); - - /* |b|*|g_| */ - g = *g_; - neg = 0 - RLC_SIGN(g); - g = (g ^ neg) - neg; /* ensure |g| is positive */ - bn_negs_low(b_, b, -neg, RLC_FP_DIGS); - hi = fp_mul1_low(b_, b_, g); - b_[RLC_FP_DIGS] = hi - (g & neg); - - /* |a|*|f_| + |b|*|g_| */ - (void)bn_addn_low(a_, a_, b_, RLC_FP_DIGS + 1); - - /* (|a|*|f_| + |b|*|g_|) >> k */ - for (carry = a_[0], i = 0; i < RLC_FP_DIGS; i++) { - hi = carry >> (RLC_DIG - 2); - carry = a_[i + 1]; - ret[i] = hi | (carry << 2); - } - - /* ensure result is non-negative, fix up |f_| and |g_| accordingly */ - neg = 0 - RLC_SIGN(carry); - *f_ = (*f_ ^ neg) - neg; - *g_ = (*g_ ^ neg) - neg; - bn_negs_low(ret, ret, -neg, RLC_FP_DIGS); - - return neg; -} - -/* - * Copy of inner_loop_n above, but with |L| updates. - */ -static dig_t legendre_loop_n(dig_t l, dig_t m[4], const dig_t a_[2], - const dig_t b_[2], size_t n) { - dig_t limbx, f0 = 1, g0 = 0, f1 = 0, g1 = 1; - dig_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; - - a_lo = a_[0], a_hi = a_[1]; - b_lo = b_[0], b_hi = b_[1]; - - while (n--) { - odd = 0 - (a_lo & 1); - - /* a_ -= b_ if a_ is odd */ - t_lo = a_lo, t_hi = a_hi; - - borrow = 0; - limbx = a_lo - (b_lo & odd); - borrow = (a_lo < limbx); - a_lo = limbx; - - limbx = a_hi - (b_hi & odd); - xorm = limbx - borrow; - borrow = -((a_hi < limbx) || (borrow && !limbx)); - a_hi = xorm; - - l += ((t_lo & b_lo) >> 1) & borrow; - - /* negate a_-b_ if it borrowed */ - a_lo ^= borrow; - a_hi ^= borrow; - limbx = a_lo + (borrow & 1); - a_hi += (a_lo < limbx); - a_lo = limbx; - - /* b_=a_ if a_-b_ borrowed */ - b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; - b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; - - /* exchange f0 and f1 if a_-b_ borrowed */ - xorm = (f0 ^ f1) & borrow; - f0 ^= xorm; - f1 ^= xorm; - - /* exchange g0 and g1 if a_-b_ borrowed */ - xorm = (g0 ^ g1) & borrow; - g0 ^= xorm; - g1 ^= xorm; - - /* subtract if a_ was odd */ - f0 -= f1 & odd; - g0 -= g1 & odd; - - f1 <<= 1; - g1 <<= 1; - a_lo >>= 1; - a_lo |= a_hi << (RLC_DIG - 1); - a_hi >>= 1; - - l += (b_lo + 2) >> 2; - } - - m[0] = f0; - m[1] = g0; - m[2] = f1; - m[3] = g1; - - return l; -} +#define RLC_LSH(H, L, I) \ + (H << I) | (L & -(I != 0)) >> ((RLC_DIG - I) & (RLC_DIG - 1)) int fp_smb_binar(const fp_t a) { const size_t s = RLC_DIG - 2; - dv_t x, y, t; - dig_t a_[2], b_[2], neg, l = 0, m[4]; - bn_t _t; + dv_t f, g, t, t0, t1; + dig_t g_[2], f_[2], neg, l, g_hi, g_lo, f_hi, f_lo, mask, k = 0; + dis_t m[4]; int iterations = 2 * RLC_FP_DIGS * RLC_DIG; if (fp_is_zero(a)) { return 0; } - bn_null(_t); - dv_null(x); - dv_null(y); + dv_null(f); + dv_null(g); dv_null(t); + dv_null(t0); + dv_null(t1); RLC_TRY { - bn_new(_t); - dv_new(x); - dv_new(y); + dv_new(f); + dv_new(g); dv_new(t); + dv_new(t0); + dv_new(t1); + + dv_zero(t, 2 * RLC_FP_DIGS); + dv_copy(f, fp_prime_get(), RLC_FP_DIGS); +#if FP_RDC == MONTY + /* Convert a from Montgomery form. */ + fp_copy(t, a); + fp_rdcn_low(g, t); +#else + fp_copy(g, a); +#endif - fp_prime_back(_t, a); - dv_zero(x, RLC_FP_DIGS); - dv_copy(x, _t->dp, _t->used); - dv_copy(y, fp_prime_get(), RLC_FP_DIGS); - - for (size_t i = 0; i < iterations / s; i++) { - ab_approximation_n(a_, x, b_, y); - l = legendre_loop_n(l, m, a_, b_, s); - neg = smul_n_shift_n(t, x, &m[0], y, &m[1]); - (void)smul_n_shift_n(y, x, &m[2], y, &m[3]); - fp_copy(x, t); - l += (y[0] >> 1) & neg; + for (size_t len, i = 0; i < iterations / s; i++) { + f_hi = f[RLC_FP_DIGS - 1], f_lo = f[RLC_FP_DIGS - 2]; + g_hi = g[RLC_FP_DIGS - 1], g_lo = g[RLC_FP_DIGS - 2]; + for (int j = RLC_FP_DIGS - 2; j >= 0; j--) { + l = (f_hi | g_hi); + l = ~l & (l - 1); + mask = -(l >> (RLC_DIG - 1)); + f_hi = ((f_lo ^ f_hi) & mask) ^ f_hi; + g_hi = ((g_lo ^ g_hi) & mask) ^ g_hi; + f_lo = ((f[j] ^ f_lo) & mask) ^ f_lo; + g_lo = ((g[j] ^ g_lo) & mask) ^ g_lo; + } + len = RLC_DIG - util_bits_dig(f_hi | g_hi); + f_[0] = f[0], f_[1] = RLC_LSH(f_hi, f_lo, len); + g_[0] = g[0], g_[1] = RLC_LSH(g_hi, g_lo, len); + + k = porninstep(m, f_, g_, k, s); + + t0[RLC_FP_DIGS] = bn_muls_low(t0, g, RLC_POS, m[0], RLC_FP_DIGS); + t1[RLC_FP_DIGS] = bn_muls_low(t1, f, RLC_POS, m[1], RLC_FP_DIGS); + bn_addn_low(t0, t0, t1, RLC_FP_DIGS + 1); + neg = RLC_SIGN(t0[RLC_FP_DIGS]); + bn_rshb_low(t, t0, RLC_FP_DIGS + 1, (RLC_DIG - 2)); + bn_negs_low(t, t, neg, RLC_FP_DIGS); + + t0[RLC_FP_DIGS] = bn_muls_low(t0, g, RLC_POS, m[2], RLC_FP_DIGS); + t1[RLC_FP_DIGS] = bn_muls_low(t1, f, RLC_POS, m[3], RLC_FP_DIGS); + bn_addn_low(t1, t1, t0, RLC_FP_DIGS + 1); + bn_rshb_low(f, t1, RLC_FP_DIGS + 1, (RLC_DIG - 2)); + bn_negs_low(f, f, RLC_SIGN(t1[RLC_FP_DIGS]), RLC_FP_DIGS); + + fp_copy(g, t); + k += (f[0] >> 1) & neg; } - l = legendre_loop_n(l, m, x, y, iterations % s); + k = porninstep(m, g, f, k, iterations % s); } RLC_CATCH_ANY { RLC_THROW(ERR_CAUGHT) } RLC_FINALLY { - bn_free(_t); - dv_free(x); - dv_free(y); + dv_free(f); + dv_free(g); dv_free(t); + dv_free(t0); + dv_free(t1); } - return (l & 1 ? -1 : 1); + return (k & 1 ? -1 : 1); } #endif