From 43ad9ef0398b847547f89d2e23894f92949b3192 Mon Sep 17 00:00:00 2001
From: "Diego F. Aranha" <dfaranha@gmail.com>
Date: Thu, 9 May 2024 18:36:20 +0200
Subject: [PATCH] Refactor Pornin's algorithm.

---
 src/fp/relic_fp_smb.c | 354 ++++++++++++++++++++++--------------------
 1 file changed, 185 insertions(+), 169 deletions(-)

diff --git a/src/fp/relic_fp_smb.c b/src/fp/relic_fp_smb.c
index 785e173c8..0ff9d5266 100644
--- a/src/fp/relic_fp_smb.c
+++ b/src/fp/relic_fp_smb.c
@@ -37,6 +37,129 @@
 /* Private definitions                                                        */
 /*============================================================================*/
 
+#if FP_SMB == BINAR || !defined(STRIP)
+/**
+ * Approach heavily inpired in the blst implementation of the algorithm.
+ */
+static dig_t porninstep(dis_t m[4],const dig_t f[2], const dig_t g[2], 
+		dig_t k, size_t s) {
+	dig_t limbx, ai = 1, bi = 0, ci = 0, di = 1;
+	dig_t g_lo = g[0], g_hi = g[1], f_lo = f[0], f_hi = f[1];
+	dig_t t_lo, t_hi, odd, borrow, xorm;
+
+	/* Unrolling twice gives some small speedup. */
+	for (size_t i = 0; i < s; i+=2) {
+		odd = 0 - (g_lo & 1);
+
+		/* g_ -= f_ if g_ is odd */
+		t_lo = g_lo, t_hi = g_hi;
+
+		borrow = 0;
+		limbx = g_lo - (f_lo & odd);
+		borrow = (g_lo < limbx);
+		g_lo = limbx;
+
+		limbx = g_hi - (f_hi & odd);
+		xorm = limbx - borrow;
+		borrow = -((g_hi < limbx) || (borrow && !limbx));
+		g_hi = xorm;
+
+		k += ((t_lo & f_lo) >> 1) & borrow;
+
+		/* negate g_-f_ if it borrowed */
+		g_lo ^= borrow;
+		g_hi ^= borrow;
+		limbx = g_lo + (borrow & 1);
+		g_hi += (g_lo < limbx);
+		g_lo = limbx;
+
+		/* f_=g_ if g_-f_ borrowed */
+		f_lo = ((t_lo ^ f_lo) & borrow) ^ f_lo;
+		f_hi = ((t_hi ^ f_hi) & borrow) ^ f_hi;
+
+		/* exchange ai and ci if g_-f_ borrowed */
+		xorm = (ai ^ ci) & borrow;
+		ai ^= xorm;
+		ci ^= xorm;
+
+		/* exchange bi and di if g_-f_ borrowed */
+		xorm = (bi ^ di) & borrow;
+		bi ^= xorm;
+		di ^= xorm;
+
+		/* subtract if g_ was odd */
+		ai -= ci & odd;
+		bi -= di & odd;
+
+		ci <<= 1;
+		di <<= 1;
+		g_lo >>= 1;
+		g_lo |= g_hi << (RLC_DIG - 1);
+		g_hi >>= 1;
+
+		k += (f_lo + 2) >> 2;
+
+		odd = 0 - (g_lo & 1);
+
+		/* g_ -= f_ if g_ is odd */
+		t_lo = g_lo, t_hi = g_hi;
+
+		borrow = 0;
+		limbx = g_lo - (f_lo & odd);
+		borrow = (g_lo < limbx);
+		g_lo = limbx;
+
+		limbx = g_hi - (f_hi & odd);
+		xorm = limbx - borrow;
+		borrow = -((g_hi < limbx) || (borrow && !limbx));
+		g_hi = xorm;
+
+		k += ((t_lo & f_lo) >> 1) & borrow;
+
+		/* negate g_-f_ if it borrowed */
+		g_lo ^= borrow;
+		g_hi ^= borrow;
+		limbx = g_lo + (borrow & 1);
+		g_hi += (g_lo < limbx);
+		g_lo = limbx;
+
+		/* f_=g_ if g_-f_ borrowed */
+		f_lo = ((t_lo ^ f_lo) & borrow) ^ f_lo;
+		f_hi = ((t_hi ^ f_hi) & borrow) ^ f_hi;
+
+		/* exchange ai and ci if g_-f_ borrowed */
+		xorm = (ai ^ ci) & borrow;
+		ai ^= xorm;
+		ci ^= xorm;
+
+		/* exchange bi and di if g_-f_ borrowed */
+		xorm = (bi ^ di) & borrow;
+		bi ^= xorm;
+		di ^= xorm;
+
+		/* subtract if g_ was odd */
+		ai -= ci & odd;
+		bi -= di & odd;
+
+		ci <<= 1;
+		di <<= 1;
+		g_lo >>= 1;
+		g_lo |= g_hi << (RLC_DIG - 1);
+		g_hi >>= 1;
+
+		k += (f_lo + 2) >> 2;
+	}
+
+	m[0] = ai;
+	m[1] = bi;
+	m[2] = ci;
+	m[3] = di;
+
+	return k;
+}
+
+#endif
+
 #if FP_SMB == JMPDS || !defined(STRIP)
 
 static dis_t jumpdivstep(dis_t m[4], dig_t *k, dis_t delta, dis_t y, dis_t x, 
@@ -147,198 +270,91 @@ int fp_smb_basic(const fp_t a) {
 
 #if FP_SMB == BINAR || !defined(STRIP)
 
-static inline dig_t is_zero(dig_t l) {
-	l = ~l & (l - 1);
-	return (l >> (RLC_DIG - 1));
-}
-
-static dig_t lshift_2(dig_t hi, dig_t lo, size_t l) {
-	size_t r = RLC_DIG - l;
-	dig_t mask = 0 - (is_zero(l) ^ 1);
-	return (hi << (l & (RLC_DIG - 1))) | ((lo & mask) >> (r & (RLC_DIG - 1)));
-}
-
-static void ab_approximation_n(dig_t a_[2], const dig_t a[],
-		dig_t b_[2], const dig_t b[]) {
-	dig_t a_hi, a_lo, b_hi, b_lo, mask;
-	size_t i;
-
-	i = RLC_FP_DIGS - 1;
-	a_hi = a[i], a_lo = a[i - 1];
-	b_hi = b[i], b_lo = b[i - 1];
-	for (int j = i - 1; j >= 0; j--) {
-		mask = 0 - is_zero(a_hi | b_hi);
-		a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi;
-		b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi;
-		a_lo = ((a[j] ^ a_lo) & mask) ^ a_lo;
-		b_lo = ((b[j] ^ b_lo) & mask) ^ b_lo;
-	}
-	i = RLC_DIG - util_bits_dig(a_hi | b_hi);
-	/* |i| can be RLC_DIG if all a[2..]|b[2..] were zeros */
-
-	a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i);
-	b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i);
-}
-
-static dig_t smul_n_shift_n(dig_t ret[], const dig_t a[], dig_t *f_,
-		const dig_t b[], dig_t *g_) {
-	dig_t a_[RLC_FP_DIGS + 1], b_[RLC_FP_DIGS + 1], f, g, neg, carry, hi;
-	size_t i;
-
-	/* |a|*|f_| */
-	f = *f_;
-	neg = 0 - RLC_SIGN(f);
-	f = (f ^ neg) - neg;		/* ensure |f| is positive */
-	bn_negs_low(a_, a, -neg, RLC_FP_DIGS);
-	hi = fp_mul1_low(a_, a_, f);
-	a_[RLC_FP_DIGS] = hi - (f & neg);
-
-	/* |b|*|g_| */
-	g = *g_;
-	neg = 0 - RLC_SIGN(g);
-	g = (g ^ neg) - neg;		/* ensure |g| is positive */
-	bn_negs_low(b_, b, -neg, RLC_FP_DIGS);
-	hi = fp_mul1_low(b_, b_, g);
-	b_[RLC_FP_DIGS] = hi - (g & neg);
-
-	/* |a|*|f_| + |b|*|g_| */
-	(void)bn_addn_low(a_, a_, b_, RLC_FP_DIGS + 1);
-
-	/* (|a|*|f_| + |b|*|g_|) >> k */
-	for (carry = a_[0], i = 0; i < RLC_FP_DIGS; i++) {
-		hi = carry >> (RLC_DIG - 2);
-		carry = a_[i + 1];
-		ret[i] = hi | (carry << 2);
-	}
-
-	/* ensure result is non-negative, fix up |f_| and |g_| accordingly */
-	neg = 0 - RLC_SIGN(carry);
-	*f_ = (*f_ ^ neg) - neg;
-	*g_ = (*g_ ^ neg) - neg;
-	bn_negs_low(ret, ret, -neg, RLC_FP_DIGS);
-
-	return neg;
-}
-
-/*
- * Copy of inner_loop_n above, but with |L| updates.
- */
-static dig_t legendre_loop_n(dig_t l, dig_t m[4], const dig_t a_[2],
-		const dig_t b_[2], size_t n) {
-	dig_t limbx, f0 = 1, g0 = 0, f1 = 0, g1 = 1;
-	dig_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm;
-
-	a_lo = a_[0], a_hi = a_[1];
-	b_lo = b_[0], b_hi = b_[1];
-
-	while (n--) {
-		odd = 0 - (a_lo & 1);
-
-		/* a_ -= b_ if a_ is odd */
-		t_lo = a_lo, t_hi = a_hi;
-
-		borrow = 0;
-		limbx = a_lo - (b_lo & odd);
-		borrow = (a_lo < limbx);
-		a_lo = limbx;
-
-		limbx = a_hi - (b_hi & odd);
-		xorm = limbx - borrow;
-		borrow = -((a_hi < limbx) || (borrow && !limbx));
-		a_hi = xorm;
-
-		l += ((t_lo & b_lo) >> 1) & borrow;
-
-		/* negate a_-b_ if it borrowed */
-		a_lo ^= borrow;
-		a_hi ^= borrow;
-		limbx = a_lo + (borrow & 1);
-		a_hi += (a_lo < limbx);
-		a_lo = limbx;
-
-		/* b_=a_ if a_-b_ borrowed */
-		b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo;
-		b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi;
-
-		/* exchange f0 and f1 if a_-b_ borrowed */
-		xorm = (f0 ^ f1) & borrow;
-		f0 ^= xorm;
-		f1 ^= xorm;
-
-		/* exchange g0 and g1 if a_-b_ borrowed */
-		xorm = (g0 ^ g1) & borrow;
-		g0 ^= xorm;
-		g1 ^= xorm;
-
-		/* subtract if a_ was odd */
-		f0 -= f1 & odd;
-		g0 -= g1 & odd;
-
-		f1 <<= 1;
-		g1 <<= 1;
-		a_lo >>= 1;
-		a_lo |= a_hi << (RLC_DIG - 1);
-		a_hi >>= 1;
-
-		l += (b_lo + 2) >> 2;
-	}
-
-	m[0] = f0;
-	m[1] = g0;
-	m[2] = f1;
-	m[3] = g1;
-
-	return l;
-}
+#define RLC_LSH(H, L, I)													\
+		(H << I) | (L & -(I != 0)) >> ((RLC_DIG - I) & (RLC_DIG - 1))
 
 int fp_smb_binar(const fp_t a) {
 	const size_t s = RLC_DIG - 2;
-	dv_t x, y, t;
-	dig_t a_[2], b_[2], neg, l = 0, m[4];
-	bn_t _t;
+	dv_t f, g, t, t0, t1;
+	dig_t g_[2], f_[2], neg, l, g_hi, g_lo, f_hi, f_lo, mask, k = 0;
+	dis_t m[4];
 	int iterations = 2 * RLC_FP_DIGS * RLC_DIG;
 
 	if (fp_is_zero(a)) {
 		return 0;
 	}
 
-	bn_null(_t);
-	dv_null(x);
-	dv_null(y);
+	dv_null(f);
+	dv_null(g);
 	dv_null(t);
+	dv_null(t0);
+	dv_null(t1);
 
 	RLC_TRY {
-		bn_new(_t);
-		dv_new(x);
-		dv_new(y);
+		dv_new(f);
+		dv_new(g);
 		dv_new(t);
+		dv_new(t0);
+		dv_new(t1);
+
+		dv_zero(t, 2 * RLC_FP_DIGS);
+		dv_copy(f, fp_prime_get(), RLC_FP_DIGS);
+#if FP_RDC == MONTY
+		/* Convert a from Montgomery form. */
+		fp_copy(t, a);
+		fp_rdcn_low(g, t);
+#else
+		fp_copy(g, a);
+#endif
 
-		fp_prime_back(_t, a);
-		dv_zero(x, RLC_FP_DIGS);
-		dv_copy(x, _t->dp, _t->used);
-		dv_copy(y, fp_prime_get(), RLC_FP_DIGS);
-
-		for (size_t i = 0; i < iterations / s; i++) {
-			ab_approximation_n(a_, x, b_, y);
-			l = legendre_loop_n(l, m, a_, b_, s);
-			neg = smul_n_shift_n(t, x, &m[0], y, &m[1]);
-			(void)smul_n_shift_n(y, x, &m[2], y, &m[3]);
-			fp_copy(x, t);
-			l += (y[0] >> 1) & neg;
+		for (size_t len, i = 0; i < iterations / s; i++) {
+			f_hi = f[RLC_FP_DIGS - 1], f_lo = f[RLC_FP_DIGS - 2];
+			g_hi = g[RLC_FP_DIGS - 1], g_lo = g[RLC_FP_DIGS - 2];
+			for (int j = RLC_FP_DIGS - 2; j >= 0; j--) {
+				l = (f_hi | g_hi);
+				l = ~l & (l - 1);
+				mask = -(l >> (RLC_DIG - 1));
+				f_hi = ((f_lo ^ f_hi) & mask) ^ f_hi;
+				g_hi = ((g_lo ^ g_hi) & mask) ^ g_hi;
+				f_lo = ((f[j] ^ f_lo) & mask) ^ f_lo;
+				g_lo = ((g[j] ^ g_lo) & mask) ^ g_lo;
+			}
+			len = RLC_DIG - util_bits_dig(f_hi | g_hi);
+			f_[0] = f[0], f_[1] = RLC_LSH(f_hi, f_lo, len);
+			g_[0] = g[0], g_[1] = RLC_LSH(g_hi, g_lo, len);
+
+			k = porninstep(m, f_, g_, k, s);
+
+			t0[RLC_FP_DIGS] = bn_muls_low(t0, g, RLC_POS, m[0], RLC_FP_DIGS);
+			t1[RLC_FP_DIGS] = bn_muls_low(t1, f, RLC_POS, m[1], RLC_FP_DIGS);
+			bn_addn_low(t0, t0, t1, RLC_FP_DIGS + 1);
+			neg = RLC_SIGN(t0[RLC_FP_DIGS]);
+			bn_rshb_low(t, t0, RLC_FP_DIGS + 1, (RLC_DIG - 2));
+			bn_negs_low(t, t, neg, RLC_FP_DIGS);
+
+			t0[RLC_FP_DIGS] = bn_muls_low(t0, g, RLC_POS, m[2], RLC_FP_DIGS);
+			t1[RLC_FP_DIGS] = bn_muls_low(t1, f, RLC_POS, m[3], RLC_FP_DIGS);
+			bn_addn_low(t1, t1, t0, RLC_FP_DIGS + 1);
+			bn_rshb_low(f, t1, RLC_FP_DIGS + 1, (RLC_DIG - 2));
+			bn_negs_low(f, f, RLC_SIGN(t1[RLC_FP_DIGS]), RLC_FP_DIGS);
+
+			fp_copy(g, t);
+			k += (f[0] >> 1) & neg;
 		}
 
-		l = legendre_loop_n(l, m, x, y, iterations % s);
+		k = porninstep(m, g, f, k, iterations % s);
 
 	} RLC_CATCH_ANY {
 		RLC_THROW(ERR_CAUGHT)
 	} RLC_FINALLY {
-		bn_free(_t);
-		dv_free(x);
-		dv_free(y);
+		dv_free(f);
+		dv_free(g);
 		dv_free(t);
+		dv_free(t0);
+		dv_free(t1);
 	}
 
-	return (l & 1 ? -1 : 1);
+	return (k & 1 ? -1 : 1);
 }
 
 #endif