From 29636b6e71b7ada46aace655b43748dee9dd5c18 Mon Sep 17 00:00:00 2001 From: 0T34 <0T34@protonmail.com> Date: Tue, 25 Jan 2022 17:35:17 -0300 Subject: [PATCH] Remove insecure optimizations --- Makefile | 26 +- config.h | 264 ------- fft.c | 694 ---------------- fpr.c | 1832 ------------------------------------------- fpr.h | 447 ----------- inner.h | 184 ----- rng.c | 102 --- shake.c | 589 -------------- sign.c | 260 ------ tests/test_falcon.c | 5 - 10 files changed, 13 insertions(+), 4390 deletions(-) delete mode 100644 config.h diff --git a/Makefile b/Makefile index a8e728c..c28e9ad 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,7 @@ # CC C compiler; GCC or Clang are fine; MSVC (2015+) works too. # CFLAGS Compilation flags: # * Optimization level -O2 or higher is recommended -# See config.h for some possible configuration macros. +# Configuration macros, where still supported, may be set here with -D flags. # LD Linker; normally the same command as the compiler. # LDFLAGS Linker options, not counting the extra libs. 
# LIBS Extra libraries for linking: @@ -67,44 +67,44 @@ tests/test_falcon: tests/test_falcon.o $(OBJ) tests/speed: tests/speed.o $(OBJ) $(LD) $(LDFLAGS) -o tests/speed tests/speed.o $(OBJ) $(LIBS) -codec.o: codec.c config.h inner.h fpr.h +codec.o: codec.c inner.h fpr.h $(CC) $(CFLAGS) -c -o codec.o codec.c -common.o: common.c config.h inner.h fpr.h +common.o: common.c inner.h fpr.h $(CC) $(CFLAGS) -c -o common.o common.c deterministic.o: deterministic.c deterministic.h falcon.h $(CC) $(CFLAGS) -c -o deterministic.o deterministic.c -falcon.o: falcon.c falcon.h config.h inner.h fpr.h +falcon.o: falcon.c falcon.h inner.h fpr.h $(CC) $(CFLAGS) -c -o falcon.o falcon.c -fft.o: fft.c config.h inner.h fpr.h +fft.o: fft.c inner.h fpr.h $(CC) $(CFLAGS) -c -o fft.o fft.c -fpr.o: fpr.c config.h inner.h fpr.h +fpr.o: fpr.c inner.h fpr.h $(CC) $(CFLAGS) -c -o fpr.o fpr.c -keygen.o: keygen.c config.h inner.h fpr.h +keygen.o: keygen.c inner.h fpr.h $(CC) $(CFLAGS) -c -o keygen.o keygen.c -rng.o: rng.c config.h inner.h fpr.h +rng.o: rng.c inner.h fpr.h $(CC) $(CFLAGS) -c -o rng.o rng.c -shake.o: shake.c config.h inner.h fpr.h +shake.o: shake.c inner.h fpr.h $(CC) $(CFLAGS) -c -o shake.o shake.c -sign.o: sign.c config.h inner.h fpr.h +sign.o: sign.c inner.h fpr.h $(CC) $(CFLAGS) -c -o sign.o sign.c tests/speed.o: tests/speed.c falcon.h $(CC) $(CFLAGS) -c -o tests/speed.o tests/speed.c -tests/test_falcon.o: tests/test_falcon.c falcon.h config.h inner.h fpr.h +tests/test_falcon.o: tests/test_falcon.c falcon.h inner.h fpr.h $(CC) $(CFLAGS) -c -o tests/test_falcon.o tests/test_falcon.c -tests/test_deterministic.o: tests/test_deterministic.c deterministic.h falcon.h config.h inner.h fpr.h +tests/test_deterministic.o: tests/test_deterministic.c deterministic.h falcon.h inner.h fpr.h $(CC) $(CFLAGS) -c -o tests/test_deterministic.o tests/test_deterministic.c -vrfy.o: vrfy.c config.h inner.h fpr.h +vrfy.o: vrfy.c inner.h fpr.h $(CC) $(CFLAGS) -c -o vrfy.o vrfy.c diff --git a/config.h 
b/config.h deleted file mode 100644 index ee2f083..0000000 --- a/config.h +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Manual configuration file for the Falcon implementation. Here can - * be set some compilation-time options. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2017-2019 Falcon Project - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifndef FALCON_CONFIG_H__ -#define FALCON_CONFIG_H__ - -/* - * Each option is a macro which should be defined to either 1 or 0. - * If any of the options below is left undefined, then a default value - * will be used by the code, possibly using compile-time autodetection - * from compiler-defined macros. 
- * - * Explicitly setting a parameter can be done by uncommenting/modifying - * its definition below, in this file, or equivalently by setting it as - * a compiler flag. - */ - -/* - * Emulated floating-point implementation. - * - * Emulation uses only integer operations with uint32_t and uint64_t - * types. This is constant-time, provided that the underlying platform - * offers constant-time opcodes for the following operations: - * - * - Multiplication of two 32-bit unsigned integers into a 64-bit result. - * - Left-shift or right-shift of a 32-bit unsigned integer by a - * potentially secret shift count in the 0..31 range. - * - * Notably, the ARM Cortex M3 does not fulfill the first condition, - * while the Pentium IV does not fulfill the second. - * - */ - -/* - * *** CRITICAL SECURITY WARNING ***: - * - * Here floating-point emulation is enabled in order to get reliable - * deterministic signing, because native floating-point units and code - * optimizations may yield slight discrepancies that could affect - * determinism. - * - * KEEPING FALCON_FPEMU ENABLED IS STRONGLY RECOMMENDED! Emulation may - * be needed for obtaining truly deterministic signing across - * different platforms and configurations, i.e., the same message - * should always yield the same signature (under the same secret key). - * - * Any non-determinism in signing can lead to a CATASTROPHIC SECURITY - * FAILURE, potentially enabling an attacker to create forgeries for - * arbitrary messages after obtaining two or more different signatures - * for the same message (under the same secret key). - * - * Determinism can be sanity-checked (but not guaranteed) using the - * provided KATs, via test_deterministic. Any deviation from the - * expected results indicates a lack of the desired determinism; - * however, agreement does not prove determinism for all possible - * inputs. - */ -#define FALCON_FPEMU 1 - - -/* - * Native 'double' C type for floating-point computations. 
Exact - * reproducibility of all tests requires that type to faithfully follow - * IEEE-754 "round-to-nearest" rules. - * - * Native double support will use the CPU hardware and/or - * compiler-provided functions; the latter is typically NOT - * constant-time, while the former MAY be constant-time, or not. On - * recent x86 CPU in 64-bit mode, SSE2 opcodes are used and they provide - * constant-time operations for all the operations used in Falcon, - * except for some special cases of divisions and square roots, but it - * can be shown that theses cases imply only negligible leak of - * information that cannot be leveraged into a full attack. - * - * If neither FALCON_FPNATIVE nor FALCON_FPEMU is defined, then use of - * the native 'double' C type is the default behaviour unless - * FALCON_ASM_CORTEXM4 is defined to 1, in which case the emulated code - * will be used. - * - */ - -/* - * For determinism, explicitly disable native floating-point - * operations. (These are already implicitly disabled by enabling - * FALCON_FPEMU above; here it is made explicit as a defensive - * measure.) - */ -#define FALCON_FPNATIVE 0 - - -/* - * Assembly for ARM Cortex-M4 CPU: by default, such support will be - * used based on some autodection on the compiler version and target - * architecture. Define this variable to 1 to force use of the - * assembly code, or 0 to disable it regardless of the autodetection. - * - * When FALCON_ASM_CORTEXM4 is enabled (whether defined explicitly or - * autodetected), emulated floating-point code will be used, unless - * FALCON_FPNATIVE or FALCON_FPEMU is explicitly set to override the - * choice. Emulated code with ARM assembly is constant-time and provides - * better performance than emulated code with plain C. - * - * The assembly code for the M4 can also work on a Cortex-M3. If the - * compiler is instructed to target the M3 (e.g. 
'-mcpu=cortex-m3' with - * GCC) then FALCON_ASM_CORTEXM4 won't be autodetected, but it can be - * enabled explicitly. Take care, though, that the M3 multiplication - * opcode (multiplication of two 32-bit unsigned integers with a 64-bit - * result) is NOT constant-time. - * - */ - -/* - * Explicitly disable the specialized assembly code for ARM Cortex-M4. - * - * While we are not aware of any way that the assembly code could lead - * to non-determinism, caution should be exercised if it is ever under - * consideration for usage. (At minimum, check KATs.) - */ -#define FALCON_ASM_CORTEXM4 0 - - -/* - * AVX2 intrinsics: if enabled, then the code will compile only when - * targeting x86 with a compiler that supports AVX2 intrinsics (tested - * with GCC 7.4.0, Clang 6.0.0, and MSVC 2015, both in 32-bit and - * 64-bit modes), and run only on systems that offer the AVX2 - * opcodes. Some operations leverage AVX2 for better performance. - * - */ - -/* - * Explicitly disable AVX2 optimizations. (These are already - * implicitly disabled by enabling FALCON_FPEMU above; here it is made - * explicit as a defensive measure.) - * - * While we are not aware of any way that AVX2 optimizations could - * lead to non-determinism, caution should be exercised if they are - * ever under consideration for usage. (At minimum, check KATs.) - */ -#define FALCON_AVX2 0 - - -/* - * FMA intrinsics: this setting has any effect only if FALCON_AVX2 is - * also enabled. The FMA intrinsics are normally available on any x86 - * CPU that also has AVX2. Note that setting this option will slightly - * modify the values of expanded private keys, but will normally not - * change the values of non-expanded private keys, public keys or - * signatures, for a given keygen/sign seed (non-expanded private keys - * and signatures might theoretically change, but only with low - * probability, less than 2^(-40); produced signatures are still safe - * and interoperable). 
- * - */ - -/* - * Explicitly disable FMA intrinsics, which (as mentioned above) are - * KNOWN to lead to non-determinism in some cases. (These are already - * implicitly disabled by enabling FALCON_FPEMU above; here it is made - * explicit as a defensive measure.) - * - * CRITICAL SECURITY WARNING: it is strongly recommended *NOT* to - * enable both FALCON_FPU and FALCON_FMA. Doing so is *KNOWN* to lead - * to non-determinism in some cases. - */ -#define FALCON_FMA 0 - - -/* - * Assert that the platform uses little-endian encoding. If enabled, - * then encoding and decoding of aligned multibyte values will be - * slightly faster (especially for hashing and random number - * generation). If not defined explicitly, then autodetection is - * applied. - * -#define FALCON_LE 1 - */ - -/* - * Assert that the platform tolerates accesses to unaligned multibyte - * values. If enabled, then some operations are slightly faster. Note - * that ARM Cortex M4 do _not_ fully tolerate unaligned accesses; for - * such systems, this option should not be enabled. If not defined - * explicitly, then autodetection is applied. - * -#define FALCON_UNALIGNED 1 - */ - -/* - * Use a PRNG based on ChaCha20 and seeded with SHAKE256, instead of - * SHAKE256 directly, for key pair generation purposes. This speeds up - * key pair generation, especially on platforms where SHAKE256 is - * comparatively slow: on the ARM Cortex M4, average key generation time - * is reduced by 19% with this setting; on a recent x86 Skylake, the - * reduction is smaller (less than 8%). - * - * However, this setting changes the private/public key pair obtained - * from a given seed, thus preventing reproducibility of the - * known-answer tests vectors. For compatibility with existing KAT - * vectors (e.g. in PQClean, pqm4 and NIST implementations), this - * setting is not enabled by default. 
- * -#define FALCON_KG_CHACHA20 1 - */ - -/* - * Use an explicit OS-provided source of randomness for seeding (for the - * Zf(get_seed)() function implementation). Three possible sources are - * defined: - * - * - getentropy() system call - * - /dev/urandom special file - * - CryptGenRandom() function call - * - * More than one source may be enabled, in which case they will be tried - * in the order above, until a success is reached. - * - * By default, sources are enabled at compile-time based on these - * conditions: - * - * - getentropy(): target is one of: Linux with Glibc-2.25+, FreeBSD 12+, - * or OpenBSD. - * - /dev/urandom: target is a Unix-like system (including Linux, - * FreeBSD, NetBSD, OpenBSD, DragonFly, macOS, Android, Solaris, AIX). - * - CryptGenRandom(): target is Windows (Win32 or Win64). - * - * On most small embedded systems, none will be enabled and Zf(get_seed)() - * will always return 0. Applications will need to provide their own seeds. - * -#define FALCON_RAND_GETENTROPY 1 -#define FALCON_RAND_URANDOM 1 -#define FALCON_RAND_WIN32 1 - */ - -#endif diff --git a/fft.c b/fft.c index b1904b2..564d96f 100644 --- a/fft.c +++ b/fft.c @@ -167,7 +167,6 @@ */ /* see inner.h */ -TARGET_AVX2 void Zf(FFT)(fpr *f, unsigned logn) { @@ -226,56 +225,6 @@ Zf(FFT)(fpr *f, unsigned logn) size_t j, j2; j2 = j1 + ht; -#if FALCON_AVX2 // yyyAVX2+1 - if (ht >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((m + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - z_re = _mm256_loadu_pd(&f[j+ht].v); - z_im = _mm256_loadu_pd(&f[j+ht + hn].v); - y_re = FMSUB(z_re, s_re, - _mm256_mul_pd(z_im, s_im)); - y_im = FMADD(z_re, s_im, - _mm256_mul_pd(z_im, s_re)); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - 
_mm256_add_pd(x_im, y_im)); - _mm256_storeu_pd(&f[j + ht].v, - _mm256_sub_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + ht + hn].v, - _mm256_sub_pd(x_im, y_im)); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((m + i1) << 1) + 0]; - s_im = fpr_gm_tab[((m + i1) << 1) + 1]; - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = f[j]; - x_im = f[j + hn]; - y_re = f[j + ht]; - y_im = f[j + ht + hn]; - FPC_MUL(y_re, y_im, - y_re, y_im, s_re, s_im); - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(f[j + ht], f[j + ht + hn], - x_re, x_im, y_re, y_im); - } - } -#else // yyyAVX2+0 fpr s_re, s_im; s_re = fpr_gm_tab[((m + i1) << 1) + 0]; @@ -293,14 +242,12 @@ Zf(FFT)(fpr *f, unsigned logn) FPC_SUB(f[j + ht], f[j + ht + hn], x_re, x_im, y_re, y_im); } -#endif // yyyAVX2- } t = ht; } } /* see inner.h */ -TARGET_AVX2 void Zf(iFFT)(fpr *f, unsigned logn) { @@ -361,56 +308,6 @@ Zf(iFFT)(fpr *f, unsigned logn) size_t j, j2; j2 = j1 + t; -#if FALCON_AVX2 // yyyAVX2+1 - if (t >= 4) { - __m256d s_re, s_im; - - s_re = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 0].v); - s_im = _mm256_set1_pd( - fpr_gm_tab[((hm + i1) << 1) + 1].v); - for (j = j1; j < j2; j += 4) { - __m256d x_re, x_im, y_re, y_im; - __m256d z_re, z_im; - - x_re = _mm256_loadu_pd(&f[j].v); - x_im = _mm256_loadu_pd(&f[j + hn].v); - y_re = _mm256_loadu_pd(&f[j+t].v); - y_im = _mm256_loadu_pd(&f[j+t + hn].v); - _mm256_storeu_pd(&f[j].v, - _mm256_add_pd(x_re, y_re)); - _mm256_storeu_pd(&f[j + hn].v, - _mm256_add_pd(x_im, y_im)); - x_re = _mm256_sub_pd(y_re, x_re); - x_im = _mm256_sub_pd(x_im, y_im); - z_re = FMSUB(x_im, s_im, - _mm256_mul_pd(x_re, s_re)); - z_im = FMADD(x_re, s_im, - _mm256_mul_pd(x_im, s_re)); - _mm256_storeu_pd(&f[j+t].v, z_re); - _mm256_storeu_pd(&f[j+t + hn].v, z_im); - } - } else { - fpr s_re, s_im; - - s_re = fpr_gm_tab[((hm + i1) << 1)+0]; - s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]); - for (j = j1; j < j2; j ++) { - fpr x_re, x_im, y_re, y_im; - - x_re = 
f[j]; - x_im = f[j + hn]; - y_re = f[j + t]; - y_im = f[j + t + hn]; - FPC_ADD(f[j], f[j + hn], - x_re, x_im, y_re, y_im); - FPC_SUB(x_re, x_im, - x_re, x_im, y_re, y_im); - FPC_MUL(f[j + t], f[j + t + hn], - x_re, x_im, s_re, s_im); - } - } -#else // yyyAVX2+0 fpr s_re, s_im; s_re = fpr_gm_tab[((hm + i1) << 1) + 0]; @@ -428,7 +325,6 @@ Zf(iFFT)(fpr *f, unsigned logn) FPC_MUL(f[j + t], f[j + t + hn], x_re, x_im, s_re, s_im); } -#endif // yyyAVX2- } t = dt; m = hm; @@ -449,7 +345,6 @@ Zf(iFFT)(fpr *f, unsigned logn) } /* see inner.h */ -TARGET_AVX2 void Zf(poly_add)( fpr *restrict a, const fpr *restrict b, unsigned logn) @@ -457,28 +352,12 @@ Zf(poly_add)( size_t n, u; n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_add_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_add(a[u], b[u]); - } - } -#else // yyyAVX2+0 for (u = 0; u < n; u ++) { a[u] = fpr_add(a[u], b[u]); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_sub)( fpr *restrict a, const fpr *restrict b, unsigned logn) @@ -486,86 +365,36 @@ Zf(poly_sub)( size_t n, u; n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_sub_pd( - _mm256_loadu_pd(&a[u].v), - _mm256_loadu_pd(&b[u].v))); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_sub(a[u], b[u]); - } - } -#else // yyyAVX2+0 for (u = 0; u < n; u ++) { a[u] = fpr_sub(a[u], b[u]); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_neg)(fpr *a, unsigned logn) { size_t n, u; n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = 0; u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 
for (u = 0; u < n; u ++) { a[u] = fpr_neg(a[u]); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_adj_fft)(fpr *a, unsigned logn) { size_t n, u; n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d s; - - s = _mm256_set1_pd(-0.0); - for (u = (n >> 1); u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s)); - } - } else { - for (u = (n >> 1); u < n; u ++) { - a[u] = fpr_neg(a[u]); - } - } -#else // yyyAVX2+0 for (u = (n >> 1); u < n; u ++) { a[u] = fpr_neg(a[u]); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_mul_fft)( fpr *restrict a, const fpr *restrict b, unsigned logn) @@ -574,34 +403,6 @@ Zf(poly_mul_fft)( n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMSUB( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMADD( - a_re, b_im, _mm256_mul_pd(a_im, b_re)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 for (u = 0; u < hn; u ++) { fpr a_re, a_im, b_re, b_im; @@ -611,11 +412,9 @@ Zf(poly_mul_fft)( b_im = b[u + hn]; FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_muladj_fft)( fpr *restrict a, const fpr *restrict b, unsigned logn) @@ -624,34 +423,6 @@ Zf(poly_muladj_fft)( n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = 
_mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = fpr_neg(b[u + hn]); - FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 for (u = 0; u < hn; u ++) { fpr a_re, a_im, b_re, b_im; @@ -661,11 +432,9 @@ Zf(poly_muladj_fft)( b_im = fpr_neg(b[u + hn]); FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn) { @@ -677,32 +446,6 @@ Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn) n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d zero; - - zero = _mm256_setzero_pd(); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im))); - _mm256_storeu_pd(&a[u + hn].v, zero); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - - a_re = a[u]; - a_im = a[u + hn]; - a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); - a[u + hn] = fpr_zero; - } - } -#else // yyyAVX2+0 for (u = 0; u < hn; u ++) { fpr a_re, a_im; @@ -711,40 +454,21 @@ Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn) a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)); a[u + hn] = fpr_zero; } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn) { size_t n, u; n = (size_t)1 << logn; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 4) { - __m256d x4; - - x4 = _mm256_set1_pd(x.v); - for (u = 0; u < n; u += 4) { - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v))); - } - } 
else { - for (u = 0; u < n; u ++) { - a[u] = fpr_mul(a[u], x); - } - } -#else // yyyAVX2+0 for (u = 0; u < n; u ++) { a[u] = fpr_mul(a[u], x); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_div_fft)( fpr *restrict a, const fpr *restrict b, unsigned logn) @@ -753,42 +477,6 @@ Zf(poly_div_fft)( n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im, t; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - t = _mm256_div_pd(one, - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im))); - b_re = _mm256_mul_pd(b_re, t); - b_im = _mm256_mul_pd(b_im, t); - c_re = FMADD( - a_re, b_re, _mm256_mul_pd(a_im, b_im)); - c_im = FMSUB( - a_im, b_re, _mm256_mul_pd(a_re, b_im)); - _mm256_storeu_pd(&a[u].v, c_re); - _mm256_storeu_pd(&a[u + hn].v, c_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im, b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); - } - } -#else // yyyAVX2+0 for (u = 0; u < hn; u ++) { fpr a_re, a_im, b_re, b_im; @@ -798,11 +486,9 @@ Zf(poly_div_fft)( b_im = b[u + hn]; FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_invnorm2_fft)(fpr *restrict d, const fpr *restrict a, const fpr *restrict b, unsigned logn) @@ -811,41 +497,6 @@ Zf(poly_invnorm2_fft)(fpr *restrict d, n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, b_re, b_im, dv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - b_re = _mm256_loadu_pd(&b[u].v); - b_im = _mm256_loadu_pd(&b[u + hn].v); - dv = _mm256_div_pd(one, - 
_mm256_add_pd( - FMADD(a_re, a_re, - _mm256_mul_pd(a_im, a_im)), - FMADD(b_re, b_re, - _mm256_mul_pd(b_im, b_im)))); - _mm256_storeu_pd(&d[u].v, dv); - } - } else { - for (u = 0; u < hn; u ++) { - fpr a_re, a_im; - fpr b_re, b_im; - - a_re = a[u]; - a_im = a[u + hn]; - b_re = b[u]; - b_im = b[u + hn]; - d[u] = fpr_inv(fpr_add( - fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), - fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); - } - } -#else // yyyAVX2+0 for (u = 0; u < hn; u ++) { fpr a_re, a_im; fpr b_re, b_im; @@ -858,11 +509,9 @@ Zf(poly_invnorm2_fft)(fpr *restrict d, fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)), fpr_add(fpr_sqr(b_re), fpr_sqr(b_im)))); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_add_muladj_fft)(fpr *restrict d, const fpr *restrict F, const fpr *restrict G, @@ -872,57 +521,6 @@ Zf(poly_add_muladj_fft)(fpr *restrict d, n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d F_re, F_im, G_re, G_im; - __m256d f_re, f_im, g_re, g_im; - __m256d a_re, a_im, b_re, b_im; - - F_re = _mm256_loadu_pd(&F[u].v); - F_im = _mm256_loadu_pd(&F[u + hn].v); - G_re = _mm256_loadu_pd(&G[u].v); - G_im = _mm256_loadu_pd(&G[u + hn].v); - f_re = _mm256_loadu_pd(&f[u].v); - f_im = _mm256_loadu_pd(&f[u + hn].v); - g_re = _mm256_loadu_pd(&g[u].v); - g_im = _mm256_loadu_pd(&g[u + hn].v); - - a_re = FMADD(F_re, f_re, - _mm256_mul_pd(F_im, f_im)); - a_im = FMSUB(F_im, f_re, - _mm256_mul_pd(F_re, f_im)); - b_re = FMADD(G_re, g_re, - _mm256_mul_pd(G_im, g_im)); - b_im = FMSUB(G_im, g_re, - _mm256_mul_pd(G_re, g_im)); - _mm256_storeu_pd(&d[u].v, - _mm256_add_pd(a_re, b_re)); - _mm256_storeu_pd(&d[u + hn].v, - _mm256_add_pd(a_im, b_im)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr F_re, F_im, G_re, G_im; - fpr f_re, f_im, g_re, g_im; - fpr a_re, a_im, b_re, b_im; - - F_re = F[u]; - F_im = F[u + hn]; - G_re = G[u]; - G_im = G[u + hn]; - f_re = f[u]; - f_im = f[u + hn]; - g_re = g[u]; - g_im = g[u + hn]; - - 
FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im)); - FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im)); - d[u] = fpr_add(a_re, b_re); - d[u + hn] = fpr_add(a_im, b_im); - } - } -#else // yyyAVX2+0 for (u = 0; u < hn; u ++) { fpr F_re, F_im, G_re, G_im; fpr f_re, f_im, g_re, g_im; @@ -942,11 +540,9 @@ Zf(poly_add_muladj_fft)(fpr *restrict d, d[u] = fpr_add(a_re, b_re); d[u + hn] = fpr_add(a_im, b_im); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_mul_autoadj_fft)( fpr *restrict a, const fpr *restrict b, unsigned logn) @@ -955,35 +551,13 @@ Zf(poly_mul_autoadj_fft)( n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - for (u = 0; u < hn; u += 4) { - __m256d a_re, a_im, bv; - - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - bv = _mm256_loadu_pd(&b[u].v); - _mm256_storeu_pd(&a[u].v, - _mm256_mul_pd(a_re, bv)); - _mm256_storeu_pd(&a[u + hn].v, - _mm256_mul_pd(a_im, bv)); - } - } else { - for (u = 0; u < hn; u ++) { - a[u] = fpr_mul(a[u], b[u]); - a[u + hn] = fpr_mul(a[u + hn], b[u]); - } - } -#else // yyyAVX2+0 for (u = 0; u < hn; u ++) { a[u] = fpr_mul(a[u], b[u]); a[u + hn] = fpr_mul(a[u + hn], b[u]); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_div_autoadj_fft)( fpr *restrict a, const fpr *restrict b, unsigned logn) @@ -992,30 +566,6 @@ Zf(poly_div_autoadj_fft)( n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d ib, a_re, a_im; - - ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v)); - a_re = _mm256_loadu_pd(&a[u].v); - a_im = _mm256_loadu_pd(&a[u + hn].v); - _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib)); - _mm256_storeu_pd(&a[u + hn].v, _mm256_mul_pd(a_im, ib)); - } - } else { - for (u = 0; u < hn; u ++) { - fpr ib; - - ib = fpr_inv(b[u]); - a[u] = fpr_mul(a[u], ib); - a[u + hn] = fpr_mul(a[u + hn], ib); - } - } -#else // yyyAVX2+0 for (u 
= 0; u < hn; u ++) { fpr ib; @@ -1023,11 +573,9 @@ Zf(poly_div_autoadj_fft)( a[u] = fpr_mul(a[u], ib); a[u + hn] = fpr_mul(a[u + hn], ib); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_LDL_fft)( const fpr *restrict g00, @@ -1037,63 +585,6 @@ Zf(poly_LDL_fft)( n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&g11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&g11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&g01[u].v, mu_re); - _mm256_storeu_pd(&g01[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, - mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(g11[u], g11[u + hn], - g11_re, g11_im, g01_re, g01_im); - g01[u] = mu_re; - g01[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 for (u = 0; u < hn; u ++) { fpr g00_re, g00_im, g01_re, 
g01_im, g11_re, g11_im; fpr mu_re, mu_im; @@ -1110,11 +601,9 @@ Zf(poly_LDL_fft)( g01[u] = mu_re; g01[u + hn] = fpr_neg(mu_im); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_LDLmv_fft)( fpr *restrict d11, fpr *restrict l10, @@ -1125,63 +614,6 @@ Zf(poly_LDLmv_fft)( n = (size_t)1 << logn; hn = n >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d one; - - one = _mm256_set1_pd(1.0); - for (u = 0; u < hn; u += 4) { - __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - __m256d t, mu_re, mu_im, xi_re, xi_im; - - g00_re = _mm256_loadu_pd(&g00[u].v); - g00_im = _mm256_loadu_pd(&g00[u + hn].v); - g01_re = _mm256_loadu_pd(&g01[u].v); - g01_im = _mm256_loadu_pd(&g01[u + hn].v); - g11_re = _mm256_loadu_pd(&g11[u].v); - g11_im = _mm256_loadu_pd(&g11[u + hn].v); - - t = _mm256_div_pd(one, - FMADD(g00_re, g00_re, - _mm256_mul_pd(g00_im, g00_im))); - g00_re = _mm256_mul_pd(g00_re, t); - g00_im = _mm256_mul_pd(g00_im, t); - mu_re = FMADD(g01_re, g00_re, - _mm256_mul_pd(g01_im, g00_im)); - mu_im = FMSUB(g01_re, g00_im, - _mm256_mul_pd(g01_im, g00_re)); - xi_re = FMSUB(mu_re, g01_re, - _mm256_mul_pd(mu_im, g01_im)); - xi_im = FMADD(mu_im, g01_re, - _mm256_mul_pd(mu_re, g01_im)); - _mm256_storeu_pd(&d11[u].v, - _mm256_sub_pd(g11_re, xi_re)); - _mm256_storeu_pd(&d11[u + hn].v, - _mm256_add_pd(g11_im, xi_im)); - _mm256_storeu_pd(&l10[u].v, mu_re); - _mm256_storeu_pd(&l10[u + hn].v, mu_im); - } - } else { - for (u = 0; u < hn; u ++) { - fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im; - fpr mu_re, mu_im; - - g00_re = g00[u]; - g00_im = g00[u + hn]; - g01_re = g01[u]; - g01_im = g01[u + hn]; - g11_re = g11[u]; - g11_im = g11[u + hn]; - FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im); - FPC_MUL(g01_re, g01_im, - mu_re, mu_im, g01_re, fpr_neg(g01_im)); - FPC_SUB(d11[u], d11[u + hn], - g11_re, g11_im, g01_re, g01_im); - l10[u] = mu_re; - l10[u + hn] = fpr_neg(mu_im); - } - } -#else // yyyAVX2+0 for (u = 0; u < hn; u ++) { fpr g00_re, g00_im, 
g01_re, g01_im, g11_re, g11_im; fpr mu_re, mu_im; @@ -1198,11 +630,9 @@ Zf(poly_LDLmv_fft)( l10[u] = mu_re; l10[u + hn] = fpr_neg(mu_im); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_split_fft)( fpr *restrict f0, fpr *restrict f1, @@ -1220,62 +650,6 @@ Zf(poly_split_fft)( hn = n >> 1; qn = hn >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 8) { - __m256d half, sv; - - half = _mm256_set1_pd(0.5); - sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0); - for (u = 0; u < qn; u += 2) { - __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt; - - ab_re = _mm256_loadu_pd(&f[(u << 1)].v); - ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v); - ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half); - ff0 = _mm256_permute4x64_pd(ff0, 0xD8); - _mm_storeu_pd(&f0[u].v, - _mm256_extractf128_pd(ff0, 0)); - _mm_storeu_pd(&f0[u + qn].v, - _mm256_extractf128_pd(ff0, 1)); - - ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half); - gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5); - ff3 = _mm256_hadd_pd( - _mm256_mul_pd(ff1, gmt), - _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv)); - ff3 = _mm256_permute4x64_pd(ff3, 0xD8); - _mm_storeu_pd(&f1[u].v, - _mm256_extractf128_pd(ff3, 0)); - _mm_storeu_pd(&f1[u + qn].v, - _mm256_extractf128_pd(ff3, 1)); - } - } else { - f0[0] = f[0]; - f1[0] = f[hn]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f[(u << 1) + 0]; - a_im = f[(u << 1) + 0 + hn]; - b_re = f[(u << 1) + 1]; - b_im = f[(u << 1) + 1 + hn]; - - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f0[u] = fpr_half(t_re); - f0[u + qn] = fpr_half(t_im); - - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - FPC_MUL(t_re, t_im, t_re, t_im, - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1])); - f1[u] = fpr_half(t_re); - f1[u + qn] = fpr_half(t_im); - } - } -#else // yyyAVX2+0 /* * We process complex values by pairs. 
For logn = 1, there is only * one complex value (the other one is the implicit conjugate), @@ -1305,11 +679,9 @@ Zf(poly_split_fft)( f1[u] = fpr_half(t_re); f1[u + qn] = fpr_half(t_im); } -#endif // yyyAVX2- } /* see inner.h */ -TARGET_AVX2 void Zf(poly_merge_fft)( fpr *restrict f, @@ -1321,71 +693,6 @@ Zf(poly_merge_fft)( hn = n >> 1; qn = hn >> 1; -#if FALCON_AVX2 // yyyAVX2+1 - if (n >= 16) { - for (u = 0; u < qn; u += 4) { - __m256d a_re, a_im, b_re, b_im, c_re, c_im; - __m256d gm1, gm2, g_re, g_im; - __m256d t_re, t_im, u_re, u_im; - __m256d tu1_re, tu2_re, tu1_im, tu2_im; - - a_re = _mm256_loadu_pd(&f0[u].v); - a_im = _mm256_loadu_pd(&f0[u + qn].v); - c_re = _mm256_loadu_pd(&f1[u].v); - c_im = _mm256_loadu_pd(&f1[u + qn].v); - - gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v); - gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v); - g_re = _mm256_unpacklo_pd(gm1, gm2); - g_im = _mm256_unpackhi_pd(gm1, gm2); - g_re = _mm256_permute4x64_pd(g_re, 0xD8); - g_im = _mm256_permute4x64_pd(g_im, 0xD8); - - b_re = FMSUB( - c_re, g_re, _mm256_mul_pd(c_im, g_im)); - b_im = FMADD( - c_re, g_im, _mm256_mul_pd(c_im, g_re)); - - t_re = _mm256_add_pd(a_re, b_re); - t_im = _mm256_add_pd(a_im, b_im); - u_re = _mm256_sub_pd(a_re, b_re); - u_im = _mm256_sub_pd(a_im, b_im); - - tu1_re = _mm256_unpacklo_pd(t_re, u_re); - tu2_re = _mm256_unpackhi_pd(t_re, u_re); - tu1_im = _mm256_unpacklo_pd(t_im, u_im); - tu2_im = _mm256_unpackhi_pd(t_im, u_im); - _mm256_storeu_pd(&f[(u << 1)].v, - _mm256_permute2f128_pd(tu1_re, tu2_re, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4].v, - _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31)); - _mm256_storeu_pd(&f[(u << 1) + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20)); - _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v, - _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31)); - } - } else { - f[0] = f0[0]; - f[hn] = f1[0]; - - for (u = 0; u < qn; u ++) { - fpr a_re, a_im, b_re, b_im; - fpr t_re, t_im; - - a_re = f0[u]; - a_im = f0[u + qn]; - FPC_MUL(b_re, 
b_im, f1[u], f1[u + qn], - fpr_gm_tab[((u + hn) << 1) + 0], - fpr_gm_tab[((u + hn) << 1) + 1]); - FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 0] = t_re; - f[(u << 1) + 0 + hn] = t_im; - FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); - f[(u << 1) + 1] = t_re; - f[(u << 1) + 1 + hn] = t_im; - } - } -#else // yyyAVX2+0 /* * An extra copy to handle the special case logn = 1. */ @@ -1408,5 +715,4 @@ Zf(poly_merge_fft)( f[(u << 1) + 1] = t_re; f[(u << 1) + 1 + hn] = t_im; } -#endif // yyyAVX2- } diff --git a/fpr.c b/fpr.c index eb23a44..c6fbc1c 100644 --- a/fpr.c +++ b/fpr.c @@ -34,8 +34,6 @@ #include "inner.h" -#if FALCON_FPEMU // yyyFPEMU+1 - /* * Normalize a provided unsigned integer to the 2^63..2^64-1 range by * left-shifting it if necessary. The exponent e is adjusted accordingly @@ -78,100 +76,6 @@ (e) += (int)(nt); \ } while (0) -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_scaled(int64_t i __attribute__((unused)), int sc __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, lr }\n\t" - "\n\t" - "@ Input i is in r0:r1, and sc in r2.\n\t" - "@ Extract the sign bit, and compute the absolute value.\n\t" - "@ -> sign bit in r3, with value 0 or -1\n\t" - "asrs r3, r1, #31\n\t" - "eors r0, r3\n\t" - "eors r1, r3\n\t" - "subs r0, r3\n\t" - "sbcs r1, r3\n\t" - "\n\t" - "@ Scale exponent to account for the encoding; if the source is\n\t" - "@ zero or if the scaled exponent is negative, it is set to 32.\n\t" - "addw r2, r2, #1022\n\t" - "orrs r4, r0, r1\n\t" - "bics r4, r4, r2, asr #31\n\t" - "rsbs r5, r4, #0\n\t" - "orrs r4, r5\n\t" - "ands r2, r2, r4, asr #31\n\t" - "adds r2, #32\n\t" - "\n\t" - "@ Normalize value to a full 64-bit width, by shifting it left.\n\t" - "@ The shift count is subtracted from the exponent (in r2).\n\t" - "@ If the mantissa is 0, the exponent is set to 0.\n\t" - "\n\t" - "@ If top word is 0, replace with low word; otherwise, add 32 to\n\t" - "@ the exponent.\n\t" - "rsbs r4, r1, #0\n\t" - 
"orrs r4, r1\n\t" - "eors r5, r0, r1\n\t" - "bics r5, r5, r4, asr #31\n\t" - "eors r1, r5\n\t" - "ands r0, r0, r4, asr #31\n\t" - "lsrs r4, r4, #31\n\t" - "adds r2, r2, r4, lsl #5\n\t" - "\n\t" - "@ Count leading zeros of r1 to finish the shift.\n\t" - "clz r4, r1\n\t" - "subs r2, r4\n\t" - "rsbs r5, r4, #32\n\t" - "lsls r1, r4\n\t" - "lsrs r5, r0, r5\n\t" - "lsls r0, r4\n\t" - "orrs r1, r5\n\t" - "\n\t" - "@ Clear the top bit; we know it's a 1 (unless the whole mantissa\n\t" - "@ was zero, but then it's still OK to clear it)\n\t" - "bfc r1, #31, #1\n\t" - "\n\t" - "@ Now shift right the value by 11 bits; this puts the value in\n\t" - "@ the 2^52..2^53-1 range. We also keep a copy of the pre-shift\n\t" - "@ low bits in r5.\n\t" - "movs r5, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Also plug the exponent at the right place. This must be done\n\t" - "@ now so that, in case the rounding creates a carry, that carry\n\t" - "@ adds to the exponent, which would be exactly what we want at\n\t" - "@ that point.\n\t" - "orrs r1, r1, r2, lsl #20\n\t" - "\n\t" - "@ Rounding: we must add 1 to the mantissa in the following cases:\n\t" - "@ - bits 11 to 9 of r5 are '011', '110' or '111'\n\t" - "@ - bits 11 to 9 of r5 are '010' and one of the\n\t" - "@ bits 0 to 8 is non-zero\n\t" - "ubfx r6, r5, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r5, r6\n\t" - "\n\t" - "ubfx r5, r5, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@ Put back the sign.\n\t" - "orrs r1, r1, r3, lsl #31\n\t" - "\n\t" - "pop { r4, r5, r6, pc}\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - fpr fpr_scaled(int64_t i, int sc) { @@ -228,248 +132,6 @@ fpr_scaled(int64_t i, int sc) return FPR(s, e, m); } -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -// yyyPQCLEAN+0 -#if 0 -/* Debug code -- To get a printout of registers from a specific point - in ARM 
Cortex M4 assembly code, uncomment this code and add a - "bl DEBUG" call where wished for. */ - -void -print_regs(uint32_t *rr, uint32_t flags) -{ - int i; - extern int printf(const char *fmt, ...); - - printf("\nRegs:\n"); - for (i = 0; i < 7; i ++) { - int j; - - j = i + 7; - printf(" %2d = %08X %2d = %08X\n", i, rr[i], j, rr[j]); - } - printf(" flags = %08X ", flags); - if ((flags >> 31) & 1) { - printf("N"); - } - if ((flags >> 30) & 1) { - printf("Z"); - } - if ((flags >> 29) & 1) { - printf("C"); - } - if ((flags >> 28) & 1) { - printf("V"); - } - if ((flags >> 27) & 1) { - printf("Q"); - } - printf("\n"); -} - -__attribute__((naked)) -void -DEBUG(void) -{ - __asm__ ( - "push { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr }\n\t" - "mov r0, sp\n\t" - "mrs r1, apsr\n\t" - "bl print_regs\n\t" - "pop { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc }\n\t" - ); -} -#endif -// yyyPQCLEAN- - -__attribute__((naked)) -fpr -fpr_add(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Make sure that the first operand (x) has the larger absolute\n\t" - "@ value. 
This guarantees that the exponent of y is less than\n\t" - "@ or equal to the exponent of x, and, if they are equal, then\n\t" - "@ the mantissa of y will not be greater than the mantissa of x.\n\t" - "@ However, if absolute values are equal and the sign of x is 1,\n\t" - "@ then we want to also swap the values.\n\t" - "ubfx r4, r1, #0, #31 @ top word without sign bit\n\t" - "ubfx r5, r3, #0, #31 @ top word without sign bit\n\t" - "subs r7, r0, r2 @ difference in r7:r4\n\t" - "sbcs r4, r5\n\t" - "orrs r7, r4\n\t" - "rsbs r5, r7, #0\n\t" - "orrs r7, r5 @ bit 31 of r7 is 0 iff difference is zero\n\t" - "bics r6, r1, r7\n\t" - "orrs r6, r4 @ bit 31 of r6 is 1 iff the swap must be done\n\t" - "\n\t" - "@ Conditional swap\n\t" - "eors r4, r0, r2\n\t" - "eors r5, r1, r3\n\t" - "ands r4, r4, r6, asr #31\n\t" - "ands r5, r5, r6, asr #31\n\t" - "eors r0, r4\n\t" - "eors r1, r5\n\t" - "eors r2, r4\n\t" - "eors r3, r5\n\t" - "\n\t" - "@ Extract mantissa of x into r0:r1, exponent in r4, sign in r5\n\t" - "ubfx r4, r1, #20, #11 @ Exponent in r4 (without sign)\n\t" - "addw r5, r4, #2047 @ Get a carry to test r4 for zero\n\t" - "lsrs r5, #11 @ r5 is the mantissa implicit high bit\n\t" - "bfc r1, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r1, r1, r5, lsl #20 @ Set mantissa high bit\n\t" - "asrs r5, r1, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r1, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Extract mantissa of y into r2:r3, exponent in r6, sign in r7\n\t" - "ubfx r6, r3, #20, #11 @ Exponent in r6 (without sign)\n\t" - "addw r7, r6, #2047 @ Get a carry to test r6 for zero\n\t" - "lsrs r7, #11 @ r7 is the mantissa implicit high bit\n\t" - "bfc r3, #20, #11 @ Clear exponent bits (not the sign)\n\t" - "orrs r3, r3, r7, lsl #20 @ Set mantissa high bit\n\t" - "asrs r7, r3, #31 @ Get sign bit (sign-extended)\n\t" - "bfc r3, #31, #1 @ Clear the sign bit\n\t" - "\n\t" - "@ Scale mantissas up by three bits.\n\t" - "lsls r1, #3\n\t" - "orrs r1, r1, r0, lsr #29\n\t" - 
"lsls r0, #3\n\t" - "lsls r3, #3\n\t" - "orrs r3, r3, r2, lsr #29\n\t" - "lsls r2, #3\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: exponent=r6, sign=r7, mantissa=r2:r3 (scaled up 3 bits)\n\t" - "\n\t" - "@ At that point, the exponent of x (in r4) is larger than that\n\t" - "@ of y (in r6). The difference is the amount of shifting that\n\t" - "@ should be done on y. If that amount is larger than 59 then\n\t" - "@ we clamp y to 0. We won't need y's exponent beyond that point,\n\t" - "@ so we store that shift count in r6.\n\t" - "subs r6, r4, r6\n\t" - "subs r8, r6, #60\n\t" - "ands r2, r2, r8, asr #31\n\t" - "ands r3, r3, r8, asr #31\n\t" - "\n\t" - "@ Shift right r2:r3 by r6 bits. The shift count is in the 0..59\n\t" - "@ range. r11 will be non-zero if and only if some non-zero bits\n\t" - "@ were dropped.\n\t" - "subs r8, r6, #32\n\t" - "bics r11, r2, r8, asr #31\n\t" - "ands r2, r2, r8, asr #31\n\t" - "bics r10, r3, r8, asr #31\n\t" - "orrs r2, r2, r10\n\t" - "ands r3, r3, r8, asr #31\n\t" - "ands r6, r6, #31\n\t" - "rsbs r8, r6, #32\n\t" - "lsls r10, r2, r8\n\t" - "orrs r11, r11, r10\n\t" - "lsrs r2, r2, r6\n\t" - "lsls r10, r3, r8\n\t" - "orrs r2, r2, r10\n\t" - "lsrs r3, r3, r6\n\t" - "\n\t" - "@ If r11 is non-zero then some non-zero bit was dropped and the\n\t" - "@ low bit of r2 must be forced to 1 ('sticky bit').\n\t" - "rsbs r6, r11, #0\n\t" - "orrs r6, r6, r11\n\t" - "orrs r2, r2, r6, lsr #31\n\t" - "\n\t" - "@ x: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "@ y: sign=r7, value=r2:r3 (scaled to same exponent as x)\n\t" - "\n\t" - "@ If x and y don't have the same sign, then we should negate r2:r3\n\t" - "@ (i.e. subtract the mantissa instead of adding it). Signs of x\n\t" - "@ and y are in r5 and r7, as full-width words. 
We won't need r7\n\t" - "@ afterwards.\n\t" - "eors r7, r5 @ r7 = -1 if y must be negated, 0 otherwise\n\t" - "eors r2, r7\n\t" - "eors r3, r7\n\t" - "subs r2, r7\n\t" - "sbcs r3, r7\n\t" - "\n\t" - "@ r2:r3 has been shifted, we can add to r0:r1.\n\t" - "adds r0, r2\n\t" - "adcs r1, r3\n\t" - "\n\t" - "@ result: exponent=r4, sign=r5, mantissa=r0:r1 (scaled up 3 bits)\n\t" - "\n\t" - "@ Normalize the result with some left-shifting to full 64-bit\n\t" - "@ width. Shift count goes to r2, and exponent (r4) is adjusted.\n\t" - "clz r2, r0\n\t" - "clz r3, r1\n\t" - "sbfx r6, r3, #5, #1\n\t" - "ands r2, r6\n\t" - "adds r2, r2, r3\n\t" - "subs r4, r4, r2\n\t" - "\n\t" - "@ Shift r0:r1 to the left by r2 bits.\n\t" - "subs r7, r2, #32\n\t" - "lsls r7, r0, r7\n\t" - "lsls r1, r1, r2\n\t" - "rsbs r6, r2, #32\n\t" - "orrs r1, r1, r7\n\t" - "lsrs r6, r0, r6\n\t" - "orrs r1, r1, r6\n\t" - "lsls r0, r0, r2\n\t" - "\n\t" - "@ The exponent of x was in r4. The left-shift operation has\n\t" - "@ subtracted some value from it, 8 in case the result has the\n\t" - "@ same exponent as x. However, the high bit of the mantissa will\n\t" - "@ add 1 to the exponent, so we only add back 7 (the exponent is\n\t" - "@ added in because rounding might have produced a carry, which\n\t" - "@ should then spill into the exponent).\n\t" - "adds r4, #7\n\t" - "\n\t" - "@ If the mantissa new mantissa is non-zero, then its bit 63 is\n\t" - "@ non-zero (thanks to the normalizing shift). Otherwise, that bit\n\t" - "@ is zero, and we should then set the exponent to zero as well.\n\t" - "ands r4, r4, r1, asr #31\n\t" - "\n\t" - "@ Shrink back the value to a 52-bit mantissa. 
This requires\n\t" - "@ right-shifting by 11 bits; we keep a copy of the pre-shift\n\t" - "@ low word in r3.\n\t" - "movs r3, r0\n\t" - "lsrs r0, #11\n\t" - "orrs r0, r0, r1, lsl #21\n\t" - "lsrs r1, #11\n\t" - "\n\t" - "@ Apply rounding.\n\t" - "ubfx r6, r3, #0, #9\n\t" - "addw r6, r6, #511\n\t" - "orrs r3, r6\n\t" - "ubfx r3, r3, #9, #3\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r3\n\t" - "ands r6, #1\n\t" - "adds r0, r6\n\t" - "adcs r1, #0\n\t" - "\n\t" - "@Plug in the exponent with an addition.\n\t" - "adds r1, r1, r4, lsl #20\n\t" - "\n\t" - "@ If the new exponent is negative or zero, then it underflowed\n\t" - "@ and we must clear the whole mantissa and exponent.\n\t" - "rsbs r4, r4, #0\n\t" - "ands r0, r0, r4, asr #31\n\t" - "ands r1, r1, r4, asr #31\n\t" - "\n\t" - "@ Put back the sign. This is the sign of x: thanks to the\n\t" - "@ conditional swap at the start, this is always correct.\n\t" - "bfi r1, r5, #31, #1\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - fpr fpr_add(fpr x, fpr y) { @@ -578,130 +240,6 @@ fpr_add(fpr x, fpr y) return FPR(sx, ex, xu); } -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_mul(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissas: x.m = r4:r5, y.m = r6:r7\n\t" - "@ r4 and r6 contain only 25 bits each.\n\t" - "bics r4, r0, #0xFE000000\n\t" - "lsls r5, r1, #7\n\t" - "orrs r5, r5, r0, lsr #25\n\t" - "orrs r5, r5, #0x08000000\n\t" - "bics r5, r5, #0xF0000000\n\t" - "bics r6, r2, #0xFE000000\n\t" - "lsls r7, r3, #7\n\t" - "orrs r7, r7, r2, lsr #25\n\t" - "orrs r7, r7, #0x08000000\n\t" - "bics r7, r7, #0xF0000000\n\t" - "\n\t" - "@ Perform product. Values are in the 2^52..2^53-1 range, so\n\t" - "@ the product is at most 106-bit long. Of the low 50 bits,\n\t" - "@ we only want to know if they are all zeros or not. 
Here,\n\t" - "@ we get the top 56 bits in r10:r11, and r8 will be non-zero\n\t" - "@ if and only if at least one of the low 50 bits is non-zero.\n\t" - "umull r8, r10, r4, r6 @ x0*y0\n\t" - "lsls r10, #7\n\t" - "orrs r10, r10, r8, lsr #25\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r4, r7 @ x0*y1\n\t" - "umlal r10, r11, r5, r6 @ x1*y0\n\t" - "orrs r8, r8, r10, lsl #7\n\t" - "lsrs r10, #25\n\t" - "orrs r10, r10, r11, lsl #7\n\t" - "eors r11, r11\n\t" - "umlal r10, r11, r5, r7 @ x1*y1\n\t" - "\n\t" - "@ Now r0, r2, r4, r5, r6 and r7 are free.\n\t" - "@ If any of the low 50 bits was non-zero, then we force the\n\t" - "@ low bit of r10 to 1.\n\t" - "rsbs r4, r8, #0\n\t" - "orrs r8, r8, r4\n\t" - "orrs r10, r10, r8, lsr #31\n\t" - "\n\t" - "@ r8 is free.\n\t" - "@ r10:r11 contains the product in the 2^54..2^56-1 range. We\n\t" - "@ normalize it to 2^54..2^55-1 (into r6:r7) with a conditional\n\t" - "@ shift (low bit is sticky). r5 contains -1 if the shift was done,\n\t" - "@ 0 otherwise.\n\t" - "ands r6, r10, #1\n\t" - "lsrs r5, r11, #23\n\t" - "rsbs r5, r5, #0\n\t" - "orrs r6, r6, r10, lsr #1\n\t" - "orrs r6, r6, r11, lsl #31\n\t" - "lsrs r7, r11, #1\n\t" - "eors r10, r10, r6\n\t" - "eors r11, r11, r7\n\t" - "bics r10, r10, r5\n\t" - "bics r11, r11, r5\n\t" - "eors r6, r6, r10\n\t" - "eors r7, r7, r11\n\t" - "\n\t" - "@ Compute aggregate exponent: ex + ey - 1023 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, #21\n\t" - "addw r4, r0, #0x7FF @ save ex + 2047 in r4\n\t" - "lsrs r2, #21\n\t" - "addw r8, r2, #0x7FF @ save ey + 2047 in r8\n\t" - "adds r2, r0\n\t" - "subw r2, r2, #1024\n\t" - "subs r2, r5\n\t" - "\n\t" - "@ r5 is free.\n\t" - "@ Also, if either of the source exponents is 0, or the result\n\t" - "@ exponent is 0 or negative, then the result is zero 
and the\n\t" - "@ mantissa and the exponent shall be clamped to zero. Since\n\t" - "@ r2 contains the result exponent minus 1, we test on r2\n\t" - "@ being strictly negative.\n\t" - "ands r4, r8 @ if bit 11 = 0 then one of the exponents was 0\n\t" - "mvns r5, r2\n\t" - "ands r5, r5, r4, lsl #20\n\t" - "ands r2, r2, r5, asr #31\n\t" - "ands r6, r6, r5, asr #31\n\t" - "ands r7, r7, r5, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ r2 and r3 are free.\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r4, r6\n\t" - "lsrs r0, r6, #2\n\t" - "orrs r0, r0, r7, lsl #30\n\t" - "adds r1, r1, r7, lsr #2\n\t" - "ands r4, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r4\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - fpr fpr_mul(fpr x, fpr y) { @@ -798,145 +336,6 @@ fpr_mul(fpr x, fpr y) return FPR(s, e, zu); } -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_div(fpr x __attribute__((unused)), fpr y __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - - "@ Extract mantissas of x and y, in r0:r4 and r2:r5, respectively.\n\t" - "@ We don't touch r1 and r3 as they contain the exponents and\n\t" - "@ signs, which we'll need later on.\n\t" - "ubfx r4, r1, #0, #20\n\t" - "ubfx r5, r3, #0, #20\n\t" - "orrs r4, r4, #0x00100000\n\t" - "orrs r5, r5, #0x00100000\n\t" - "\n\t" - "@ Perform bit-by-bit division. 
We want a 56-bit result in r8:r10\n\t" - "@ (low bit is 0). Bits come from the carry flag and are\n\t" - "@ injected with rrx, i.e. in position 31; we thus get bits in\n\t" - "@ the reverse order. Bits accumulate in r8; after the first 24\n\t" - "@ bits, we move the quotient bits to r10.\n\t" - "eors r8, r8\n\t" - "\n\t" - -#define DIVSTEP \ - "subs r6, r0, r2\n\t" \ - "sbcs r7, r4, r5\n\t" \ - "rrx r8, r8\n\t" \ - "ands r6, r2, r8, asr #31\n\t" \ - "ands r7, r5, r8, asr #31\n\t" \ - "subs r0, r6\n\t" \ - "sbcs r4, r7\n\t" \ - "adds r0, r0, r0\n\t" \ - "adcs r4, r4, r4\n\t" - -#define DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP DIVSTEP -#define DIVSTEP8 DIVSTEP4 DIVSTEP4 - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - - "\n\t" - "@ We have the first 24 bits of the quotient, move them to r10.\n\t" - "rbit r10, r8\n\t" - "\n\t" - - DIVSTEP8 - DIVSTEP8 - DIVSTEP8 - DIVSTEP4 DIVSTEP DIVSTEP DIVSTEP - -#undef DIVSTEP -#undef DIVSTEP4 -#undef DIVSTEP8 - - "\n\t" - "@ Lowest bit will be set if remainder is non-zero at this point\n\t" - "@ (this is the 'sticky' bit).\n\t" - "subs r0, #1\n\t" - "sbcs r4, #0\n\t" - "rrx r8, r8\n\t" - "\n\t" - "@ We now have the next (low) 32 bits of the quotient.\n\t" - "rbit r8, r8\n\t" - "\n\t" - "@ Since both operands had their top bit set, we know that the\n\t" - "@ result at this point is in 2^54..2^56-1. We scale it down\n\t" - "@ to 2^54..2^55-1 with a conditional shift. We also write the\n\t" - "@ result in r4:r5. 
If the shift is done, r6 will contain -1.\n\t" - "ands r4, r8, #1\n\t" - "lsrs r6, r10, #23\n\t" - "rsbs r6, r6, #0\n\t" - "orrs r4, r4, r8, lsr #1\n\t" - "orrs r4, r4, r10, lsl #31\n\t" - "lsrs r5, r10, #1\n\t" - "eors r8, r8, r4\n\t" - "eors r10, r10, r5\n\t" - "bics r8, r8, r6\n\t" - "bics r10, r10, r6\n\t" - "eors r4, r4, r8\n\t" - "eors r5, r5, r10\n\t" - "\n\t" - "@ Compute aggregate exponent: ex - ey + 1022 + w\n\t" - "@ (where w = 1 if the conditional shift was done, 0 otherwise)\n\t" - "@ But we subtract 1 because the injection of the mantissa high\n\t" - "@ bit will increment the exponent by 1.\n\t" - "lsls r0, r1, #1\n\t" - "lsls r2, r3, #1\n\t" - "lsrs r0, r0, #21\n\t" - "addw r7, r0, #0x7FF @ save ex + 2047 in r7\n\t" - "subs r0, r0, r2, lsr #21\n\t" - "addw r0, r0, #1021\n\t" - "subs r0, r6\n\t" - "\n\t" - "@ If the x operand was zero, then the computation was wrong and\n\t" - "@ the result is zero. Also, if the result exponent is zero or\n\t" - "@ negative, then the mantissa shall be clamped to zero. Since r0\n\t" - "@ contains the result exponent minus 1, we test on r0 being\n\t" - "@ strictly negative.\n\t" - "mvns r2, r0\n\t" - "ands r2, r2, r7, lsl #20\n\t" - "ands r0, r0, r2, asr #31\n\t" - "ands r4, r4, r2, asr #31\n\t" - "ands r5, r5, r2, asr #31\n\t" - "\n\t" - "@ Sign is the XOR of the sign of the operands. This is true in\n\t" - "@ all cases, including very small results (exponent underflow)\n\t" - "@ and zeros.\n\t" - "eors r1, r3\n\t" - "bfc r1, #0, #31\n\t" - "\n\t" - "@ Plug in the exponent.\n\t" - "bfi r1, r0, #20, #11\n\t" - "\n\t" - "@ Shift back to the normal 53-bit mantissa, with rounding.\n\t" - "@ Mantissa goes into r0:r1. 
For r1, we must use an addition\n\t" - "@ because the rounding may have triggered a carry, that should\n\t" - "@ be added to the exponent.\n\t" - "movs r6, r4\n\t" - "lsrs r0, r4, #2\n\t" - "orrs r0, r0, r5, lsl #30\n\t" - "adds r1, r1, r5, lsr #2\n\t" - "ands r6, #0x7\n\t" - "movs r3, #0xC8\n\t" - "lsrs r3, r6\n\t" - "ands r3, #1\n\t" - "adds r0, r3\n\t" - "adcs r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - fpr fpr_div(fpr x, fpr y) { @@ -1024,186 +423,6 @@ fpr_div(fpr x, fpr y) return FPR(s, e, q); } -#endif // yyyASM_CORTEXM4- - -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -fpr -fpr_sqrt(fpr x __attribute__((unused))) -{ - __asm__ ( - "push { r4, r5, r6, r7, r8, r10, r11, lr }\n\t" - "\n\t" - "@ Extract mantissa (r0:r1) and exponent (r2). We assume that the\n\t" - "@ sign is positive. If the source is zero, then the mantissa is\n\t" - "@ set to 0.\n\t" - "lsrs r2, r1, #20\n\t" - "bfc r1, #20, #12\n\t" - "addw r3, r2, #0x7FF\n\t" - "subw r2, r2, #1023\n\t" - "lsrs r3, r3, #11\n\t" - "orrs r1, r1, r3, lsl #20\n\t" - "\n\t" - "@ If the exponent is odd, then multiply mantissa by 2 and subtract\n\t" - "@ 1 from the exponent.\n\t" - "ands r3, r2, #1\n\t" - "subs r2, r2, r3\n\t" - "rsbs r3, r3, #0\n\t" - "ands r4, r1, r3\n\t" - "ands r3, r0\n\t" - "adds r0, r3\n\t" - "adcs r1, r4\n\t" - "\n\t" - "@ Left-shift the mantissa by 9 bits to put it in the\n\t" - "@ 2^61..2^63-1 range (unless it is exactly 0).\n\t" - "lsls r1, r1, #9\n\t" - "orrs r1, r1, r0, lsr #23\n\t" - "lsls r0, r0, #9\n\t" - "\n\t" - "@ Compute the square root bit-by-bit.\n\t" - "@ There are 54 iterations; first 30 can work on top word only.\n\t" - "@ q = r3 (bit-reversed)\n\t" - "@ s = r5\n\t" - "eors r3, r3\n\t" - "eors r5, r5\n\t" - -#define SQRT_STEP_HI(bit) \ - "orrs r6, r5, #(1 << (" #bit "))\n\t" \ - "subs r7, r1, r6\n\t" \ - "rrx r3, r3\n\t" \ - "ands r6, r6, r3, asr #31\n\t" \ - "subs r1, r1, r6\n\t" \ - 
"lsrs r6, r3, #31\n\t" \ - "orrs r5, r5, r6, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_HIx5(b) \ - SQRT_STEP_HI((b)+4) \ - SQRT_STEP_HI((b)+3) \ - SQRT_STEP_HI((b)+2) \ - SQRT_STEP_HI((b)+1) \ - SQRT_STEP_HI(b) - - SQRT_STEP_HIx5(25) - SQRT_STEP_HIx5(20) - SQRT_STEP_HIx5(15) - SQRT_STEP_HIx5(10) - SQRT_STEP_HIx5(5) - SQRT_STEP_HIx5(0) - -#undef SQRT_STEP_HI -#undef SQRT_STEP_HIx5 - - "@ Top 30 bits of the result must be reversed: they were\n\t" - "@ accumulated with rrx (hence from the top bit).\n\t" - "rbit r3, r3\n\t" - "\n\t" - "@ For the next 24 iterations, we must use two-word operations.\n\t" - "@ bits of q now accumulate in r4\n\t" - "@ s is in r6:r5\n\t" - "eors r4, r4\n\t" - "eors r6, r6\n\t" - "\n\t" - "@ First iteration is special because the potential bit goes into\n\t" - "@ r5, not r6.\n\t" - "orrs r7, r6, #(1 << 31)\n\t" - "subs r8, r0, r7\n\t" - "sbcs r10, r1, r5\n\t" - "rrx r4, r4\n\t" - "ands r7, r7, r4, asr #31\n\t" - "ands r8, r5, r4, asr #31\n\t" - "subs r0, r0, r7\n\t" - "sbcs r1, r1, r8\n\t" - "lsrs r7, r4, #31\n\t" - "orrs r5, r5, r4, lsr #31\n\t" - "adds r0, r0\n\t" - "adcs r1, r1\n\t" - -#define SQRT_STEP_LO(bit) \ - "orrs r7, r6, #(1 << (" #bit "))\n\t" \ - "subs r8, r0, r7\n\t" \ - "sbcs r10, r1, r5\n\t" \ - "rrx r4, r4\n\t" \ - "ands r7, r7, r4, asr #31\n\t" \ - "ands r8, r5, r4, asr #31\n\t" \ - "subs r0, r0, r7\n\t" \ - "sbcs r1, r1, r8\n\t" \ - "lsrs r7, r4, #31\n\t" \ - "orrs r6, r6, r7, lsl #((" #bit ") + 1)\n\t" \ - "adds r0, r0\n\t" \ - "adcs r1, r1\n\t" - -#define SQRT_STEP_LOx4(b) \ - SQRT_STEP_LO((b)+3) \ - SQRT_STEP_LO((b)+2) \ - SQRT_STEP_LO((b)+1) \ - SQRT_STEP_LO(b) - - SQRT_STEP_LO(30) - SQRT_STEP_LO(29) - SQRT_STEP_LO(28) - SQRT_STEP_LOx4(24) - SQRT_STEP_LOx4(20) - SQRT_STEP_LOx4(16) - SQRT_STEP_LOx4(12) - SQRT_STEP_LOx4(8) - -#undef SQRT_STEP_LO -#undef SQRT_STEP_LOx4 - - "@ Put low 24 bits in the right order.\n\t" - "rbit r4, r4\n\t" - "\n\t" - "@ We have a 54-bit 
result; compute the 55-th bit as the 'sticky'\n\t" - "@ bit: it is non-zero if and only if r0:r1 is non-zero. We put the\n\t" - "@ three low bits (including the sticky bit) in r5.\n\t" - "orrs r0, r1\n\t" - "rsbs r1, r0, #0\n\t" - "orrs r0, r1\n\t" - "lsls r5, r4, #1\n\t" - "orrs r5, r5, r0, lsr #31\n\t" - "ands r5, #0x7\n\t" - "\n\t" - "@ Compute the rounding: r6 is set to 0 or 1, and will be added\n\t" - "@ to the mantissa.\n\t" - "movs r6, #0xC8\n\t" - "lsrs r6, r5\n\t" - "ands r6, #1\n\t" - "\n\t" - "@ Put the mantissa (53 bits, in the 2^52..2^53-1 range) in r0:r1\n\t" - "@ (rounding not applied yet).\n\t" - "lsrs r0, r4, #1\n\t" - "orrs r0, r0, r3, lsl #23\n\t" - "lsrs r1, r3, #9\n\t" - "\n\t" - "@ Compute new exponent. This is half the old one (then reencoded\n\t" - "@ by adding 1023). Exception: if the mantissa is zero, then the\n\t" - "@ encoded exponent is set to 0. At that point, if the mantissa\n\t" - "@ is non-zero, then its high bit (bit 52, i.e. bit 20 of r1) is\n\t" - "@ non-zero. Note that the exponent cannot go out of range.\n\t" - "lsrs r2, r2, #1\n\t" - "addw r2, r2, #1023\n\t" - "lsrs r5, r1, #20\n\t" - "rsbs r5, r5, #0\n\t" - "ands r2, r5\n\t" - "\n\t" - "@ Place exponent. This overwrites the high bit of the mantissa.\n\t" - "bfi r1, r2, #20, #11\n\t" - "\n\t" - "@ Apply rounding. This may create a carry that will spill into\n\t" - "@ the exponent, which is exactly what should be done in that case\n\t" - "@ (i.e. 
increment the exponent).\n\t" - "adds r0, r0, r6\n\t" - "adcs r1, r1, #0\n\t" - "\n\t" - "pop { r4, r5, r6, r7, r8, r10, r11, pc }\n\t" - ); -} - -#else // yyyASM_CORTEXM4+0 - fpr fpr_sqrt(fpr x) { @@ -1281,8 +500,6 @@ fpr_sqrt(fpr x) return FPR(0, e, q); } -#endif // yyyASM_CORTEXM4- - uint64_t fpr_expm_p63(fpr x, fpr ccs) { @@ -2409,1052 +1626,3 @@ const fpr fpr_p2_tab[] = { 4571153621781053440U, 4566650022153682944U }; - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -const fpr fpr_gm_tab[] = { - {0}, {0}, /* unused */ - {-0.000000000000000000000000000}, { 1.000000000000000000000000000}, - { 0.707106781186547524400844362}, { 0.707106781186547524400844362}, - {-0.707106781186547524400844362}, { 0.707106781186547524400844362}, - { 0.923879532511286756128183189}, { 0.382683432365089771728459984}, - {-0.382683432365089771728459984}, { 0.923879532511286756128183189}, - { 0.382683432365089771728459984}, { 0.923879532511286756128183189}, - {-0.923879532511286756128183189}, { 0.382683432365089771728459984}, - { 0.980785280403230449126182236}, { 0.195090322016128267848284868}, - {-0.195090322016128267848284868}, { 0.980785280403230449126182236}, - { 0.555570233019602224742830814}, { 0.831469612302545237078788378}, - {-0.831469612302545237078788378}, { 0.555570233019602224742830814}, - { 0.831469612302545237078788378}, { 0.555570233019602224742830814}, - {-0.555570233019602224742830814}, { 0.831469612302545237078788378}, - { 0.195090322016128267848284868}, { 0.980785280403230449126182236}, - {-0.980785280403230449126182236}, { 0.195090322016128267848284868}, - { 0.995184726672196886244836953}, { 0.098017140329560601994195564}, - {-0.098017140329560601994195564}, { 0.995184726672196886244836953}, - { 0.634393284163645498215171613}, { 0.773010453362736960810906610}, - {-0.773010453362736960810906610}, { 0.634393284163645498215171613}, - { 0.881921264348355029712756864}, { 0.471396736825997648556387626}, - {-0.471396736825997648556387626}, { 
0.881921264348355029712756864}, - { 0.290284677254462367636192376}, { 0.956940335732208864935797887}, - {-0.956940335732208864935797887}, { 0.290284677254462367636192376}, - { 0.956940335732208864935797887}, { 0.290284677254462367636192376}, - {-0.290284677254462367636192376}, { 0.956940335732208864935797887}, - { 0.471396736825997648556387626}, { 0.881921264348355029712756864}, - {-0.881921264348355029712756864}, { 0.471396736825997648556387626}, - { 0.773010453362736960810906610}, { 0.634393284163645498215171613}, - {-0.634393284163645498215171613}, { 0.773010453362736960810906610}, - { 0.098017140329560601994195564}, { 0.995184726672196886244836953}, - {-0.995184726672196886244836953}, { 0.098017140329560601994195564}, - { 0.998795456205172392714771605}, { 0.049067674327418014254954977}, - {-0.049067674327418014254954977}, { 0.998795456205172392714771605}, - { 0.671558954847018400625376850}, { 0.740951125354959091175616897}, - {-0.740951125354959091175616897}, { 0.671558954847018400625376850}, - { 0.903989293123443331586200297}, { 0.427555093430282094320966857}, - {-0.427555093430282094320966857}, { 0.903989293123443331586200297}, - { 0.336889853392220050689253213}, { 0.941544065183020778412509403}, - {-0.941544065183020778412509403}, { 0.336889853392220050689253213}, - { 0.970031253194543992603984207}, { 0.242980179903263889948274162}, - {-0.242980179903263889948274162}, { 0.970031253194543992603984207}, - { 0.514102744193221726593693839}, { 0.857728610000272069902269984}, - {-0.857728610000272069902269984}, { 0.514102744193221726593693839}, - { 0.803207531480644909806676513}, { 0.595699304492433343467036529}, - {-0.595699304492433343467036529}, { 0.803207531480644909806676513}, - { 0.146730474455361751658850130}, { 0.989176509964780973451673738}, - {-0.989176509964780973451673738}, { 0.146730474455361751658850130}, - { 0.989176509964780973451673738}, { 0.146730474455361751658850130}, - {-0.146730474455361751658850130}, { 0.989176509964780973451673738}, - { 
0.595699304492433343467036529}, { 0.803207531480644909806676513}, - {-0.803207531480644909806676513}, { 0.595699304492433343467036529}, - { 0.857728610000272069902269984}, { 0.514102744193221726593693839}, - {-0.514102744193221726593693839}, { 0.857728610000272069902269984}, - { 0.242980179903263889948274162}, { 0.970031253194543992603984207}, - {-0.970031253194543992603984207}, { 0.242980179903263889948274162}, - { 0.941544065183020778412509403}, { 0.336889853392220050689253213}, - {-0.336889853392220050689253213}, { 0.941544065183020778412509403}, - { 0.427555093430282094320966857}, { 0.903989293123443331586200297}, - {-0.903989293123443331586200297}, { 0.427555093430282094320966857}, - { 0.740951125354959091175616897}, { 0.671558954847018400625376850}, - {-0.671558954847018400625376850}, { 0.740951125354959091175616897}, - { 0.049067674327418014254954977}, { 0.998795456205172392714771605}, - {-0.998795456205172392714771605}, { 0.049067674327418014254954977}, - { 0.999698818696204220115765650}, { 0.024541228522912288031734529}, - {-0.024541228522912288031734529}, { 0.999698818696204220115765650}, - { 0.689540544737066924616730630}, { 0.724247082951466920941069243}, - {-0.724247082951466920941069243}, { 0.689540544737066924616730630}, - { 0.914209755703530654635014829}, { 0.405241314004989870908481306}, - {-0.405241314004989870908481306}, { 0.914209755703530654635014829}, - { 0.359895036534988148775104572}, { 0.932992798834738887711660256}, - {-0.932992798834738887711660256}, { 0.359895036534988148775104572}, - { 0.975702130038528544460395766}, { 0.219101240156869797227737547}, - {-0.219101240156869797227737547}, { 0.975702130038528544460395766}, - { 0.534997619887097210663076905}, { 0.844853565249707073259571205}, - {-0.844853565249707073259571205}, { 0.534997619887097210663076905}, - { 0.817584813151583696504920884}, { 0.575808191417845300745972454}, - {-0.575808191417845300745972454}, { 0.817584813151583696504920884}, - { 0.170961888760301226363642357}, { 
0.985277642388941244774018433}, - {-0.985277642388941244774018433}, { 0.170961888760301226363642357}, - { 0.992479534598709998156767252}, { 0.122410675199216198498704474}, - {-0.122410675199216198498704474}, { 0.992479534598709998156767252}, - { 0.615231590580626845484913563}, { 0.788346427626606262009164705}, - {-0.788346427626606262009164705}, { 0.615231590580626845484913563}, - { 0.870086991108711418652292404}, { 0.492898192229784036873026689}, - {-0.492898192229784036873026689}, { 0.870086991108711418652292404}, - { 0.266712757474898386325286515}, { 0.963776065795439866686464356}, - {-0.963776065795439866686464356}, { 0.266712757474898386325286515}, - { 0.949528180593036667195936074}, { 0.313681740398891476656478846}, - {-0.313681740398891476656478846}, { 0.949528180593036667195936074}, - { 0.449611329654606600046294579}, { 0.893224301195515320342416447}, - {-0.893224301195515320342416447}, { 0.449611329654606600046294579}, - { 0.757208846506484547575464054}, { 0.653172842953776764084203014}, - {-0.653172842953776764084203014}, { 0.757208846506484547575464054}, - { 0.073564563599667423529465622}, { 0.997290456678690216135597140}, - {-0.997290456678690216135597140}, { 0.073564563599667423529465622}, - { 0.997290456678690216135597140}, { 0.073564563599667423529465622}, - {-0.073564563599667423529465622}, { 0.997290456678690216135597140}, - { 0.653172842953776764084203014}, { 0.757208846506484547575464054}, - {-0.757208846506484547575464054}, { 0.653172842953776764084203014}, - { 0.893224301195515320342416447}, { 0.449611329654606600046294579}, - {-0.449611329654606600046294579}, { 0.893224301195515320342416447}, - { 0.313681740398891476656478846}, { 0.949528180593036667195936074}, - {-0.949528180593036667195936074}, { 0.313681740398891476656478846}, - { 0.963776065795439866686464356}, { 0.266712757474898386325286515}, - {-0.266712757474898386325286515}, { 0.963776065795439866686464356}, - { 0.492898192229784036873026689}, { 0.870086991108711418652292404}, - 
{-0.870086991108711418652292404}, { 0.492898192229784036873026689}, - { 0.788346427626606262009164705}, { 0.615231590580626845484913563}, - {-0.615231590580626845484913563}, { 0.788346427626606262009164705}, - { 0.122410675199216198498704474}, { 0.992479534598709998156767252}, - {-0.992479534598709998156767252}, { 0.122410675199216198498704474}, - { 0.985277642388941244774018433}, { 0.170961888760301226363642357}, - {-0.170961888760301226363642357}, { 0.985277642388941244774018433}, - { 0.575808191417845300745972454}, { 0.817584813151583696504920884}, - {-0.817584813151583696504920884}, { 0.575808191417845300745972454}, - { 0.844853565249707073259571205}, { 0.534997619887097210663076905}, - {-0.534997619887097210663076905}, { 0.844853565249707073259571205}, - { 0.219101240156869797227737547}, { 0.975702130038528544460395766}, - {-0.975702130038528544460395766}, { 0.219101240156869797227737547}, - { 0.932992798834738887711660256}, { 0.359895036534988148775104572}, - {-0.359895036534988148775104572}, { 0.932992798834738887711660256}, - { 0.405241314004989870908481306}, { 0.914209755703530654635014829}, - {-0.914209755703530654635014829}, { 0.405241314004989870908481306}, - { 0.724247082951466920941069243}, { 0.689540544737066924616730630}, - {-0.689540544737066924616730630}, { 0.724247082951466920941069243}, - { 0.024541228522912288031734529}, { 0.999698818696204220115765650}, - {-0.999698818696204220115765650}, { 0.024541228522912288031734529}, - { 0.999924701839144540921646491}, { 0.012271538285719926079408262}, - {-0.012271538285719926079408262}, { 0.999924701839144540921646491}, - { 0.698376249408972853554813503}, { 0.715730825283818654125532623}, - {-0.715730825283818654125532623}, { 0.698376249408972853554813503}, - { 0.919113851690057743908477789}, { 0.393992040061048108596188661}, - {-0.393992040061048108596188661}, { 0.919113851690057743908477789}, - { 0.371317193951837543411934967}, { 0.928506080473215565937167396}, - {-0.928506080473215565937167396}, { 
0.371317193951837543411934967}, - { 0.978317370719627633106240097}, { 0.207111376192218549708116020}, - {-0.207111376192218549708116020}, { 0.978317370719627633106240097}, - { 0.545324988422046422313987347}, { 0.838224705554838043186996856}, - {-0.838224705554838043186996856}, { 0.545324988422046422313987347}, - { 0.824589302785025264474803737}, { 0.565731810783613197389765011}, - {-0.565731810783613197389765011}, { 0.824589302785025264474803737}, - { 0.183039887955140958516532578}, { 0.983105487431216327180301155}, - {-0.983105487431216327180301155}, { 0.183039887955140958516532578}, - { 0.993906970002356041546922813}, { 0.110222207293883058807899140}, - {-0.110222207293883058807899140}, { 0.993906970002356041546922813}, - { 0.624859488142386377084072816}, { 0.780737228572094478301588484}, - {-0.780737228572094478301588484}, { 0.624859488142386377084072816}, - { 0.876070094195406607095844268}, { 0.482183772079122748517344481}, - {-0.482183772079122748517344481}, { 0.876070094195406607095844268}, - { 0.278519689385053105207848526}, { 0.960430519415565811199035138}, - {-0.960430519415565811199035138}, { 0.278519689385053105207848526}, - { 0.953306040354193836916740383}, { 0.302005949319228067003463232}, - {-0.302005949319228067003463232}, { 0.953306040354193836916740383}, - { 0.460538710958240023633181487}, { 0.887639620402853947760181617}, - {-0.887639620402853947760181617}, { 0.460538710958240023633181487}, - { 0.765167265622458925888815999}, { 0.643831542889791465068086063}, - {-0.643831542889791465068086063}, { 0.765167265622458925888815999}, - { 0.085797312344439890461556332}, { 0.996312612182778012627226190}, - {-0.996312612182778012627226190}, { 0.085797312344439890461556332}, - { 0.998118112900149207125155861}, { 0.061320736302208577782614593}, - {-0.061320736302208577782614593}, { 0.998118112900149207125155861}, - { 0.662415777590171761113069817}, { 0.749136394523459325469203257}, - {-0.749136394523459325469203257}, { 0.662415777590171761113069817}, - { 
0.898674465693953843041976744}, { 0.438616238538527637647025738}, - {-0.438616238538527637647025738}, { 0.898674465693953843041976744}, - { 0.325310292162262934135954708}, { 0.945607325380521325730945387}, - {-0.945607325380521325730945387}, { 0.325310292162262934135954708}, - { 0.966976471044852109087220226}, { 0.254865659604514571553980779}, - {-0.254865659604514571553980779}, { 0.966976471044852109087220226}, - { 0.503538383725717558691867071}, { 0.863972856121586737918147054}, - {-0.863972856121586737918147054}, { 0.503538383725717558691867071}, - { 0.795836904608883536262791915}, { 0.605511041404325513920626941}, - {-0.605511041404325513920626941}, { 0.795836904608883536262791915}, - { 0.134580708507126186316358409}, { 0.990902635427780025108237011}, - {-0.990902635427780025108237011}, { 0.134580708507126186316358409}, - { 0.987301418157858382399815802}, { 0.158858143333861441684385360}, - {-0.158858143333861441684385360}, { 0.987301418157858382399815802}, - { 0.585797857456438860328080838}, { 0.810457198252594791726703434}, - {-0.810457198252594791726703434}, { 0.585797857456438860328080838}, - { 0.851355193105265142261290312}, { 0.524589682678468906215098464}, - {-0.524589682678468906215098464}, { 0.851355193105265142261290312}, - { 0.231058108280671119643236018}, { 0.972939952205560145467720114}, - {-0.972939952205560145467720114}, { 0.231058108280671119643236018}, - { 0.937339011912574923201899593}, { 0.348418680249434568419308588}, - {-0.348418680249434568419308588}, { 0.937339011912574923201899593}, - { 0.416429560097637182562598911}, { 0.909167983090522376563884788}, - {-0.909167983090522376563884788}, { 0.416429560097637182562598911}, - { 0.732654271672412834615546649}, { 0.680600997795453050594430464}, - {-0.680600997795453050594430464}, { 0.732654271672412834615546649}, - { 0.036807222941358832324332691}, { 0.999322384588349500896221011}, - {-0.999322384588349500896221011}, { 0.036807222941358832324332691}, - { 0.999322384588349500896221011}, { 
0.036807222941358832324332691}, - {-0.036807222941358832324332691}, { 0.999322384588349500896221011}, - { 0.680600997795453050594430464}, { 0.732654271672412834615546649}, - {-0.732654271672412834615546649}, { 0.680600997795453050594430464}, - { 0.909167983090522376563884788}, { 0.416429560097637182562598911}, - {-0.416429560097637182562598911}, { 0.909167983090522376563884788}, - { 0.348418680249434568419308588}, { 0.937339011912574923201899593}, - {-0.937339011912574923201899593}, { 0.348418680249434568419308588}, - { 0.972939952205560145467720114}, { 0.231058108280671119643236018}, - {-0.231058108280671119643236018}, { 0.972939952205560145467720114}, - { 0.524589682678468906215098464}, { 0.851355193105265142261290312}, - {-0.851355193105265142261290312}, { 0.524589682678468906215098464}, - { 0.810457198252594791726703434}, { 0.585797857456438860328080838}, - {-0.585797857456438860328080838}, { 0.810457198252594791726703434}, - { 0.158858143333861441684385360}, { 0.987301418157858382399815802}, - {-0.987301418157858382399815802}, { 0.158858143333861441684385360}, - { 0.990902635427780025108237011}, { 0.134580708507126186316358409}, - {-0.134580708507126186316358409}, { 0.990902635427780025108237011}, - { 0.605511041404325513920626941}, { 0.795836904608883536262791915}, - {-0.795836904608883536262791915}, { 0.605511041404325513920626941}, - { 0.863972856121586737918147054}, { 0.503538383725717558691867071}, - {-0.503538383725717558691867071}, { 0.863972856121586737918147054}, - { 0.254865659604514571553980779}, { 0.966976471044852109087220226}, - {-0.966976471044852109087220226}, { 0.254865659604514571553980779}, - { 0.945607325380521325730945387}, { 0.325310292162262934135954708}, - {-0.325310292162262934135954708}, { 0.945607325380521325730945387}, - { 0.438616238538527637647025738}, { 0.898674465693953843041976744}, - {-0.898674465693953843041976744}, { 0.438616238538527637647025738}, - { 0.749136394523459325469203257}, { 0.662415777590171761113069817}, - 
{-0.662415777590171761113069817}, { 0.749136394523459325469203257}, - { 0.061320736302208577782614593}, { 0.998118112900149207125155861}, - {-0.998118112900149207125155861}, { 0.061320736302208577782614593}, - { 0.996312612182778012627226190}, { 0.085797312344439890461556332}, - {-0.085797312344439890461556332}, { 0.996312612182778012627226190}, - { 0.643831542889791465068086063}, { 0.765167265622458925888815999}, - {-0.765167265622458925888815999}, { 0.643831542889791465068086063}, - { 0.887639620402853947760181617}, { 0.460538710958240023633181487}, - {-0.460538710958240023633181487}, { 0.887639620402853947760181617}, - { 0.302005949319228067003463232}, { 0.953306040354193836916740383}, - {-0.953306040354193836916740383}, { 0.302005949319228067003463232}, - { 0.960430519415565811199035138}, { 0.278519689385053105207848526}, - {-0.278519689385053105207848526}, { 0.960430519415565811199035138}, - { 0.482183772079122748517344481}, { 0.876070094195406607095844268}, - {-0.876070094195406607095844268}, { 0.482183772079122748517344481}, - { 0.780737228572094478301588484}, { 0.624859488142386377084072816}, - {-0.624859488142386377084072816}, { 0.780737228572094478301588484}, - { 0.110222207293883058807899140}, { 0.993906970002356041546922813}, - {-0.993906970002356041546922813}, { 0.110222207293883058807899140}, - { 0.983105487431216327180301155}, { 0.183039887955140958516532578}, - {-0.183039887955140958516532578}, { 0.983105487431216327180301155}, - { 0.565731810783613197389765011}, { 0.824589302785025264474803737}, - {-0.824589302785025264474803737}, { 0.565731810783613197389765011}, - { 0.838224705554838043186996856}, { 0.545324988422046422313987347}, - {-0.545324988422046422313987347}, { 0.838224705554838043186996856}, - { 0.207111376192218549708116020}, { 0.978317370719627633106240097}, - {-0.978317370719627633106240097}, { 0.207111376192218549708116020}, - { 0.928506080473215565937167396}, { 0.371317193951837543411934967}, - {-0.371317193951837543411934967}, { 
0.928506080473215565937167396}, - { 0.393992040061048108596188661}, { 0.919113851690057743908477789}, - {-0.919113851690057743908477789}, { 0.393992040061048108596188661}, - { 0.715730825283818654125532623}, { 0.698376249408972853554813503}, - {-0.698376249408972853554813503}, { 0.715730825283818654125532623}, - { 0.012271538285719926079408262}, { 0.999924701839144540921646491}, - {-0.999924701839144540921646491}, { 0.012271538285719926079408262}, - { 0.999981175282601142656990438}, { 0.006135884649154475359640235}, - {-0.006135884649154475359640235}, { 0.999981175282601142656990438}, - { 0.702754744457225302452914421}, { 0.711432195745216441522130290}, - {-0.711432195745216441522130290}, { 0.702754744457225302452914421}, - { 0.921514039342041943465396332}, { 0.388345046698826291624993541}, - {-0.388345046698826291624993541}, { 0.921514039342041943465396332}, - { 0.377007410216418256726567823}, { 0.926210242138311341974793388}, - {-0.926210242138311341974793388}, { 0.377007410216418256726567823}, - { 0.979569765685440534439326110}, { 0.201104634842091911558443546}, - {-0.201104634842091911558443546}, { 0.979569765685440534439326110}, - { 0.550457972936604802977289893}, { 0.834862874986380056304401383}, - {-0.834862874986380056304401383}, { 0.550457972936604802977289893}, - { 0.828045045257755752067527592}, { 0.560661576197336023839710223}, - {-0.560661576197336023839710223}, { 0.828045045257755752067527592}, - { 0.189068664149806212754997837}, { 0.981963869109555264072848154}, - {-0.981963869109555264072848154}, { 0.189068664149806212754997837}, - { 0.994564570734255452119106243}, { 0.104121633872054579120943880}, - {-0.104121633872054579120943880}, { 0.994564570734255452119106243}, - { 0.629638238914927025372981341}, { 0.776888465673232450040827983}, - {-0.776888465673232450040827983}, { 0.629638238914927025372981341}, - { 0.879012226428633477831323711}, { 0.476799230063322133342158117}, - {-0.476799230063322133342158117}, { 0.879012226428633477831323711}, - { 
0.284407537211271843618310615}, { 0.958703474895871555374645792}, - {-0.958703474895871555374645792}, { 0.284407537211271843618310615}, - { 0.955141168305770721498157712}, { 0.296150888243623824121786128}, - {-0.296150888243623824121786128}, { 0.955141168305770721498157712}, - { 0.465976495767966177902756065}, { 0.884797098430937780104007041}, - {-0.884797098430937780104007041}, { 0.465976495767966177902756065}, - { 0.769103337645579639346626069}, { 0.639124444863775743801488193}, - {-0.639124444863775743801488193}, { 0.769103337645579639346626069}, - { 0.091908956497132728624990979}, { 0.995767414467659793982495643}, - {-0.995767414467659793982495643}, { 0.091908956497132728624990979}, - { 0.998475580573294752208559038}, { 0.055195244349689939809447526}, - {-0.055195244349689939809447526}, { 0.998475580573294752208559038}, - { 0.666999922303637506650154222}, { 0.745057785441465962407907310}, - {-0.745057785441465962407907310}, { 0.666999922303637506650154222}, - { 0.901348847046022014570746093}, { 0.433093818853151968484222638}, - {-0.433093818853151968484222638}, { 0.901348847046022014570746093}, - { 0.331106305759876401737190737}, { 0.943593458161960361495301445}, - {-0.943593458161960361495301445}, { 0.331106305759876401737190737}, - { 0.968522094274417316221088329}, { 0.248927605745720168110682816}, - {-0.248927605745720168110682816}, { 0.968522094274417316221088329}, - { 0.508830142543107036931749324}, { 0.860866938637767279344583877}, - {-0.860866938637767279344583877}, { 0.508830142543107036931749324}, - { 0.799537269107905033500246232}, { 0.600616479383868926653875896}, - {-0.600616479383868926653875896}, { 0.799537269107905033500246232}, - { 0.140658239332849230714788846}, { 0.990058210262297105505906464}, - {-0.990058210262297105505906464}, { 0.140658239332849230714788846}, - { 0.988257567730749491404792538}, { 0.152797185258443427720336613}, - {-0.152797185258443427720336613}, { 0.988257567730749491404792538}, - { 0.590759701858874228423887908}, { 
0.806847553543799272206514313}, - {-0.806847553543799272206514313}, { 0.590759701858874228423887908}, - { 0.854557988365400520767862276}, { 0.519355990165589587361829932}, - {-0.519355990165589587361829932}, { 0.854557988365400520767862276}, - { 0.237023605994367206867735915}, { 0.971503890986251775537099622}, - {-0.971503890986251775537099622}, { 0.237023605994367206867735915}, - { 0.939459223602189911962669246}, { 0.342660717311994397592781983}, - {-0.342660717311994397592781983}, { 0.939459223602189911962669246}, - { 0.422000270799799685941287941}, { 0.906595704514915365332960588}, - {-0.906595704514915365332960588}, { 0.422000270799799685941287941}, - { 0.736816568877369875090132520}, { 0.676092703575315960360419228}, - {-0.676092703575315960360419228}, { 0.736816568877369875090132520}, - { 0.042938256934940823077124540}, { 0.999077727752645382888781997}, - {-0.999077727752645382888781997}, { 0.042938256934940823077124540}, - { 0.999529417501093163079703322}, { 0.030674803176636625934021028}, - {-0.030674803176636625934021028}, { 0.999529417501093163079703322}, - { 0.685083667772700381362052545}, { 0.728464390448225196492035438}, - {-0.728464390448225196492035438}, { 0.685083667772700381362052545}, - { 0.911706032005429851404397325}, { 0.410843171057903942183466675}, - {-0.410843171057903942183466675}, { 0.911706032005429851404397325}, - { 0.354163525420490382357395796}, { 0.935183509938947577642207480}, - {-0.935183509938947577642207480}, { 0.354163525420490382357395796}, - { 0.974339382785575860518721668}, { 0.225083911359792835991642120}, - {-0.225083911359792835991642120}, { 0.974339382785575860518721668}, - { 0.529803624686294668216054671}, { 0.848120344803297251279133563}, - {-0.848120344803297251279133563}, { 0.529803624686294668216054671}, - { 0.814036329705948361654516690}, { 0.580813958095764545075595272}, - {-0.580813958095764545075595272}, { 0.814036329705948361654516690}, - { 0.164913120489969921418189113}, { 0.986308097244598647863297524}, - 
{-0.986308097244598647863297524}, { 0.164913120489969921418189113}, - { 0.991709753669099522860049931}, { 0.128498110793793172624415589}, - {-0.128498110793793172624415589}, { 0.991709753669099522860049931}, - { 0.610382806276309452716352152}, { 0.792106577300212351782342879}, - {-0.792106577300212351782342879}, { 0.610382806276309452716352152}, - { 0.867046245515692651480195629}, { 0.498227666972781852410983869}, - {-0.498227666972781852410983869}, { 0.867046245515692651480195629}, - { 0.260794117915275518280186509}, { 0.965394441697689374550843858}, - {-0.965394441697689374550843858}, { 0.260794117915275518280186509}, - { 0.947585591017741134653387321}, { 0.319502030816015677901518272}, - {-0.319502030816015677901518272}, { 0.947585591017741134653387321}, - { 0.444122144570429231642069418}, { 0.895966249756185155914560282}, - {-0.895966249756185155914560282}, { 0.444122144570429231642069418}, - { 0.753186799043612482483430486}, { 0.657806693297078656931182264}, - {-0.657806693297078656931182264}, { 0.753186799043612482483430486}, - { 0.067443919563664057897972422}, { 0.997723066644191609848546728}, - {-0.997723066644191609848546728}, { 0.067443919563664057897972422}, - { 0.996820299291165714972629398}, { 0.079682437971430121147120656}, - {-0.079682437971430121147120656}, { 0.996820299291165714972629398}, - { 0.648514401022112445084560551}, { 0.761202385484261814029709836}, - {-0.761202385484261814029709836}, { 0.648514401022112445084560551}, - { 0.890448723244757889952150560}, { 0.455083587126343823535869268}, - {-0.455083587126343823535869268}, { 0.890448723244757889952150560}, - { 0.307849640041534893682063646}, { 0.951435020969008369549175569}, - {-0.951435020969008369549175569}, { 0.307849640041534893682063646}, - { 0.962121404269041595429604316}, { 0.272621355449948984493347477}, - {-0.272621355449948984493347477}, { 0.962121404269041595429604316}, - { 0.487550160148435954641485027}, { 0.873094978418290098636085973}, - {-0.873094978418290098636085973}, { 
0.487550160148435954641485027}, - { 0.784556597155575233023892575}, { 0.620057211763289178646268191}, - {-0.620057211763289178646268191}, { 0.784556597155575233023892575}, - { 0.116318630911904767252544319}, { 0.993211949234794533104601012}, - {-0.993211949234794533104601012}, { 0.116318630911904767252544319}, - { 0.984210092386929073193874387}, { 0.177004220412148756196839844}, - {-0.177004220412148756196839844}, { 0.984210092386929073193874387}, - { 0.570780745886967280232652864}, { 0.821102514991104679060430820}, - {-0.821102514991104679060430820}, { 0.570780745886967280232652864}, - { 0.841554977436898409603499520}, { 0.540171472729892881297845480}, - {-0.540171472729892881297845480}, { 0.841554977436898409603499520}, - { 0.213110319916091373967757518}, { 0.977028142657754351485866211}, - {-0.977028142657754351485866211}, { 0.213110319916091373967757518}, - { 0.930766961078983731944872340}, { 0.365612997804773870011745909}, - {-0.365612997804773870011745909}, { 0.930766961078983731944872340}, - { 0.399624199845646828544117031}, { 0.916679059921042663116457013}, - {-0.916679059921042663116457013}, { 0.399624199845646828544117031}, - { 0.720002507961381629076682999}, { 0.693971460889654009003734389}, - {-0.693971460889654009003734389}, { 0.720002507961381629076682999}, - { 0.018406729905804820927366313}, { 0.999830581795823422015722275}, - {-0.999830581795823422015722275}, { 0.018406729905804820927366313}, - { 0.999830581795823422015722275}, { 0.018406729905804820927366313}, - {-0.018406729905804820927366313}, { 0.999830581795823422015722275}, - { 0.693971460889654009003734389}, { 0.720002507961381629076682999}, - {-0.720002507961381629076682999}, { 0.693971460889654009003734389}, - { 0.916679059921042663116457013}, { 0.399624199845646828544117031}, - {-0.399624199845646828544117031}, { 0.916679059921042663116457013}, - { 0.365612997804773870011745909}, { 0.930766961078983731944872340}, - {-0.930766961078983731944872340}, { 0.365612997804773870011745909}, - { 
0.977028142657754351485866211}, { 0.213110319916091373967757518}, - {-0.213110319916091373967757518}, { 0.977028142657754351485866211}, - { 0.540171472729892881297845480}, { 0.841554977436898409603499520}, - {-0.841554977436898409603499520}, { 0.540171472729892881297845480}, - { 0.821102514991104679060430820}, { 0.570780745886967280232652864}, - {-0.570780745886967280232652864}, { 0.821102514991104679060430820}, - { 0.177004220412148756196839844}, { 0.984210092386929073193874387}, - {-0.984210092386929073193874387}, { 0.177004220412148756196839844}, - { 0.993211949234794533104601012}, { 0.116318630911904767252544319}, - {-0.116318630911904767252544319}, { 0.993211949234794533104601012}, - { 0.620057211763289178646268191}, { 0.784556597155575233023892575}, - {-0.784556597155575233023892575}, { 0.620057211763289178646268191}, - { 0.873094978418290098636085973}, { 0.487550160148435954641485027}, - {-0.487550160148435954641485027}, { 0.873094978418290098636085973}, - { 0.272621355449948984493347477}, { 0.962121404269041595429604316}, - {-0.962121404269041595429604316}, { 0.272621355449948984493347477}, - { 0.951435020969008369549175569}, { 0.307849640041534893682063646}, - {-0.307849640041534893682063646}, { 0.951435020969008369549175569}, - { 0.455083587126343823535869268}, { 0.890448723244757889952150560}, - {-0.890448723244757889952150560}, { 0.455083587126343823535869268}, - { 0.761202385484261814029709836}, { 0.648514401022112445084560551}, - {-0.648514401022112445084560551}, { 0.761202385484261814029709836}, - { 0.079682437971430121147120656}, { 0.996820299291165714972629398}, - {-0.996820299291165714972629398}, { 0.079682437971430121147120656}, - { 0.997723066644191609848546728}, { 0.067443919563664057897972422}, - {-0.067443919563664057897972422}, { 0.997723066644191609848546728}, - { 0.657806693297078656931182264}, { 0.753186799043612482483430486}, - {-0.753186799043612482483430486}, { 0.657806693297078656931182264}, - { 0.895966249756185155914560282}, { 
0.444122144570429231642069418}, - {-0.444122144570429231642069418}, { 0.895966249756185155914560282}, - { 0.319502030816015677901518272}, { 0.947585591017741134653387321}, - {-0.947585591017741134653387321}, { 0.319502030816015677901518272}, - { 0.965394441697689374550843858}, { 0.260794117915275518280186509}, - {-0.260794117915275518280186509}, { 0.965394441697689374550843858}, - { 0.498227666972781852410983869}, { 0.867046245515692651480195629}, - {-0.867046245515692651480195629}, { 0.498227666972781852410983869}, - { 0.792106577300212351782342879}, { 0.610382806276309452716352152}, - {-0.610382806276309452716352152}, { 0.792106577300212351782342879}, - { 0.128498110793793172624415589}, { 0.991709753669099522860049931}, - {-0.991709753669099522860049931}, { 0.128498110793793172624415589}, - { 0.986308097244598647863297524}, { 0.164913120489969921418189113}, - {-0.164913120489969921418189113}, { 0.986308097244598647863297524}, - { 0.580813958095764545075595272}, { 0.814036329705948361654516690}, - {-0.814036329705948361654516690}, { 0.580813958095764545075595272}, - { 0.848120344803297251279133563}, { 0.529803624686294668216054671}, - {-0.529803624686294668216054671}, { 0.848120344803297251279133563}, - { 0.225083911359792835991642120}, { 0.974339382785575860518721668}, - {-0.974339382785575860518721668}, { 0.225083911359792835991642120}, - { 0.935183509938947577642207480}, { 0.354163525420490382357395796}, - {-0.354163525420490382357395796}, { 0.935183509938947577642207480}, - { 0.410843171057903942183466675}, { 0.911706032005429851404397325}, - {-0.911706032005429851404397325}, { 0.410843171057903942183466675}, - { 0.728464390448225196492035438}, { 0.685083667772700381362052545}, - {-0.685083667772700381362052545}, { 0.728464390448225196492035438}, - { 0.030674803176636625934021028}, { 0.999529417501093163079703322}, - {-0.999529417501093163079703322}, { 0.030674803176636625934021028}, - { 0.999077727752645382888781997}, { 0.042938256934940823077124540}, - 
{-0.042938256934940823077124540}, { 0.999077727752645382888781997}, - { 0.676092703575315960360419228}, { 0.736816568877369875090132520}, - {-0.736816568877369875090132520}, { 0.676092703575315960360419228}, - { 0.906595704514915365332960588}, { 0.422000270799799685941287941}, - {-0.422000270799799685941287941}, { 0.906595704514915365332960588}, - { 0.342660717311994397592781983}, { 0.939459223602189911962669246}, - {-0.939459223602189911962669246}, { 0.342660717311994397592781983}, - { 0.971503890986251775537099622}, { 0.237023605994367206867735915}, - {-0.237023605994367206867735915}, { 0.971503890986251775537099622}, - { 0.519355990165589587361829932}, { 0.854557988365400520767862276}, - {-0.854557988365400520767862276}, { 0.519355990165589587361829932}, - { 0.806847553543799272206514313}, { 0.590759701858874228423887908}, - {-0.590759701858874228423887908}, { 0.806847553543799272206514313}, - { 0.152797185258443427720336613}, { 0.988257567730749491404792538}, - {-0.988257567730749491404792538}, { 0.152797185258443427720336613}, - { 0.990058210262297105505906464}, { 0.140658239332849230714788846}, - {-0.140658239332849230714788846}, { 0.990058210262297105505906464}, - { 0.600616479383868926653875896}, { 0.799537269107905033500246232}, - {-0.799537269107905033500246232}, { 0.600616479383868926653875896}, - { 0.860866938637767279344583877}, { 0.508830142543107036931749324}, - {-0.508830142543107036931749324}, { 0.860866938637767279344583877}, - { 0.248927605745720168110682816}, { 0.968522094274417316221088329}, - {-0.968522094274417316221088329}, { 0.248927605745720168110682816}, - { 0.943593458161960361495301445}, { 0.331106305759876401737190737}, - {-0.331106305759876401737190737}, { 0.943593458161960361495301445}, - { 0.433093818853151968484222638}, { 0.901348847046022014570746093}, - {-0.901348847046022014570746093}, { 0.433093818853151968484222638}, - { 0.745057785441465962407907310}, { 0.666999922303637506650154222}, - {-0.666999922303637506650154222}, { 
0.745057785441465962407907310}, - { 0.055195244349689939809447526}, { 0.998475580573294752208559038}, - {-0.998475580573294752208559038}, { 0.055195244349689939809447526}, - { 0.995767414467659793982495643}, { 0.091908956497132728624990979}, - {-0.091908956497132728624990979}, { 0.995767414467659793982495643}, - { 0.639124444863775743801488193}, { 0.769103337645579639346626069}, - {-0.769103337645579639346626069}, { 0.639124444863775743801488193}, - { 0.884797098430937780104007041}, { 0.465976495767966177902756065}, - {-0.465976495767966177902756065}, { 0.884797098430937780104007041}, - { 0.296150888243623824121786128}, { 0.955141168305770721498157712}, - {-0.955141168305770721498157712}, { 0.296150888243623824121786128}, - { 0.958703474895871555374645792}, { 0.284407537211271843618310615}, - {-0.284407537211271843618310615}, { 0.958703474895871555374645792}, - { 0.476799230063322133342158117}, { 0.879012226428633477831323711}, - {-0.879012226428633477831323711}, { 0.476799230063322133342158117}, - { 0.776888465673232450040827983}, { 0.629638238914927025372981341}, - {-0.629638238914927025372981341}, { 0.776888465673232450040827983}, - { 0.104121633872054579120943880}, { 0.994564570734255452119106243}, - {-0.994564570734255452119106243}, { 0.104121633872054579120943880}, - { 0.981963869109555264072848154}, { 0.189068664149806212754997837}, - {-0.189068664149806212754997837}, { 0.981963869109555264072848154}, - { 0.560661576197336023839710223}, { 0.828045045257755752067527592}, - {-0.828045045257755752067527592}, { 0.560661576197336023839710223}, - { 0.834862874986380056304401383}, { 0.550457972936604802977289893}, - {-0.550457972936604802977289893}, { 0.834862874986380056304401383}, - { 0.201104634842091911558443546}, { 0.979569765685440534439326110}, - {-0.979569765685440534439326110}, { 0.201104634842091911558443546}, - { 0.926210242138311341974793388}, { 0.377007410216418256726567823}, - {-0.377007410216418256726567823}, { 0.926210242138311341974793388}, - { 
0.388345046698826291624993541}, { 0.921514039342041943465396332}, - {-0.921514039342041943465396332}, { 0.388345046698826291624993541}, - { 0.711432195745216441522130290}, { 0.702754744457225302452914421}, - {-0.702754744457225302452914421}, { 0.711432195745216441522130290}, - { 0.006135884649154475359640235}, { 0.999981175282601142656990438}, - {-0.999981175282601142656990438}, { 0.006135884649154475359640235}, - { 0.999995293809576171511580126}, { 0.003067956762965976270145365}, - {-0.003067956762965976270145365}, { 0.999995293809576171511580126}, - { 0.704934080375904908852523758}, { 0.709272826438865651316533772}, - {-0.709272826438865651316533772}, { 0.704934080375904908852523758}, - { 0.922701128333878570437264227}, { 0.385516053843918864075607949}, - {-0.385516053843918864075607949}, { 0.922701128333878570437264227}, - { 0.379847208924051170576281147}, { 0.925049240782677590302371869}, - {-0.925049240782677590302371869}, { 0.379847208924051170576281147}, - { 0.980182135968117392690210009}, { 0.198098410717953586179324918}, - {-0.198098410717953586179324918}, { 0.980182135968117392690210009}, - { 0.553016705580027531764226988}, { 0.833170164701913186439915922}, - {-0.833170164701913186439915922}, { 0.553016705580027531764226988}, - { 0.829761233794523042469023765}, { 0.558118531220556115693702964}, - {-0.558118531220556115693702964}, { 0.829761233794523042469023765}, - { 0.192080397049892441679288205}, { 0.981379193313754574318224190}, - {-0.981379193313754574318224190}, { 0.192080397049892441679288205}, - { 0.994879330794805620591166107}, { 0.101069862754827824987887585}, - {-0.101069862754827824987887585}, { 0.994879330794805620591166107}, - { 0.632018735939809021909403706}, { 0.774953106594873878359129282}, - {-0.774953106594873878359129282}, { 0.632018735939809021909403706}, - { 0.880470889052160770806542929}, { 0.474100214650550014398580015}, - {-0.474100214650550014398580015}, { 0.880470889052160770806542929}, - { 0.287347459544729526477331841}, { 
0.957826413027532890321037029}, - {-0.957826413027532890321037029}, { 0.287347459544729526477331841}, - { 0.956045251349996443270479823}, { 0.293219162694258650606608599}, - {-0.293219162694258650606608599}, { 0.956045251349996443270479823}, - { 0.468688822035827933697617870}, { 0.883363338665731594736308015}, - {-0.883363338665731594736308015}, { 0.468688822035827933697617870}, - { 0.771060524261813773200605759}, { 0.636761861236284230413943435}, - {-0.636761861236284230413943435}, { 0.771060524261813773200605759}, - { 0.094963495329638998938034312}, { 0.995480755491926941769171600}, - {-0.995480755491926941769171600}, { 0.094963495329638998938034312}, - { 0.998640218180265222418199049}, { 0.052131704680283321236358216}, - {-0.052131704680283321236358216}, { 0.998640218180265222418199049}, - { 0.669282588346636065720696366}, { 0.743007952135121693517362293}, - {-0.743007952135121693517362293}, { 0.669282588346636065720696366}, - { 0.902673318237258806751502391}, { 0.430326481340082633908199031}, - {-0.430326481340082633908199031}, { 0.902673318237258806751502391}, - { 0.333999651442009404650865481}, { 0.942573197601446879280758735}, - {-0.942573197601446879280758735}, { 0.333999651442009404650865481}, - { 0.969281235356548486048290738}, { 0.245955050335794611599924709}, - {-0.245955050335794611599924709}, { 0.969281235356548486048290738}, - { 0.511468850437970399504391001}, { 0.859301818357008404783582139}, - {-0.859301818357008404783582139}, { 0.511468850437970399504391001}, - { 0.801376171723140219430247777}, { 0.598160706996342311724958652}, - {-0.598160706996342311724958652}, { 0.801376171723140219430247777}, - { 0.143695033150294454819773349}, { 0.989622017463200834623694454}, - {-0.989622017463200834623694454}, { 0.143695033150294454819773349}, - { 0.988721691960323767604516485}, { 0.149764534677321517229695737}, - {-0.149764534677321517229695737}, { 0.988721691960323767604516485}, - { 0.593232295039799808047809426}, { 0.805031331142963597922659282}, - 
{-0.805031331142963597922659282}, { 0.593232295039799808047809426}, - { 0.856147328375194481019630732}, { 0.516731799017649881508753876}, - {-0.516731799017649881508753876}, { 0.856147328375194481019630732}, - { 0.240003022448741486568922365}, { 0.970772140728950302138169611}, - {-0.970772140728950302138169611}, { 0.240003022448741486568922365}, - { 0.940506070593268323787291309}, { 0.339776884406826857828825803}, - {-0.339776884406826857828825803}, { 0.940506070593268323787291309}, - { 0.424779681209108833357226189}, { 0.905296759318118774354048329}, - {-0.905296759318118774354048329}, { 0.424779681209108833357226189}, - { 0.738887324460615147933116508}, { 0.673829000378756060917568372}, - {-0.673829000378756060917568372}, { 0.738887324460615147933116508}, - { 0.046003182130914628814301788}, { 0.998941293186856850633930266}, - {-0.998941293186856850633930266}, { 0.046003182130914628814301788}, - { 0.999618822495178597116830637}, { 0.027608145778965741612354872}, - {-0.027608145778965741612354872}, { 0.999618822495178597116830637}, - { 0.687315340891759108199186948}, { 0.726359155084345976817494315}, - {-0.726359155084345976817494315}, { 0.687315340891759108199186948}, - { 0.912962190428398164628018233}, { 0.408044162864978680820747499}, - {-0.408044162864978680820747499}, { 0.912962190428398164628018233}, - { 0.357030961233430032614954036}, { 0.934092550404258914729877883}, - {-0.934092550404258914729877883}, { 0.357030961233430032614954036}, - { 0.975025345066994146844913468}, { 0.222093620973203534094094721}, - {-0.222093620973203534094094721}, { 0.975025345066994146844913468}, - { 0.532403127877197971442805218}, { 0.846490938774052078300544488}, - {-0.846490938774052078300544488}, { 0.532403127877197971442805218}, - { 0.815814410806733789010772660}, { 0.578313796411655563342245019}, - {-0.578313796411655563342245019}, { 0.815814410806733789010772660}, - { 0.167938294974731178054745536}, { 0.985797509167567424700995000}, - {-0.985797509167567424700995000}, { 
0.167938294974731178054745536}, - { 0.992099313142191757112085445}, { 0.125454983411546238542336453}, - {-0.125454983411546238542336453}, { 0.992099313142191757112085445}, - { 0.612810082429409703935211936}, { 0.790230221437310055030217152}, - {-0.790230221437310055030217152}, { 0.612810082429409703935211936}, - { 0.868570705971340895340449876}, { 0.495565261825772531150266670}, - {-0.495565261825772531150266670}, { 0.868570705971340895340449876}, - { 0.263754678974831383611349322}, { 0.964589793289812723836432159}, - {-0.964589793289812723836432159}, { 0.263754678974831383611349322}, - { 0.948561349915730288158494826}, { 0.316593375556165867243047035}, - {-0.316593375556165867243047035}, { 0.948561349915730288158494826}, - { 0.446868840162374195353044389}, { 0.894599485631382678433072126}, - {-0.894599485631382678433072126}, { 0.446868840162374195353044389}, - { 0.755201376896536527598710756}, { 0.655492852999615385312679701}, - {-0.655492852999615385312679701}, { 0.755201376896536527598710756}, - { 0.070504573389613863027351471}, { 0.997511456140303459699448390}, - {-0.997511456140303459699448390}, { 0.070504573389613863027351471}, - { 0.997060070339482978987989949}, { 0.076623861392031492278332463}, - {-0.076623861392031492278332463}, { 0.997060070339482978987989949}, - { 0.650846684996380915068975573}, { 0.759209188978388033485525443}, - {-0.759209188978388033485525443}, { 0.650846684996380915068975573}, - { 0.891840709392342727796478697}, { 0.452349587233770874133026703}, - {-0.452349587233770874133026703}, { 0.891840709392342727796478697}, - { 0.310767152749611495835997250}, { 0.950486073949481721759926101}, - {-0.950486073949481721759926101}, { 0.310767152749611495835997250}, - { 0.962953266873683886347921481}, { 0.269668325572915106525464462}, - {-0.269668325572915106525464462}, { 0.962953266873683886347921481}, - { 0.490226483288291154229598449}, { 0.871595086655951034842481435}, - {-0.871595086655951034842481435}, { 0.490226483288291154229598449}, - { 
0.786455213599085757522319464}, { 0.617647307937803932403979402}, - {-0.617647307937803932403979402}, { 0.786455213599085757522319464}, - { 0.119365214810991364593637790}, { 0.992850414459865090793563344}, - {-0.992850414459865090793563344}, { 0.119365214810991364593637790}, - { 0.984748501801904218556553176}, { 0.173983873387463827950700807}, - {-0.173983873387463827950700807}, { 0.984748501801904218556553176}, - { 0.573297166698042212820171239}, { 0.819347520076796960824689637}, - {-0.819347520076796960824689637}, { 0.573297166698042212820171239}, - { 0.843208239641845437161743865}, { 0.537587076295645482502214932}, - {-0.537587076295645482502214932}, { 0.843208239641845437161743865}, - { 0.216106797076219509948385131}, { 0.976369731330021149312732194}, - {-0.976369731330021149312732194}, { 0.216106797076219509948385131}, - { 0.931884265581668106718557199}, { 0.362755724367397216204854462}, - {-0.362755724367397216204854462}, { 0.931884265581668106718557199}, - { 0.402434650859418441082533934}, { 0.915448716088267819566431292}, - {-0.915448716088267819566431292}, { 0.402434650859418441082533934}, - { 0.722128193929215321243607198}, { 0.691759258364157774906734132}, - {-0.691759258364157774906734132}, { 0.722128193929215321243607198}, - { 0.021474080275469507418374898}, { 0.999769405351215321657617036}, - {-0.999769405351215321657617036}, { 0.021474080275469507418374898}, - { 0.999882347454212525633049627}, { 0.015339206284988101044151868}, - {-0.015339206284988101044151868}, { 0.999882347454212525633049627}, - { 0.696177131491462944788582591}, { 0.717870045055731736211325329}, - {-0.717870045055731736211325329}, { 0.696177131491462944788582591}, - { 0.917900775621390457642276297}, { 0.396809987416710328595290911}, - {-0.396809987416710328595290911}, { 0.917900775621390457642276297}, - { 0.368466829953372331712746222}, { 0.929640895843181265457918066}, - {-0.929640895843181265457918066}, { 0.368466829953372331712746222}, - { 0.977677357824509979943404762}, { 
0.210111836880469621717489972}, - {-0.210111836880469621717489972}, { 0.977677357824509979943404762}, - { 0.542750784864515906586768661}, { 0.839893794195999504583383987}, - {-0.839893794195999504583383987}, { 0.542750784864515906586768661}, - { 0.822849781375826332046780034}, { 0.568258952670131549790548489}, - {-0.568258952670131549790548489}, { 0.822849781375826332046780034}, - { 0.180022901405699522679906590}, { 0.983662419211730274396237776}, - {-0.983662419211730274396237776}, { 0.180022901405699522679906590}, - { 0.993564135520595333782021697}, { 0.113270952177564349018228733}, - {-0.113270952177564349018228733}, { 0.993564135520595333782021697}, - { 0.622461279374149972519166721}, { 0.782650596166575738458949301}, - {-0.782650596166575738458949301}, { 0.622461279374149972519166721}, - { 0.874586652278176112634431897}, { 0.484869248000791101822951699}, - {-0.484869248000791101822951699}, { 0.874586652278176112634431897}, - { 0.275571819310958163076425168}, { 0.961280485811320641748659653}, - {-0.961280485811320641748659653}, { 0.275571819310958163076425168}, - { 0.952375012719765858529893608}, { 0.304929229735402406490728633}, - {-0.304929229735402406490728633}, { 0.952375012719765858529893608}, - { 0.457813303598877221904961155}, { 0.889048355854664562540777729}, - {-0.889048355854664562540777729}, { 0.457813303598877221904961155}, - { 0.763188417263381271704838297}, { 0.646176012983316364832802220}, - {-0.646176012983316364832802220}, { 0.763188417263381271704838297}, - { 0.082740264549375693111987083}, { 0.996571145790554847093566910}, - {-0.996571145790554847093566910}, { 0.082740264549375693111987083}, - { 0.997925286198596012623025462}, { 0.064382630929857460819324537}, - {-0.064382630929857460819324537}, { 0.997925286198596012623025462}, - { 0.660114342067420478559490747}, { 0.751165131909686411205819422}, - {-0.751165131909686411205819422}, { 0.660114342067420478559490747}, - { 0.897324580705418281231391836}, { 0.441371268731716692879988968}, - 
{-0.441371268731716692879988968}, { 0.897324580705418281231391836}, - { 0.322407678801069848384807478}, { 0.946600913083283570044599823}, - {-0.946600913083283570044599823}, { 0.322407678801069848384807478}, - { 0.966190003445412555433832961}, { 0.257831102162159005614471295}, - {-0.257831102162159005614471295}, { 0.966190003445412555433832961}, - { 0.500885382611240786241285004}, { 0.865513624090569082825488358}, - {-0.865513624090569082825488358}, { 0.500885382611240786241285004}, - { 0.793975477554337164895083757}, { 0.607949784967773667243642671}, - {-0.607949784967773667243642671}, { 0.793975477554337164895083757}, - { 0.131540028702883111103387493}, { 0.991310859846115418957349799}, - {-0.991310859846115418957349799}, { 0.131540028702883111103387493}, - { 0.986809401814185476970235952}, { 0.161886393780111837641387995}, - {-0.161886393780111837641387995}, { 0.986809401814185476970235952}, - { 0.583308652937698294392830961}, { 0.812250586585203913049744181}, - {-0.812250586585203913049744181}, { 0.583308652937698294392830961}, - { 0.849741768000852489471268395}, { 0.527199134781901348464274575}, - {-0.527199134781901348464274575}, { 0.849741768000852489471268395}, - { 0.228072083170885739254457379}, { 0.973644249650811925318383912}, - {-0.973644249650811925318383912}, { 0.228072083170885739254457379}, - { 0.936265667170278246576310996}, { 0.351292756085567125601307623}, - {-0.351292756085567125601307623}, { 0.936265667170278246576310996}, - { 0.413638312238434547471944324}, { 0.910441292258067196934095369}, - {-0.910441292258067196934095369}, { 0.413638312238434547471944324}, - { 0.730562769227827561177758850}, { 0.682845546385248068164596123}, - {-0.682845546385248068164596123}, { 0.730562769227827561177758850}, - { 0.033741171851377584833716112}, { 0.999430604555461772019008327}, - {-0.999430604555461772019008327}, { 0.033741171851377584833716112}, - { 0.999204758618363895492950001}, { 0.039872927587739811128578738}, - {-0.039872927587739811128578738}, { 
0.999204758618363895492950001}, - { 0.678350043129861486873655042}, { 0.734738878095963464563223604}, - {-0.734738878095963464563223604}, { 0.678350043129861486873655042}, - { 0.907886116487666212038681480}, { 0.419216888363223956433010020}, - {-0.419216888363223956433010020}, { 0.907886116487666212038681480}, - { 0.345541324963989065539191723}, { 0.938403534063108112192420774}, - {-0.938403534063108112192420774}, { 0.345541324963989065539191723}, - { 0.972226497078936305708321144}, { 0.234041958583543423191242045}, - {-0.234041958583543423191242045}, { 0.972226497078936305708321144}, - { 0.521975292937154342694258318}, { 0.852960604930363657746588082}, - {-0.852960604930363657746588082}, { 0.521975292937154342694258318}, - { 0.808656181588174991946968128}, { 0.588281548222645304786439813}, - {-0.588281548222645304786439813}, { 0.808656181588174991946968128}, - { 0.155828397654265235743101486}, { 0.987784141644572154230969032}, - {-0.987784141644572154230969032}, { 0.155828397654265235743101486}, - { 0.990485084256457037998682243}, { 0.137620121586486044948441663}, - {-0.137620121586486044948441663}, { 0.990485084256457037998682243}, - { 0.603066598540348201693430617}, { 0.797690840943391108362662755}, - {-0.797690840943391108362662755}, { 0.603066598540348201693430617}, - { 0.862423956111040538690933878}, { 0.506186645345155291048942344}, - {-0.506186645345155291048942344}, { 0.862423956111040538690933878}, - { 0.251897818154216950498106628}, { 0.967753837093475465243391912}, - {-0.967753837093475465243391912}, { 0.251897818154216950498106628}, - { 0.944604837261480265659265493}, { 0.328209843579092526107916817}, - {-0.328209843579092526107916817}, { 0.944604837261480265659265493}, - { 0.435857079922255491032544080}, { 0.900015892016160228714535267}, - {-0.900015892016160228714535267}, { 0.435857079922255491032544080}, - { 0.747100605980180144323078847}, { 0.664710978203344868130324985}, - {-0.664710978203344868130324985}, { 0.747100605980180144323078847}, - { 
0.058258264500435759613979782}, { 0.998301544933892840738782163}, - {-0.998301544933892840738782163}, { 0.058258264500435759613979782}, - { 0.996044700901251989887944810}, { 0.088853552582524596561586535}, - {-0.088853552582524596561586535}, { 0.996044700901251989887944810}, - { 0.641481012808583151988739898}, { 0.767138911935820381181694573}, - {-0.767138911935820381181694573}, { 0.641481012808583151988739898}, - { 0.886222530148880631647990821}, { 0.463259783551860197390719637}, - {-0.463259783551860197390719637}, { 0.886222530148880631647990821}, - { 0.299079826308040476750336973}, { 0.954228095109105629780430732}, - {-0.954228095109105629780430732}, { 0.299079826308040476750336973}, - { 0.959571513081984528335528181}, { 0.281464937925757984095231007}, - {-0.281464937925757984095231007}, { 0.959571513081984528335528181}, - { 0.479493757660153026679839798}, { 0.877545290207261291668470750}, - {-0.877545290207261291668470750}, { 0.479493757660153026679839798}, - { 0.778816512381475953374724325}, { 0.627251815495144113509622565}, - {-0.627251815495144113509622565}, { 0.778816512381475953374724325}, - { 0.107172424956808849175529148}, { 0.994240449453187946358413442}, - {-0.994240449453187946358413442}, { 0.107172424956808849175529148}, - { 0.982539302287441255907040396}, { 0.186055151663446648105438304}, - {-0.186055151663446648105438304}, { 0.982539302287441255907040396}, - { 0.563199344013834115007363772}, { 0.826321062845663480311195452}, - {-0.826321062845663480311195452}, { 0.563199344013834115007363772}, - { 0.836547727223511984524285790}, { 0.547894059173100165608820571}, - {-0.547894059173100165608820571}, { 0.836547727223511984524285790}, - { 0.204108966092816874181696950}, { 0.978948175319062194715480124}, - {-0.978948175319062194715480124}, { 0.204108966092816874181696950}, - { 0.927362525650401087274536959}, { 0.374164062971457997104393020}, - {-0.374164062971457997104393020}, { 0.927362525650401087274536959}, - { 0.391170384302253888687512949}, { 
0.920318276709110566440076541}, - {-0.920318276709110566440076541}, { 0.391170384302253888687512949}, - { 0.713584868780793592903125099}, { 0.700568793943248366792866380}, - {-0.700568793943248366792866380}, { 0.713584868780793592903125099}, - { 0.009203754782059819315102378}, { 0.999957644551963866333120920}, - {-0.999957644551963866333120920}, { 0.009203754782059819315102378}, - { 0.999957644551963866333120920}, { 0.009203754782059819315102378}, - {-0.009203754782059819315102378}, { 0.999957644551963866333120920}, - { 0.700568793943248366792866380}, { 0.713584868780793592903125099}, - {-0.713584868780793592903125099}, { 0.700568793943248366792866380}, - { 0.920318276709110566440076541}, { 0.391170384302253888687512949}, - {-0.391170384302253888687512949}, { 0.920318276709110566440076541}, - { 0.374164062971457997104393020}, { 0.927362525650401087274536959}, - {-0.927362525650401087274536959}, { 0.374164062971457997104393020}, - { 0.978948175319062194715480124}, { 0.204108966092816874181696950}, - {-0.204108966092816874181696950}, { 0.978948175319062194715480124}, - { 0.547894059173100165608820571}, { 0.836547727223511984524285790}, - {-0.836547727223511984524285790}, { 0.547894059173100165608820571}, - { 0.826321062845663480311195452}, { 0.563199344013834115007363772}, - {-0.563199344013834115007363772}, { 0.826321062845663480311195452}, - { 0.186055151663446648105438304}, { 0.982539302287441255907040396}, - {-0.982539302287441255907040396}, { 0.186055151663446648105438304}, - { 0.994240449453187946358413442}, { 0.107172424956808849175529148}, - {-0.107172424956808849175529148}, { 0.994240449453187946358413442}, - { 0.627251815495144113509622565}, { 0.778816512381475953374724325}, - {-0.778816512381475953374724325}, { 0.627251815495144113509622565}, - { 0.877545290207261291668470750}, { 0.479493757660153026679839798}, - {-0.479493757660153026679839798}, { 0.877545290207261291668470750}, - { 0.281464937925757984095231007}, { 0.959571513081984528335528181}, - 
{-0.959571513081984528335528181}, { 0.281464937925757984095231007}, - { 0.954228095109105629780430732}, { 0.299079826308040476750336973}, - {-0.299079826308040476750336973}, { 0.954228095109105629780430732}, - { 0.463259783551860197390719637}, { 0.886222530148880631647990821}, - {-0.886222530148880631647990821}, { 0.463259783551860197390719637}, - { 0.767138911935820381181694573}, { 0.641481012808583151988739898}, - {-0.641481012808583151988739898}, { 0.767138911935820381181694573}, - { 0.088853552582524596561586535}, { 0.996044700901251989887944810}, - {-0.996044700901251989887944810}, { 0.088853552582524596561586535}, - { 0.998301544933892840738782163}, { 0.058258264500435759613979782}, - {-0.058258264500435759613979782}, { 0.998301544933892840738782163}, - { 0.664710978203344868130324985}, { 0.747100605980180144323078847}, - {-0.747100605980180144323078847}, { 0.664710978203344868130324985}, - { 0.900015892016160228714535267}, { 0.435857079922255491032544080}, - {-0.435857079922255491032544080}, { 0.900015892016160228714535267}, - { 0.328209843579092526107916817}, { 0.944604837261480265659265493}, - {-0.944604837261480265659265493}, { 0.328209843579092526107916817}, - { 0.967753837093475465243391912}, { 0.251897818154216950498106628}, - {-0.251897818154216950498106628}, { 0.967753837093475465243391912}, - { 0.506186645345155291048942344}, { 0.862423956111040538690933878}, - {-0.862423956111040538690933878}, { 0.506186645345155291048942344}, - { 0.797690840943391108362662755}, { 0.603066598540348201693430617}, - {-0.603066598540348201693430617}, { 0.797690840943391108362662755}, - { 0.137620121586486044948441663}, { 0.990485084256457037998682243}, - {-0.990485084256457037998682243}, { 0.137620121586486044948441663}, - { 0.987784141644572154230969032}, { 0.155828397654265235743101486}, - {-0.155828397654265235743101486}, { 0.987784141644572154230969032}, - { 0.588281548222645304786439813}, { 0.808656181588174991946968128}, - {-0.808656181588174991946968128}, { 
0.588281548222645304786439813}, - { 0.852960604930363657746588082}, { 0.521975292937154342694258318}, - {-0.521975292937154342694258318}, { 0.852960604930363657746588082}, - { 0.234041958583543423191242045}, { 0.972226497078936305708321144}, - {-0.972226497078936305708321144}, { 0.234041958583543423191242045}, - { 0.938403534063108112192420774}, { 0.345541324963989065539191723}, - {-0.345541324963989065539191723}, { 0.938403534063108112192420774}, - { 0.419216888363223956433010020}, { 0.907886116487666212038681480}, - {-0.907886116487666212038681480}, { 0.419216888363223956433010020}, - { 0.734738878095963464563223604}, { 0.678350043129861486873655042}, - {-0.678350043129861486873655042}, { 0.734738878095963464563223604}, - { 0.039872927587739811128578738}, { 0.999204758618363895492950001}, - {-0.999204758618363895492950001}, { 0.039872927587739811128578738}, - { 0.999430604555461772019008327}, { 0.033741171851377584833716112}, - {-0.033741171851377584833716112}, { 0.999430604555461772019008327}, - { 0.682845546385248068164596123}, { 0.730562769227827561177758850}, - {-0.730562769227827561177758850}, { 0.682845546385248068164596123}, - { 0.910441292258067196934095369}, { 0.413638312238434547471944324}, - {-0.413638312238434547471944324}, { 0.910441292258067196934095369}, - { 0.351292756085567125601307623}, { 0.936265667170278246576310996}, - {-0.936265667170278246576310996}, { 0.351292756085567125601307623}, - { 0.973644249650811925318383912}, { 0.228072083170885739254457379}, - {-0.228072083170885739254457379}, { 0.973644249650811925318383912}, - { 0.527199134781901348464274575}, { 0.849741768000852489471268395}, - {-0.849741768000852489471268395}, { 0.527199134781901348464274575}, - { 0.812250586585203913049744181}, { 0.583308652937698294392830961}, - {-0.583308652937698294392830961}, { 0.812250586585203913049744181}, - { 0.161886393780111837641387995}, { 0.986809401814185476970235952}, - {-0.986809401814185476970235952}, { 0.161886393780111837641387995}, - { 
0.991310859846115418957349799}, { 0.131540028702883111103387493}, - {-0.131540028702883111103387493}, { 0.991310859846115418957349799}, - { 0.607949784967773667243642671}, { 0.793975477554337164895083757}, - {-0.793975477554337164895083757}, { 0.607949784967773667243642671}, - { 0.865513624090569082825488358}, { 0.500885382611240786241285004}, - {-0.500885382611240786241285004}, { 0.865513624090569082825488358}, - { 0.257831102162159005614471295}, { 0.966190003445412555433832961}, - {-0.966190003445412555433832961}, { 0.257831102162159005614471295}, - { 0.946600913083283570044599823}, { 0.322407678801069848384807478}, - {-0.322407678801069848384807478}, { 0.946600913083283570044599823}, - { 0.441371268731716692879988968}, { 0.897324580705418281231391836}, - {-0.897324580705418281231391836}, { 0.441371268731716692879988968}, - { 0.751165131909686411205819422}, { 0.660114342067420478559490747}, - {-0.660114342067420478559490747}, { 0.751165131909686411205819422}, - { 0.064382630929857460819324537}, { 0.997925286198596012623025462}, - {-0.997925286198596012623025462}, { 0.064382630929857460819324537}, - { 0.996571145790554847093566910}, { 0.082740264549375693111987083}, - {-0.082740264549375693111987083}, { 0.996571145790554847093566910}, - { 0.646176012983316364832802220}, { 0.763188417263381271704838297}, - {-0.763188417263381271704838297}, { 0.646176012983316364832802220}, - { 0.889048355854664562540777729}, { 0.457813303598877221904961155}, - {-0.457813303598877221904961155}, { 0.889048355854664562540777729}, - { 0.304929229735402406490728633}, { 0.952375012719765858529893608}, - {-0.952375012719765858529893608}, { 0.304929229735402406490728633}, - { 0.961280485811320641748659653}, { 0.275571819310958163076425168}, - {-0.275571819310958163076425168}, { 0.961280485811320641748659653}, - { 0.484869248000791101822951699}, { 0.874586652278176112634431897}, - {-0.874586652278176112634431897}, { 0.484869248000791101822951699}, - { 0.782650596166575738458949301}, { 
0.622461279374149972519166721}, - {-0.622461279374149972519166721}, { 0.782650596166575738458949301}, - { 0.113270952177564349018228733}, { 0.993564135520595333782021697}, - {-0.993564135520595333782021697}, { 0.113270952177564349018228733}, - { 0.983662419211730274396237776}, { 0.180022901405699522679906590}, - {-0.180022901405699522679906590}, { 0.983662419211730274396237776}, - { 0.568258952670131549790548489}, { 0.822849781375826332046780034}, - {-0.822849781375826332046780034}, { 0.568258952670131549790548489}, - { 0.839893794195999504583383987}, { 0.542750784864515906586768661}, - {-0.542750784864515906586768661}, { 0.839893794195999504583383987}, - { 0.210111836880469621717489972}, { 0.977677357824509979943404762}, - {-0.977677357824509979943404762}, { 0.210111836880469621717489972}, - { 0.929640895843181265457918066}, { 0.368466829953372331712746222}, - {-0.368466829953372331712746222}, { 0.929640895843181265457918066}, - { 0.396809987416710328595290911}, { 0.917900775621390457642276297}, - {-0.917900775621390457642276297}, { 0.396809987416710328595290911}, - { 0.717870045055731736211325329}, { 0.696177131491462944788582591}, - {-0.696177131491462944788582591}, { 0.717870045055731736211325329}, - { 0.015339206284988101044151868}, { 0.999882347454212525633049627}, - {-0.999882347454212525633049627}, { 0.015339206284988101044151868}, - { 0.999769405351215321657617036}, { 0.021474080275469507418374898}, - {-0.021474080275469507418374898}, { 0.999769405351215321657617036}, - { 0.691759258364157774906734132}, { 0.722128193929215321243607198}, - {-0.722128193929215321243607198}, { 0.691759258364157774906734132}, - { 0.915448716088267819566431292}, { 0.402434650859418441082533934}, - {-0.402434650859418441082533934}, { 0.915448716088267819566431292}, - { 0.362755724367397216204854462}, { 0.931884265581668106718557199}, - {-0.931884265581668106718557199}, { 0.362755724367397216204854462}, - { 0.976369731330021149312732194}, { 0.216106797076219509948385131}, - 
{-0.216106797076219509948385131}, { 0.976369731330021149312732194}, - { 0.537587076295645482502214932}, { 0.843208239641845437161743865}, - {-0.843208239641845437161743865}, { 0.537587076295645482502214932}, - { 0.819347520076796960824689637}, { 0.573297166698042212820171239}, - {-0.573297166698042212820171239}, { 0.819347520076796960824689637}, - { 0.173983873387463827950700807}, { 0.984748501801904218556553176}, - {-0.984748501801904218556553176}, { 0.173983873387463827950700807}, - { 0.992850414459865090793563344}, { 0.119365214810991364593637790}, - {-0.119365214810991364593637790}, { 0.992850414459865090793563344}, - { 0.617647307937803932403979402}, { 0.786455213599085757522319464}, - {-0.786455213599085757522319464}, { 0.617647307937803932403979402}, - { 0.871595086655951034842481435}, { 0.490226483288291154229598449}, - {-0.490226483288291154229598449}, { 0.871595086655951034842481435}, - { 0.269668325572915106525464462}, { 0.962953266873683886347921481}, - {-0.962953266873683886347921481}, { 0.269668325572915106525464462}, - { 0.950486073949481721759926101}, { 0.310767152749611495835997250}, - {-0.310767152749611495835997250}, { 0.950486073949481721759926101}, - { 0.452349587233770874133026703}, { 0.891840709392342727796478697}, - {-0.891840709392342727796478697}, { 0.452349587233770874133026703}, - { 0.759209188978388033485525443}, { 0.650846684996380915068975573}, - {-0.650846684996380915068975573}, { 0.759209188978388033485525443}, - { 0.076623861392031492278332463}, { 0.997060070339482978987989949}, - {-0.997060070339482978987989949}, { 0.076623861392031492278332463}, - { 0.997511456140303459699448390}, { 0.070504573389613863027351471}, - {-0.070504573389613863027351471}, { 0.997511456140303459699448390}, - { 0.655492852999615385312679701}, { 0.755201376896536527598710756}, - {-0.755201376896536527598710756}, { 0.655492852999615385312679701}, - { 0.894599485631382678433072126}, { 0.446868840162374195353044389}, - {-0.446868840162374195353044389}, { 
0.894599485631382678433072126}, - { 0.316593375556165867243047035}, { 0.948561349915730288158494826}, - {-0.948561349915730288158494826}, { 0.316593375556165867243047035}, - { 0.964589793289812723836432159}, { 0.263754678974831383611349322}, - {-0.263754678974831383611349322}, { 0.964589793289812723836432159}, - { 0.495565261825772531150266670}, { 0.868570705971340895340449876}, - {-0.868570705971340895340449876}, { 0.495565261825772531150266670}, - { 0.790230221437310055030217152}, { 0.612810082429409703935211936}, - {-0.612810082429409703935211936}, { 0.790230221437310055030217152}, - { 0.125454983411546238542336453}, { 0.992099313142191757112085445}, - {-0.992099313142191757112085445}, { 0.125454983411546238542336453}, - { 0.985797509167567424700995000}, { 0.167938294974731178054745536}, - {-0.167938294974731178054745536}, { 0.985797509167567424700995000}, - { 0.578313796411655563342245019}, { 0.815814410806733789010772660}, - {-0.815814410806733789010772660}, { 0.578313796411655563342245019}, - { 0.846490938774052078300544488}, { 0.532403127877197971442805218}, - {-0.532403127877197971442805218}, { 0.846490938774052078300544488}, - { 0.222093620973203534094094721}, { 0.975025345066994146844913468}, - {-0.975025345066994146844913468}, { 0.222093620973203534094094721}, - { 0.934092550404258914729877883}, { 0.357030961233430032614954036}, - {-0.357030961233430032614954036}, { 0.934092550404258914729877883}, - { 0.408044162864978680820747499}, { 0.912962190428398164628018233}, - {-0.912962190428398164628018233}, { 0.408044162864978680820747499}, - { 0.726359155084345976817494315}, { 0.687315340891759108199186948}, - {-0.687315340891759108199186948}, { 0.726359155084345976817494315}, - { 0.027608145778965741612354872}, { 0.999618822495178597116830637}, - {-0.999618822495178597116830637}, { 0.027608145778965741612354872}, - { 0.998941293186856850633930266}, { 0.046003182130914628814301788}, - {-0.046003182130914628814301788}, { 0.998941293186856850633930266}, - { 
0.673829000378756060917568372}, { 0.738887324460615147933116508}, - {-0.738887324460615147933116508}, { 0.673829000378756060917568372}, - { 0.905296759318118774354048329}, { 0.424779681209108833357226189}, - {-0.424779681209108833357226189}, { 0.905296759318118774354048329}, - { 0.339776884406826857828825803}, { 0.940506070593268323787291309}, - {-0.940506070593268323787291309}, { 0.339776884406826857828825803}, - { 0.970772140728950302138169611}, { 0.240003022448741486568922365}, - {-0.240003022448741486568922365}, { 0.970772140728950302138169611}, - { 0.516731799017649881508753876}, { 0.856147328375194481019630732}, - {-0.856147328375194481019630732}, { 0.516731799017649881508753876}, - { 0.805031331142963597922659282}, { 0.593232295039799808047809426}, - {-0.593232295039799808047809426}, { 0.805031331142963597922659282}, - { 0.149764534677321517229695737}, { 0.988721691960323767604516485}, - {-0.988721691960323767604516485}, { 0.149764534677321517229695737}, - { 0.989622017463200834623694454}, { 0.143695033150294454819773349}, - {-0.143695033150294454819773349}, { 0.989622017463200834623694454}, - { 0.598160706996342311724958652}, { 0.801376171723140219430247777}, - {-0.801376171723140219430247777}, { 0.598160706996342311724958652}, - { 0.859301818357008404783582139}, { 0.511468850437970399504391001}, - {-0.511468850437970399504391001}, { 0.859301818357008404783582139}, - { 0.245955050335794611599924709}, { 0.969281235356548486048290738}, - {-0.969281235356548486048290738}, { 0.245955050335794611599924709}, - { 0.942573197601446879280758735}, { 0.333999651442009404650865481}, - {-0.333999651442009404650865481}, { 0.942573197601446879280758735}, - { 0.430326481340082633908199031}, { 0.902673318237258806751502391}, - {-0.902673318237258806751502391}, { 0.430326481340082633908199031}, - { 0.743007952135121693517362293}, { 0.669282588346636065720696366}, - {-0.669282588346636065720696366}, { 0.743007952135121693517362293}, - { 0.052131704680283321236358216}, { 
0.998640218180265222418199049}, - {-0.998640218180265222418199049}, { 0.052131704680283321236358216}, - { 0.995480755491926941769171600}, { 0.094963495329638998938034312}, - {-0.094963495329638998938034312}, { 0.995480755491926941769171600}, - { 0.636761861236284230413943435}, { 0.771060524261813773200605759}, - {-0.771060524261813773200605759}, { 0.636761861236284230413943435}, - { 0.883363338665731594736308015}, { 0.468688822035827933697617870}, - {-0.468688822035827933697617870}, { 0.883363338665731594736308015}, - { 0.293219162694258650606608599}, { 0.956045251349996443270479823}, - {-0.956045251349996443270479823}, { 0.293219162694258650606608599}, - { 0.957826413027532890321037029}, { 0.287347459544729526477331841}, - {-0.287347459544729526477331841}, { 0.957826413027532890321037029}, - { 0.474100214650550014398580015}, { 0.880470889052160770806542929}, - {-0.880470889052160770806542929}, { 0.474100214650550014398580015}, - { 0.774953106594873878359129282}, { 0.632018735939809021909403706}, - {-0.632018735939809021909403706}, { 0.774953106594873878359129282}, - { 0.101069862754827824987887585}, { 0.994879330794805620591166107}, - {-0.994879330794805620591166107}, { 0.101069862754827824987887585}, - { 0.981379193313754574318224190}, { 0.192080397049892441679288205}, - {-0.192080397049892441679288205}, { 0.981379193313754574318224190}, - { 0.558118531220556115693702964}, { 0.829761233794523042469023765}, - {-0.829761233794523042469023765}, { 0.558118531220556115693702964}, - { 0.833170164701913186439915922}, { 0.553016705580027531764226988}, - {-0.553016705580027531764226988}, { 0.833170164701913186439915922}, - { 0.198098410717953586179324918}, { 0.980182135968117392690210009}, - {-0.980182135968117392690210009}, { 0.198098410717953586179324918}, - { 0.925049240782677590302371869}, { 0.379847208924051170576281147}, - {-0.379847208924051170576281147}, { 0.925049240782677590302371869}, - { 0.385516053843918864075607949}, { 0.922701128333878570437264227}, - 
{-0.922701128333878570437264227}, { 0.385516053843918864075607949}, - { 0.709272826438865651316533772}, { 0.704934080375904908852523758}, - {-0.704934080375904908852523758}, { 0.709272826438865651316533772}, - { 0.003067956762965976270145365}, { 0.999995293809576171511580126}, - {-0.999995293809576171511580126}, { 0.003067956762965976270145365} -}; - -const fpr fpr_p2_tab[] = { - { 2.00000000000 }, - { 1.00000000000 }, - { 0.50000000000 }, - { 0.25000000000 }, - { 0.12500000000 }, - { 0.06250000000 }, - { 0.03125000000 }, - { 0.01562500000 }, - { 0.00781250000 }, - { 0.00390625000 }, - { 0.00195312500 } -}; - -#else // yyyFPNATIVE+0 yyyFPEMU+0 - -#error No FP implementation selected - -#endif // yyyFPNATIVE- yyyFPEMU- diff --git a/fpr.h b/fpr.h index c55b956..7a14e9a 100644 --- a/fpr.h +++ b/fpr.h @@ -29,8 +29,6 @@ * @author Thomas Pornin */ -#if FALCON_FPEMU // yyyFPEMU+1 yyyFPNATIVE+0 - /* ====================================================================== */ /* * Custom floating-point implementation with integer arithmetics. We @@ -492,448 +490,3 @@ extern const fpr fpr_gm_tab[]; #define fpr_p2_tab Zf(fpr_p2_tab) extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1 - -/* ====================================================================== */ - -#include - -/* - * We wrap the native 'double' type into a structure so that the C compiler - * complains if we inadvertently use raw arithmetic operators on the 'fpr' - * type instead of using the inline functions below. This should have no - * extra runtime cost, since all the functions below are 'inline'. 
- */ -typedef struct { double v; } fpr; - -static inline fpr -FPR(double v) -{ - fpr x; - - x.v = v; - return x; -} - -static inline fpr -fpr_of(int64_t i) -{ - return FPR((double)i); -} - -static const fpr fpr_q = { 12289.0 }; -static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; -static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 }; -static const fpr fpr_inv_sigma[] = { - { 0.0 }, /* unused */ - { 0.0069054793295940891952143765991630516 }, - { 0.0068102267767177975961393730687908629 }, - { 0.0067188101910722710707826117910434131 }, - { 0.0065883354370073665545865037227681924 }, - { 0.0064651781207602900738053897763485516 }, - { 0.0063486788828078995327741182928037856 }, - { 0.0062382586529084374473367528433697537 }, - { 0.0061334065020930261548984001431770281 }, - { 0.0060336696681577241031668062510953022 }, - { 0.0059386453095331159950250124336477482 } -}; -static const fpr fpr_sigma_min[] = { - { 0.0 }, /* unused */ - { 1.1165085072329102588881898380334015 }, - { 1.1321247692325272405718031785357108 }, - { 1.1475285353733668684571123112513188 }, - { 1.1702540788534828939713084716509250 }, - { 1.1925466358390344011122170489094133 }, - { 1.2144300507766139921088487776957699 }, - { 1.2359260567719808790104525941706723 }, - { 1.2570545284063214162779743112075080 }, - { 1.2778336969128335860256340575729042 }, - { 1.2982803343442918539708792538826807 } -}; -static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; -static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; -static const fpr fpr_bnorm_max = { 16822.4121 }; -static const fpr fpr_zero = { 0.0 }; -static const fpr fpr_one = { 1.0 }; -static const fpr fpr_two = { 2.0 }; -static const fpr fpr_onehalf = { 0.5 }; -static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; -static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; -static const fpr fpr_ptwo31 = { 2147483648.0 }; -static const fpr fpr_ptwo31m1 = { 2147483647.0 }; -static const fpr 
fpr_mtwo31m1 = { -2147483647.0 }; -static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; -static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; -static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; - -static inline int64_t -fpr_rint(fpr x) -{ - /* - * We do not want to use llrint() since it might be not - * constant-time. - * - * Suppose that x >= 0. If x >= 2^52, then it is already an - * integer. Otherwise, if x < 2^52, then computing x+2^52 will - * yield a value that will be rounded to the nearest integer - * with exactly the right rules (round-to-nearest-even). - * - * In order to have constant-time processing, we must do the - * computation for both x >= 0 and x < 0 cases, and use a - * cast to an integer to access the sign and select the proper - * value. Such casts also allow us to find out if |x| < 2^52. - */ - int64_t sx, tx, rp, rn, m; - uint32_t ub; - - sx = (int64_t)(x.v - 1.0); - tx = (int64_t)x.v; - rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; - rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; - - /* - * If tx >= 2^52 or tx < -2^52, then result is tx. - * Otherwise, if sx >= 0, then result is rp. - * Otherwise, result is rn. We use the fact that when x is - * close to 0 (|x| <= 0.25) then both rp and rn are correct; - * and if x is not close to 0, then trunc(x-1.0) yields the - * appropriate sign. - */ - - /* - * Clamp rp to zero if tx < 0. - * Clamp rn to zero if tx >= 0. - */ - m = sx >> 63; - rn &= m; - rp &= ~m; - - /* - * Get the 12 upper bits of tx; if they are not all zeros or - * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both - * rp and rn to zero. Otherwise, we clamp tx to zero. - */ - ub = (uint32_t)((uint64_t)tx >> 52); - m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); - rp &= m; - rn &= m; - tx &= ~m; - - /* - * Only one of tx, rn or rp (at most) can be non-zero at this - * point. 
- */ - return tx | rn | rp; -} - -static inline int64_t -fpr_floor(fpr x) -{ - int64_t r; - - /* - * The cast performs a trunc() (rounding toward 0) and thus is - * wrong by 1 for most negative values. The correction below is - * constant-time as long as the compiler turns the - * floating-point conversion result into a 0/1 integer without a - * conditional branch or another non-constant-time construction. - * This should hold on all modern architectures with an FPU (and - * if it is false on a given arch, then chances are that the FPU - * itself is not constant-time, making the point moot). - */ - r = (int64_t)x.v; - return r - (x.v < (double)r); -} - -static inline int64_t -fpr_trunc(fpr x) -{ - return (int64_t)x.v; -} - -static inline fpr -fpr_add(fpr x, fpr y) -{ - return FPR(x.v + y.v); -} - -static inline fpr -fpr_sub(fpr x, fpr y) -{ - return FPR(x.v - y.v); -} - -static inline fpr -fpr_neg(fpr x) -{ - return FPR(-x.v); -} - -static inline fpr -fpr_half(fpr x) -{ - return FPR(x.v * 0.5); -} - -static inline fpr -fpr_double(fpr x) -{ - return FPR(x.v + x.v); -} - -static inline fpr -fpr_mul(fpr x, fpr y) -{ - return FPR(x.v * y.v); -} - -static inline fpr -fpr_sqr(fpr x) -{ - return FPR(x.v * x.v); -} - -static inline fpr -fpr_inv(fpr x) -{ - return FPR(1.0 / x.v); -} - -static inline fpr -fpr_div(fpr x, fpr y) -{ - return FPR(x.v / y.v); -} - -#if FALCON_AVX2 // yyyAVX2+1 -TARGET_AVX2 -static inline void -fpr_sqrt_avx2(double *t) -{ - __m128d x; - - x = _mm_load1_pd(t); - x = _mm_sqrt_pd(x); - _mm_storel_pd(t, x); -} -#endif // yyyAVX2- - -static inline fpr -fpr_sqrt(fpr x) -{ - /* - * We prefer not to have a dependency on libm when it can be - * avoided. On x86, calling the sqrt() libm function inlines - * the relevant opcode (fsqrt or sqrtsd, depending on whether - * the 387 FPU or SSE2 is used for floating-point operations) - * but then makes an optional call to the library function - * for proper error handling, in case the operand is negative. 
- * - * To avoid this dependency, we use intrinsics or inline assembly - * on recognized platforms: - * - * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. - * - * - On GCC/Clang with SSE maths, we use SSE2 intrinsics. - * - * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly - * to call the 387 FPU fsqrt opcode. - * - * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call - * the fsqrt opcode (Clang needs a special hack). - * - * - On GCC/Clang on ARM with hardware floating-point, we use - * inline assembly to call the vqsrt.f64 opcode. Due to a - * complex ecosystem of compilers and assembly syntaxes, we - * have to call it "fsqrt" or "fsqrtd", depending on case. - * - * If the platform is not recognized, a call to the system - * library function sqrt() is performed. On some compilers, this - * may actually inline the relevant opcode, and call the library - * function only when the input is invalid (e.g. negative); - * Falcon never actually calls sqrt() on a negative value, but - * the dependency to libm will still be there. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - fpr_sqrt_avx2(&x.v); - return x; -#else // yyyAVX2+0 -#if defined __GNUC__ && defined __SSE2_MATH__ - return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v)))); -#elif defined __GNUC__ && defined __i386__ - __asm__ __volatile__ ( - "fldl %0\n\t" - "fsqrt\n\t" - "fstpl %0\n\t" - : "+m" (x.v) : : ); - return x; -#elif defined _M_IX86 - __asm { - fld x.v - fsqrt - fstp x.v - } - return x; -#elif defined __PPC__ && defined __GNUC__ - fpr y; - -#if defined __clang__ - /* - * Normally we should use a 'd' constraint (register that contains - * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use - * an 'f' constraint, counting on the fact that 'float' values - * are managed in double-precision registers anyway, and the - * compiler will not add extra rounding steps. 
- */ - __asm__ ( "fsqrt %0, %1" : "=f" (y.v) : "f" (x.v) : ); -#else - __asm__ ( "fsqrt %0, %1" : "=d" (y.v) : "d" (x.v) : ); -#endif - return y; -#elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) - /* - * On ARM, assembly syntaxes are a bit of a mess, depending on - * whether GCC or Clang is used, and the binutils version, and - * whether this is 32-bit or 64-bit mode. The code below appears - * to work on: - * 32-bit GCC-4.9.2 Clang-3.5 Binutils-2.25 - * 64-bit GCC-6.3.0 Clang-3.9 Binutils-2.28 - */ -#if defined __aarch64__ && __aarch64__ - __asm__ ( "fsqrt %d0, %d0" : "+w" (x.v) : : ); -#else - __asm__ ( "fsqrtd %P0, %P0" : "+w" (x.v) : : ); -#endif - return x; -#else - return FPR(sqrt(x.v)); -#endif -#endif // yyyAVX2- -} - -static inline int -fpr_lt(fpr x, fpr y) -{ - return x.v < y.v; -} - -TARGET_AVX2 -static inline uint64_t -fpr_expm_p63(fpr x, fpr ccs) -{ - /* - * Polynomial approximation of exp(-x) is taken from FACCT: - * https://eprint.iacr.org/2018/1234 - * Specifically, values are extracted from the implementation - * referenced from the FACCT article, and available at: - * https://github.com/raykzhao/gaussian - * Tests over more than 24 billions of random inputs in the - * 0..log(2) range have never shown a deviation larger than - * 2^(-50) from the true mathematical value. - */ - -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * AVX2 implementation uses more operations than Horner's method, - * but with a lower expression tree depth. This helps because - * additions and multiplications have a latency of 4 cycles on - * a Skylake, but the CPU can issue two of them per cycle. 
- */ - - static const union { - double d[12]; - __m256d v[3]; - } c = { - { - 0.999999999999994892974086724280, - 0.500000000000019206858326015208, - 0.166666666666984014666397229121, - 0.041666666666110491190622155955, - 0.008333333327800835146903501993, - 0.001388888894063186997887560103, - 0.000198412739277311890541063977, - 0.000024801566833585381209939524, - 0.000002755586350219122514855659, - 0.000000275607356160477811864927, - 0.000000025299506379442070029551, - 0.000000002073772366009083061987 - } - }; - - double d1, d2, d4, d8, y; - __m256d d14, d58, d9c; - - d1 = -x.v; - d2 = d1 * d1; - d4 = d2 * d2; - d8 = d4 * d4; - d14 = _mm256_set_pd(d4, d2 * d1, d2, d1); - d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4)); - d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8)); - d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0])); - d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14); - d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58); - d9c = _mm256_hadd_pd(d9c, d9c); - y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c) - + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1)); - y *= ccs.v; - - /* - * Final conversion goes through int64_t first, because that's what - * the underlying opcode (vcvttsd2si) will do, and we know that the - * result will fit, since x >= 0 and ccs < 1. If we did the - * conversion directly to uint64_t, then the compiler would add some - * extra code to cover the case of a source value of 2^63 or more, - * and though the alternate path would never be exercised, the - * extra comparison would cost us some cycles. - */ - return (uint64_t)(int64_t)(y * fpr_ptwo63.v); - -#else // yyyAVX2+0 - - /* - * Normal implementation uses Horner's method, which minimizes - * the number of operations. 
- */ - - double d, y; - - d = x.v; - y = 0.000000002073772366009083061987; - y = 0.000000025299506379442070029551 - y * d; - y = 0.000000275607356160477811864927 - y * d; - y = 0.000002755586350219122514855659 - y * d; - y = 0.000024801566833585381209939524 - y * d; - y = 0.000198412739277311890541063977 - y * d; - y = 0.001388888894063186997887560103 - y * d; - y = 0.008333333327800835146903501993 - y * d; - y = 0.041666666666110491190622155955 - y * d; - y = 0.166666666666984014666397229121 - y * d; - y = 0.500000000000019206858326015208 - y * d; - y = 0.999999999999994892974086724280 - y * d; - y = 1.000000000000000000000000000000 - y * d; - y *= ccs.v; - return (uint64_t)(y * fpr_ptwo63.v); - -#endif // yyyAVX2- -} - -#define fpr_gm_tab Zf(fpr_gm_tab) -extern const fpr fpr_gm_tab[]; - -#define fpr_p2_tab Zf(fpr_p2_tab) -extern const fpr fpr_p2_tab[]; - -/* ====================================================================== */ - -#else // yyyFPEMU+0 yyyFPNATIVE+0 - -#error No FP implementation selected - -#endif // yyyFPEMU- yyyFPNATIVE- diff --git a/inner.h b/inner.h index 88a8448..b1adc98 100644 --- a/inner.h +++ b/inner.h @@ -73,46 +73,10 @@ * function does nothing, so it can be called systematically. */ -// yyyPQCLEAN+0 yyyNIST+0 yyySUPERCOP+0 -#include "config.h" -// yyyPQCLEAN- yyyNIST- yyySUPERCOP- -// yyySUPERCOP+1 -// yyyCONF* -// yyySUPERCOP- - #include #include #include -#if defined FALCON_AVX2 && FALCON_AVX2 // yyyAVX2+1 -/* - * This implementation uses AVX2 and optionally FMA intrinsics. 
- */ -#include -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 1 -#endif -#if defined __GNUC__ -#if defined FALCON_FMA && FALCON_FMA -#define TARGET_AVX2 __attribute__((target("avx2,fma"))) -#else -#define TARGET_AVX2 __attribute__((target("avx2"))) -#endif -#elif defined _MSC_VER && _MSC_VER -#pragma warning( disable : 4752 ) -#endif -#if defined FALCON_FMA && FALCON_FMA -#define FMADD(a, b, c) _mm256_fmadd_pd(a, b, c) -#define FMSUB(a, b, c) _mm256_fmsub_pd(a, b, c) -#else -#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c) -#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c) -#endif -#endif // yyyAVX2- - // yyyNIST+0 yyyPQCLEAN+0 /* * On MSVC, disable warning about applying unary minus on an unsigned @@ -123,20 +87,6 @@ #pragma warning( disable : 4146 ) #endif -// yyySUPERCOP+0 -/* - * Enable ARM assembly on any ARMv7m platform (if it was not done before). - */ -#ifndef FALCON_ASM_CORTEXM4 -#if (defined __ARM_ARCH_7EM__ && __ARM_ARCH_7EM__) \ - && (defined __ARM_FEATURE_DSP && __ARM_FEATURE_DSP) -#define FALCON_ASM_CORTEXM4 1 -#else -#define FALCON_ASM_CORTEXM4 0 -#endif -#endif -// yyySUPERCOP- - #if defined __i386__ || defined _M_IX86 \ || defined __x86_64__ || defined _M_X64 || \ (defined _ARCH_PWR8 && \ @@ -149,15 +99,6 @@ #define FALCON_UNALIGNED 1 #endif -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 - -#ifndef FALCON_LE -#define FALCON_LE 1 -#endif -#ifndef FALCON_UNALIGNED -#define FALCON_UNALIGNED 0 -#endif - #elif (defined __LITTLE_ENDIAN__ && __LITTLE_ENDIAN__) \ || (defined __BYTE_ORDER__ && defined __ORDER_LITTLE_ENDIAN__ \ && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) @@ -180,48 +121,6 @@ #endif -/* - * We ensure that both FALCON_FPEMU and FALCON_FPNATIVE are defined, - * with compatible values (exactly one of them must be non-zero). - * If none is defined, then default FP implementation is 'native' - * except on ARM Cortex M4. 
- */ -#if !defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \ - || (!defined __ARM_FP && defined __ARM_VFPV2__) -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4 -#define FALCON_FPEMU 1 -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPEMU 0 -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPEMU && !defined FALCON_FPNATIVE - -#if FALCON_FPEMU -#define FALCON_FPNATIVE 0 -#else -#define FALCON_FPNATIVE 1 -#endif - -#elif defined FALCON_FPNATIVE && !defined FALCON_FPEMU - -#if FALCON_FPNATIVE -#define FALCON_FPEMU 0 -#else -#define FALCON_FPEMU 1 -#endif - -#endif - -#if (FALCON_FPEMU && FALCON_FPNATIVE) || (!FALCON_FPEMU && !FALCON_FPNATIVE) -#error Exactly one of FALCON_FPEMU and FALCON_FPNATIVE must be selected -#endif - // yyySUPERCOP+0 /* * For seed generation from the operating system: @@ -271,12 +170,6 @@ * For still undefined compile-time macros, define them to 0 to avoid * warnings with -Wundef. */ -#ifndef FALCON_AVX2 -#define FALCON_AVX2 0 -#endif -#ifndef FALCON_FMA -#define FALCON_FMA 0 -#endif #ifndef FALCON_KG_CHACHA20 #define FALCON_KG_CHACHA20 0 #endif @@ -295,18 +188,6 @@ #define Zf__(prefix, name) prefix ## _ ## name // yyyPQCLEAN- yyySUPERCOP- -// yyyAVX2+1 -/* - * We use the TARGET_AVX2 macro to tag some functions which, in some - * configurations, may use AVX2 and FMA intrinsics; this depends on - * the compiler. In all other cases, we just define it to emptiness - * (i.e. it will have no effect). - */ -#ifndef TARGET_AVX2 -#define TARGET_AVX2 -#endif -// yyyAVX2- - /* * Some computations with floating-point elements, in particular * rounding to the nearest integer, rely on operations using _exactly_ @@ -323,74 +204,11 @@ * targets other than 32-bit x86, or when the native 'double' type is * not used, the set_fpu_cw() function does nothing at all. 
*/ -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -#if defined __GNUC__ && defined __i386__ -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm__ __volatile__ ("fstcw %0" : "=m" (t) : : ); - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm__ __volatile__ ("fldcw %0" : : "m" (t) : ); - return old; -} -#elif defined _M_IX86 -static inline unsigned -set_fpu_cw(unsigned x) -{ - unsigned short t; - unsigned old; - - __asm { fstcw t } - old = (t & 0x0300u) >> 8; - t = (unsigned short)((t & ~0x0300u) | (x << 8)); - __asm { fldcw t } - return old; -} -#else static inline unsigned set_fpu_cw(unsigned x) { return x; } -#endif -#else // yyyFPNATIVE+0 -static inline unsigned -set_fpu_cw(unsigned x) -{ - return x; -} -#endif // yyyFPNATIVE- - -#if FALCON_FPNATIVE && !FALCON_AVX2 // yyyFPNATIVE+1 yyyAVX2+0 -/* - * If using the native 'double' type but not AVX2 code, on an x86 - * machine with SSE2 activated for maths, then we will use the - * SSE2 intrinsics. - */ -#if defined __GNUC__ && defined __SSE2_MATH__ -#include -#endif -#endif // yyyFPNATIVE- yyyAVX2- - -#if FALCON_FPNATIVE // yyyFPNATIVE+1 -/* - * For optimal reproducibility of values, we need to disable contraction - * of floating-point expressions; otherwise, on some architectures (e.g. - * PowerPC), the compiler may generate fused-multiply-add opcodes that - * may round differently than two successive separate opcodes. C99 defines - * a standard pragma for that, but GCC-6.2.2 appears to ignore it, - * hence the GCC-specific pragma (that Clang does not support). 
- */ -#if defined __clang__ -#pragma STDC FP_CONTRACT OFF -#elif defined __GNUC__ -#pragma GCC optimize ("fp-contract=off") -#endif -#endif // yyyFPNATIVE- // yyyPQCLEAN+0 /* @@ -1155,10 +973,8 @@ typedef struct { fpr sigma_min; } sampler_context; -TARGET_AVX2 int Zf(sampler)(void *ctx, fpr mu, fpr isigma); -TARGET_AVX2 int Zf(gaussian0_sampler)(prng *p); /* ==================================================================== */ diff --git a/rng.c b/rng.c index d2ecb7a..a812a11 100644 --- a/rng.c +++ b/rng.c @@ -162,109 +162,9 @@ Zf(prng_init)(prng *p, inner_shake256_context *src) * * The block counter is XORed into the first 8 bytes of the IV. */ -TARGET_AVX2 void Zf(prng_refill)(prng *p) { -#if FALCON_AVX2 // yyyAVX2+1 - - static const uint32_t CW[] = { - 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - }; - - uint64_t cc; - size_t u; - int i; - uint32_t *sw; - union { - uint32_t w[16]; - __m256i y[2]; /* for alignment */ - } t; - __m256i state[16], init[16]; - - sw = (uint32_t *)p->state.d; - - /* - * XOR next counter values into state. - */ - cc = *(uint64_t *)(p->state.d + 48); - for (u = 0; u < 8; u ++) { - t.w[u] = (uint32_t)(cc + u); - t.w[u + 8] = (uint32_t)((cc + u) >> 32); - } - *(uint64_t *)(p->state.d + 48) = cc + 8; - - /* - * Load state. - */ - for (u = 0; u < 4; u ++) { - state[u] = init[u] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(CW[u])); - } - for (u = 0; u < 10; u ++) { - state[u + 4] = init[u + 4] = - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[u])); - } - state[14] = init[14] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[10])), - _mm256_loadu_si256((__m256i *)&t.w[0])); - state[15] = init[15] = _mm256_xor_si256( - _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[11])), - _mm256_loadu_si256((__m256i *)&t.w[8])); - - /* - * Do all rounds. 
- */ - for (i = 0; i < 10; i ++) { - -#define QROUND(a, b, c, d) do { \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 16), \ - _mm256_srli_epi32(state[d], 16)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 12), \ - _mm256_srli_epi32(state[b], 20)); \ - state[a] = _mm256_add_epi32(state[a], state[b]); \ - state[d] = _mm256_xor_si256(state[d], state[a]); \ - state[d] = _mm256_or_si256( \ - _mm256_slli_epi32(state[d], 8), \ - _mm256_srli_epi32(state[d], 24)); \ - state[c] = _mm256_add_epi32(state[c], state[d]); \ - state[b] = _mm256_xor_si256(state[b], state[c]); \ - state[b] = _mm256_or_si256( \ - _mm256_slli_epi32(state[b], 7), \ - _mm256_srli_epi32(state[b], 25)); \ - } while (0) - - QROUND( 0, 4, 8, 12); - QROUND( 1, 5, 9, 13); - QROUND( 2, 6, 10, 14); - QROUND( 3, 7, 11, 15); - QROUND( 0, 5, 10, 15); - QROUND( 1, 6, 11, 12); - QROUND( 2, 7, 8, 13); - QROUND( 3, 4, 9, 14); - -#undef QROUND - - } - - /* - * Add initial state back and encode the result in the destination - * buffer. We can dump the AVX2 values "as is" because the non-AVX2 - * code uses a compatible order of values. 
- */ - for (u = 0; u < 16; u ++) { - _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], - _mm256_add_epi32(state[u], init[u])); - } - -#else // yyyAVX2+0 - static const uint32_t CW[] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 }; @@ -349,8 +249,6 @@ Zf(prng_refill)(prng *p) } *(uint64_t *)(p->state.d + 48) = cc; -#endif // yyyAVX2- - p->ptr = 0; } diff --git a/shake.c b/shake.c index cfddc81..7a820cf 100644 --- a/shake.c +++ b/shake.c @@ -33,593 +33,6 @@ #include "inner.h" -#if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1 - -__attribute__((naked)) -static void -process_block(uint64_t *A __attribute__((unused))) -{ - __asm__ ( - "push { r1, r2, r3, r4, r5, r6, r7, r8, r10, r11, r12, lr }\n\t" - "sub sp, sp, #232\n\t" - "\n\t" - "@ Invert some words (alternate internal representation, which\n\t" - "@ saves some operations).\n\t" - "\n\t" - -#define INVERT_WORDS \ - "@ Invert A[1] and A[2].\n\t" \ - "adds r1, r0, #8\n\t" \ - "ldm r1, { r2, r3, r4, r5 }\n\t" \ - "mvns r2, r2\n\t" \ - "mvns r3, r3\n\t" \ - "mvns r4, r4\n\t" \ - "mvns r5, r5\n\t" \ - "stm r1!, { r2, r3, r4, r5 }\n\t" \ - "@ Invert A[8]\n\t" \ - "adds r1, r0, #64\n\t" \ - "ldm r1, { r2, r3 }\n\t" \ - "mvns r2, r2\n\t" \ - "mvns r3, r3\n\t" \ - "stm r1!, { r2, r3 }\n\t" \ - "@ Invert A[12]\n\t" \ - "adds r1, r0, #96\n\t" \ - "ldm r1, { r2, r3 }\n\t" \ - "mvns r2, r2\n\t" \ - "mvns r3, r3\n\t" \ - "stm r1!, { r2, r3 }\n\t" \ - "@ Invert A[17]\n\t" \ - "adds r1, r0, #136\n\t" \ - "ldm r1, { r2, r3 }\n\t" \ - "mvns r2, r2\n\t" \ - "mvns r3, r3\n\t" \ - "stm r1!, { r2, r3 }\n\t" \ - "@ Invert A[20]\n\t" \ - "adds r1, r0, #160\n\t" \ - "ldm r1, { r2, r3 }\n\t" \ - "mvns r2, r2\n\t" \ - "mvns r3, r3\n\t" \ - "stm r1!, { r2, r3 }\n\t" \ - "\n\t" - - INVERT_WORDS - - "@ Do 24 rounds. Each loop iteration performs one rounds. 
We\n\t" - "@ keep eight times the current round counter in [sp] (i.e.\n\t" - "@ a multiple of 8, from 0 to 184).\n\t" - "\n\t" - "eors r1, r1\n\t" - "str r1, [sp, #0]\n\t" -".process_block_loop:\n\t" - "\n\t" - "@ xor(A[5*i+0]) -> r1:r2\n\t" - "@ xor(A[5*i+1]) -> r3:r4\n\t" - "@ xor(A[5*i+2]) -> r5:r6\n\t" - "@ xor(A[5*i+3]) -> r7:r8\n\t" - "@ xor(A[5*i+4]) -> r10:r11\n\t" - "ldm r0!, { r1, r2, r3, r4, r5, r6, r7, r8 }\n\t" - "adds r0, #8\n\t" - "ldm r0!, { r10, r11, r12 }\n\t" - "eors r1, r10\n\t" - "eors r2, r11\n\t" - "eors r3, r12\n\t" - "ldm r0!, { r10, r11, r12 }\n\t" - "eors r4, r10\n\t" - "eors r5, r11\n\t" - "eors r6, r12\n\t" - "ldm r0!, { r10, r11 }\n\t" - "eors r7, r10\n\t" - "eors r8, r11\n\t" - "adds r0, #8\n\t" - "ldm r0!, { r10, r11, r12 }\n\t" - "eors r1, r10\n\t" - "eors r2, r11\n\t" - "eors r3, r12\n\t" - "ldm r0!, { r10, r11, r12 }\n\t" - "eors r4, r10\n\t" - "eors r5, r11\n\t" - "eors r6, r12\n\t" - "ldm r0!, { r10, r11 }\n\t" - "eors r7, r10\n\t" - "eors r8, r11\n\t" - "adds r0, #8\n\t" - "ldm r0!, { r10, r11, r12 }\n\t" - "eors r1, r10\n\t" - "eors r2, r11\n\t" - "eors r3, r12\n\t" - "ldm r0!, { r10, r11, r12 }\n\t" - "eors r4, r10\n\t" - "eors r5, r11\n\t" - "eors r6, r12\n\t" - "ldm r0!, { r10, r11 }\n\t" - "eors r7, r10\n\t" - "eors r8, r11\n\t" - "adds r0, #8\n\t" - "ldm r0!, { r10, r11, r12 }\n\t" - "eors r1, r10\n\t" - "eors r2, r11\n\t" - "eors r3, r12\n\t" - "ldm r0!, { r10, r11, r12 }\n\t" - "eors r4, r10\n\t" - "eors r5, r11\n\t" - "eors r6, r12\n\t" - "ldm r0!, { r10, r11 }\n\t" - "eors r7, r10\n\t" - "eors r8, r11\n\t" - "ldm r0!, { r10, r11 }\n\t" - "subs r0, #200\n\t" - "ldr r12, [r0, #32]\n\t" - "eors r10, r12\n\t" - "ldr r12, [r0, #36]\n\t" - "eors r11, r12\n\t" - "ldr r12, [r0, #72]\n\t" - "eors r10, r12\n\t" - "ldr r12, [r0, #76]\n\t" - "eors r11, r12\n\t" - "ldr r12, [r0, #112]\n\t" - "eors r10, r12\n\t" - "ldr r12, [r0, #116]\n\t" - "eors r11, r12\n\t" - "ldr r12, [r0, #152]\n\t" - "eors r10, r12\n\t" - "ldr r12, [r0, 
#156]\n\t" - "eors r11, r12\n\t" - "\n\t" - "@ t0 = xor(A[5*i+4]) ^ rotl1(xor(A[5*i+1])) -> r10:r11\n\t" - "@ t1 = xor(A[5*i+0]) ^ rotl1(xor(A[5*i+2])) -> r1:r2\n\t" - "@ t2 = xor(A[5*i+1]) ^ rotl1(xor(A[5*i+3])) -> r3:r4\n\t" - "@ t3 = xor(A[5*i+2]) ^ rotl1(xor(A[5*i+4])) -> r5:r6\n\t" - "@ t4 = xor(A[5*i+3]) ^ rotl1(xor(A[5*i+0])) -> r7:r8\n\t" - "str r11, [sp, #4]\n\t" - "mov r12, r10\n\t" - "eors r10, r10, r3, lsl #1\n\t" - "eors r10, r10, r4, lsr #31\n\t" - "eors r11, r11, r4, lsl #1\n\t" - "eors r11, r11, r3, lsr #31\n\t" - "eors r3, r3, r7, lsl #1\n\t" - "eors r3, r3, r8, lsr #31\n\t" - "eors r4, r4, r8, lsl #1\n\t" - "eors r4, r4, r7, lsr #31\n\t" - "eors r7, r7, r1, lsl #1\n\t" - "eors r7, r7, r2, lsr #31\n\t" - "eors r8, r8, r2, lsl #1\n\t" - "eors r8, r8, r1, lsr #31\n\t" - "eors r1, r1, r5, lsl #1\n\t" - "eors r1, r1, r6, lsr #31\n\t" - "eors r2, r2, r6, lsl #1\n\t" - "eors r2, r2, r5, lsr #31\n\t" - "eors r5, r5, r12, lsl #1\n\t" - "eors r6, r6, r12, lsr #31\n\t" - "ldr r12, [sp, #4]\n\t" - "eors r5, r5, r12, lsr #31\n\t" - "eors r6, r6, r12, lsl #1\n\t" - "\n\t" - "@ Save t2, t3 and t4 on the stack.\n\t" - "addw r12, sp, #4\n\t" - "stm r12, { r3, r4, r5, r6, r7, r8 }\n\t" - "\n\t" - "@ We XOR one of the t0..t4 values into each A[] word, and\n\t" - "@ rotate the result by some amount (each word has its own\n\t" - "@ amount). 
The results are written back into a stack buffer\n\t" - "@ that starts at sp+32\n\t" - "addw r12, sp, #32\n\t" - "\n\t" - "@ XOR t0 into A[5*i+0] and t1 into A[5*i+1]; each A[i] is also\n\t" - "@ rotated left by some amount.\n\t" - "\n\t" - "@ A[0] and A[1]\n\t" - "ldm r0!, { r5, r6, r7, r8 }\n\t" - "eors r5, r10\n\t" - "eors r6, r11\n\t" - "eors r3, r7, r1\n\t" - "eors r4, r8, r2\n\t" - "lsl r7, r3, #1\n\t" - "orr r7, r7, r4, lsr #31\n\t" - "lsl r8, r4, #1\n\t" - "orr r8, r8, r3, lsr #31\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ A[5] and A[6]\n\t" - "adds r0, #24\n\t" - "ldm r0!, { r5, r6, r7, r8 }\n\t" - "eors r3, r5, r10\n\t" - "eors r4, r6, r11\n\t" - "lsl r5, r4, #4\n\t" - "orr r5, r5, r3, lsr #28\n\t" - "lsl r6, r3, #4\n\t" - "orr r6, r6, r4, lsr #28\n\t" - "eors r3, r7, r1\n\t" - "eors r4, r8, r2\n\t" - "lsl r7, r4, #12\n\t" - "orr r7, r7, r3, lsr #20\n\t" - "lsl r8, r3, #12\n\t" - "orr r8, r8, r4, lsr #20\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ A[10] and A[11]\n\t" - "adds r0, #24\n\t" - "ldm r0!, { r5, r6, r7, r8 }\n\t" - "eors r3, r5, r10\n\t" - "eors r4, r6, r11\n\t" - "lsl r5, r3, #3\n\t" - "orr r5, r5, r4, lsr #29\n\t" - "lsl r6, r4, #3\n\t" - "orr r6, r6, r3, lsr #29\n\t" - "eors r3, r7, r1\n\t" - "eors r4, r8, r2\n\t" - "lsl r7, r3, #10\n\t" - "orr r7, r7, r4, lsr #22\n\t" - "lsl r8, r4, #10\n\t" - "orr r8, r8, r3, lsr #22\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ A[15] and A[16]\n\t" - "adds r0, #24\n\t" - "ldm r0!, { r5, r6, r7, r8 }\n\t" - "eors r3, r5, r10\n\t" - "eors r4, r6, r11\n\t" - "lsl r5, r4, #9\n\t" - "orr r5, r5, r3, lsr #23\n\t" - "lsl r6, r3, #9\n\t" - "orr r6, r6, r4, lsr #23\n\t" - "eors r3, r7, r1\n\t" - "eors r4, r8, r2\n\t" - "lsl r7, r4, #13\n\t" - "orr r7, r7, r3, lsr #19\n\t" - "lsl r8, r3, #13\n\t" - "orr r8, r8, r4, lsr #19\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ A[20] and A[21]\n\t" - "adds r0, #24\n\t" - "ldm r0!, { r5, r6, r7, r8 }\n\t" - "eors r3, r5, r10\n\t" - 
"eors r4, r6, r11\n\t" - "lsl r5, r3, #18\n\t" - "orr r5, r5, r4, lsr #14\n\t" - "lsl r6, r4, #18\n\t" - "orr r6, r6, r3, lsr #14\n\t" - "eors r3, r7, r1\n\t" - "eors r4, r8, r2\n\t" - "lsl r7, r3, #2\n\t" - "orr r7, r7, r4, lsr #30\n\t" - "lsl r8, r4, #2\n\t" - "orr r8, r8, r3, lsr #30\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ XOR t2 into A[5*i+2] and t3 into A[5*i+3]; each A[i] is also\n\t" - "@ rotated left by some amount. We reload t2 into r1:r2 and t3\n\t" - "@ into r3:r4.\n\t" - "addw r5, sp, #4\n\t" - "ldm r5!, { r1, r2, r3, r4 }\n\t" - "\n\t" - "@ A[2] and A[3]\n\t" - "subs r0, #160\n\t" - "ldm r0!, { r5, r6, r7, r8 }\n\t" - "eors r10, r5, r1\n\t" - "eors r11, r6, r2\n\t" - "lsl r5, r11, #30\n\t" - "orr r5, r5, r10, lsr #2\n\t" - "lsl r6, r10, #30\n\t" - "orr r6, r6, r11, lsr #2\n\t" - "eors r10, r7, r3\n\t" - "eors r11, r8, r4\n\t" - "lsl r7, r10, #28\n\t" - "orr r7, r7, r11, lsr #4\n\t" - "lsl r8, r11, #28\n\t" - "orr r8, r8, r10, lsr #4\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ A[7] and A[8]\n\t" - "adds r0, #24\n\t" - "ldm r0!, { r5, r6, r7, r8 }\n\t" - "eors r10, r5, r1\n\t" - "eors r11, r6, r2\n\t" - "lsl r5, r10, #6\n\t" - "orr r5, r5, r11, lsr #26\n\t" - "lsl r6, r11, #6\n\t" - "orr r6, r6, r10, lsr #26\n\t" - "eors r10, r7, r3\n\t" - "eors r11, r8, r4\n\t" - "lsl r7, r11, #23\n\t" - "orr r7, r7, r10, lsr #9\n\t" - "lsl r8, r10, #23\n\t" - "orr r8, r8, r11, lsr #9\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ A[12] and A[13]\n\t" - "adds r0, #24\n\t" - "ldm r0!, { r5, r6, r7, r8 }\n\t" - "eors r10, r5, r1\n\t" - "eors r11, r6, r2\n\t" - "lsl r5, r11, #11\n\t" - "orr r5, r5, r10, lsr #21\n\t" - "lsl r6, r10, #11\n\t" - "orr r6, r6, r11, lsr #21\n\t" - "eors r10, r7, r3\n\t" - "eors r11, r8, r4\n\t" - "lsl r7, r10, #25\n\t" - "orr r7, r7, r11, lsr #7\n\t" - "lsl r8, r11, #25\n\t" - "orr r8, r8, r10, lsr #7\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ A[17] and A[18]\n\t" - "adds r0, #24\n\t" - "ldm r0!, 
{ r5, r6, r7, r8 }\n\t" - "eors r10, r5, r1\n\t" - "eors r11, r6, r2\n\t" - "lsl r5, r10, #15\n\t" - "orr r5, r5, r11, lsr #17\n\t" - "lsl r6, r11, #15\n\t" - "orr r6, r6, r10, lsr #17\n\t" - "eors r10, r7, r3\n\t" - "eors r11, r8, r4\n\t" - "lsl r7, r10, #21\n\t" - "orr r7, r7, r11, lsr #11\n\t" - "lsl r8, r11, #21\n\t" - "orr r8, r8, r10, lsr #11\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ A[22] and A[23]\n\t" - "adds r0, #24\n\t" - "ldm r0!, { r5, r6, r7, r8 }\n\t" - "eors r10, r5, r1\n\t" - "eors r11, r6, r2\n\t" - "lsl r5, r11, #29\n\t" - "orr r5, r5, r10, lsr #3\n\t" - "lsl r6, r10, #29\n\t" - "orr r6, r6, r11, lsr #3\n\t" - "eors r10, r7, r3\n\t" - "eors r11, r8, r4\n\t" - "lsl r7, r11, #24\n\t" - "orr r7, r7, r10, lsr #8\n\t" - "lsl r8, r10, #24\n\t" - "orr r8, r8, r11, lsr #8\n\t" - "stm r12!, { r5, r6, r7, r8 }\n\t" - "\n\t" - "@ XOR t4 into A[5*i+4]; each A[i] is also rotated left by some\n\t" - "@ amount. We reload t4 into r1:r2.\n\t" - "ldr r1, [sp, #20]\n\t" - "ldr r2, [sp, #24]\n\t" - "\n\t" - "@ A[4]\n\t" - "subs r0, #160\n\t" - "ldm r0!, { r5, r6 }\n\t" - "eors r3, r5, r1\n\t" - "eors r4, r6, r2\n\t" - "lsl r5, r3, #27\n\t" - "orr r5, r5, r4, lsr #5\n\t" - "lsl r6, r4, #27\n\t" - "orr r6, r6, r3, lsr #5\n\t" - "stm r12!, { r5, r6 }\n\t" - "\n\t" - "@ A[9]\n\t" - "adds r0, #32\n\t" - "ldm r0!, { r5, r6 }\n\t" - "eors r3, r5, r1\n\t" - "eors r4, r6, r2\n\t" - "lsl r5, r3, #20\n\t" - "orr r5, r5, r4, lsr #12\n\t" - "lsl r6, r4, #20\n\t" - "orr r6, r6, r3, lsr #12\n\t" - "stm r12!, { r5, r6 }\n\t" - "\n\t" - "@ A[14]\n\t" - "adds r0, #32\n\t" - "ldm r0!, { r5, r6 }\n\t" - "eors r3, r5, r1\n\t" - "eors r4, r6, r2\n\t" - "lsl r5, r4, #7\n\t" - "orr r5, r5, r3, lsr #25\n\t" - "lsl r6, r3, #7\n\t" - "orr r6, r6, r4, lsr #25\n\t" - "stm r12!, { r5, r6 }\n\t" - "\n\t" - "@ A[19]\n\t" - "adds r0, #32\n\t" - "ldm r0!, { r5, r6 }\n\t" - "eors r3, r5, r1\n\t" - "eors r4, r6, r2\n\t" - "lsl r5, r3, #8\n\t" - "orr r5, r5, r4, lsr #24\n\t" - "lsl r6, 
r4, #8\n\t" - "orr r6, r6, r3, lsr #24\n\t" - "stm r12!, { r5, r6 }\n\t" - "\n\t" - "@ A[24]\n\t" - "adds r0, #32\n\t" - "ldm r0!, { r5, r6 }\n\t" - "eors r3, r5, r1\n\t" - "eors r4, r6, r2\n\t" - "lsl r5, r3, #14\n\t" - "orr r5, r5, r4, lsr #18\n\t" - "lsl r6, r4, #14\n\t" - "orr r6, r6, r3, lsr #18\n\t" - "stm r12!, { r5, r6 }\n\t" - "\n\t" - "subs r0, #200\n\t" - "\n\t" - "@ At that point, the stack buffer at sp+32 contains the words\n\t" - "@ at the following indexes (0 to 24) and offsets (from sp)\n\t" - "@ A[ 0] 0 32\n\t" - "@ A[ 1] 1 40\n\t" - "@ A[ 2] 10 112\n\t" - "@ A[ 3] 11 120\n\t" - "@ A[ 4] 20 192\n\t" - "@ A[ 5] 2 48\n\t" - "@ A[ 6] 3 56\n\t" - "@ A[ 7] 12 128\n\t" - "@ A[ 8] 13 136\n\t" - "@ A[ 9] 21 200\n\t" - "@ A[10] 4 64\n\t" - "@ A[11] 5 72\n\t" - "@ A[12] 14 144\n\t" - "@ A[13] 15 152\n\t" - "@ A[14] 22 208\n\t" - "@ A[15] 6 80\n\t" - "@ A[16] 7 88\n\t" - "@ A[17] 16 160\n\t" - "@ A[18] 17 168\n\t" - "@ A[19] 23 216\n\t" - "@ A[20] 8 96\n\t" - "@ A[21] 9 104\n\t" - "@ A[22] 18 176\n\t" - "@ A[23] 19 184\n\t" - "@ A[24] 24 224\n\t" - -#define KHI_LOAD(s0, s1, s2, s3, s4) \ - "ldr r1, [sp, #(32 + 8 * " #s0 ")]\n\t" \ - "ldr r2, [sp, #(36 + 8 * " #s0 ")]\n\t" \ - "ldr r3, [sp, #(32 + 8 * " #s1 ")]\n\t" \ - "ldr r4, [sp, #(36 + 8 * " #s1 ")]\n\t" \ - "ldr r5, [sp, #(32 + 8 * " #s2 ")]\n\t" \ - "ldr r6, [sp, #(36 + 8 * " #s2 ")]\n\t" \ - "ldr r7, [sp, #(32 + 8 * " #s3 ")]\n\t" \ - "ldr r8, [sp, #(36 + 8 * " #s3 ")]\n\t" \ - "ldr r10, [sp, #(32 + 8 * " #s4 ")]\n\t" \ - "ldr r11, [sp, #(36 + 8 * " #s4 ")]\n\t" - -#define KHI_STEP(op, x0, x1, x2, x3, x4, x5, d) \ - #op " r12, " #x0 ", " #x2 "\n\t" \ - "eors r12, " #x4 "\n\t" \ - "str r12, [r0, #(8 * " #d ")]\n\t" \ - #op " r12, " #x1 ", " #x3 "\n\t" \ - "eors r12, " #x5 "\n\t" \ - "str r12, [r0, #(4 + 8 * " #d ")]\n\t" - - "@ A[0], A[6], A[12], A[18] and A[24]\n\t" - KHI_LOAD(0, 3, 14, 17, 24) - KHI_STEP(orrs, r3, r4, r5, r6, r1, r2, 0) - KHI_STEP(orns, r7, r8, r5, r6, r3, r4, 1) - KHI_STEP(ands, r7, 
r8, r10, r11, r5, r6, 2) - KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 3) - KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 4) - "\n\t" - - "@ A[3], A[9], A[10], A[16] and A[22]\n\t" - KHI_LOAD(11, 21, 4, 7, 18) - KHI_STEP(orrs, r3, r4, r5, r6, r1, r2, 5) - KHI_STEP(ands, r7, r8, r5, r6, r3, r4, 6) - KHI_STEP(orns, r7, r8, r10, r11, r5, r6, 7) - KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 8) - KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 9) - "\n\t" - - "@ A[1], A[7], A[13], A[19] and A[20]\n\t" - KHI_LOAD(1, 12, 15, 23, 8) - KHI_STEP(orrs, r3, r4, r5, r6, r1, r2, 10) - KHI_STEP(ands, r7, r8, r5, r6, r3, r4, 11) - KHI_STEP(bics, r10, r11, r7, r8, r5, r6, 12) - "mvns r7, r7\n\t" - "mvns r8, r8\n\t" - KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 13) - KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 14) - "\n\t" - - "@ A[4], A[5], A[11], A[17] and A[23]\n\t" - KHI_LOAD(20, 2, 5, 16, 19) - KHI_STEP(ands, r3, r4, r5, r6, r1, r2, 15) - KHI_STEP(orrs, r7, r8, r5, r6, r3, r4, 16) - KHI_STEP(orns, r10, r11, r7, r8, r5, r6, 17) - "mvns r7, r7\n\t" - "mvns r8, r8\n\t" - KHI_STEP(ands, r1, r2, r10, r11, r7, r8, 18) - KHI_STEP(orrs, r1, r2, r3, r4, r10, r11, 19) - "\n\t" - - "@ A[2], A[8], A[14], A[15] and A[21]\n\t" - KHI_LOAD(10, 13, 22, 6, 9) - KHI_STEP(bics, r5, r6, r3, r4, r1, r2, 20) - KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 24) - "mvns r3, r3\n\t" - "mvns r4, r4\n\t" - KHI_STEP(orrs, r7, r8, r5, r6, r3, r4, 21) - KHI_STEP(ands, r7, r8, r10, r11, r5, r6, 22) - KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 23) - "\n\t" - - "@ Get round counter XOR round constant into A[0]\n\t" - "ldr r1, [sp, #0]\n\t" - "adr r2, .process_block_RC\n\t" - "adds r2, r1\n\t" - "ldm r2, { r3, r4 }\n\t" - "ldm r0, { r5, r6 }\n\t" - "eors r5, r3\n\t" - "eors r6, r4\n\t" - "stm r0, { r5, r6 }\n\t" - "\n\t" - "@ Increment round counter, loop until all 24 rounds are done.\n\t" - "\n\t" - "adds r1, #8\n\t" - "str r1, [sp, #0]\n\t" - "cmp r1, #192\n\t" - "blo .process_block_loop\n\t" - - INVERT_WORDS - - "add sp, sp, #232\n\t" - 
"pop { r1, r2, r3, r4, r5, r6, r7, r8, r10, r11, r12, pc }\n\t" - "\n\t" -".process_block_RC:\n\t" - ".word 0x00000001\n\t" - ".word 0x00000000\n\t" - ".word 0x00008082\n\t" - ".word 0x00000000\n\t" - ".word 0x0000808A\n\t" - ".word 0x80000000\n\t" - ".word 0x80008000\n\t" - ".word 0x80000000\n\t" - ".word 0x0000808B\n\t" - ".word 0x00000000\n\t" - ".word 0x80000001\n\t" - ".word 0x00000000\n\t" - ".word 0x80008081\n\t" - ".word 0x80000000\n\t" - ".word 0x00008009\n\t" - ".word 0x80000000\n\t" - ".word 0x0000008A\n\t" - ".word 0x00000000\n\t" - ".word 0x00000088\n\t" - ".word 0x00000000\n\t" - ".word 0x80008009\n\t" - ".word 0x00000000\n\t" - ".word 0x8000000A\n\t" - ".word 0x00000000\n\t" - ".word 0x8000808B\n\t" - ".word 0x00000000\n\t" - ".word 0x0000008B\n\t" - ".word 0x80000000\n\t" - ".word 0x00008089\n\t" - ".word 0x80000000\n\t" - ".word 0x00008003\n\t" - ".word 0x80000000\n\t" - ".word 0x00008002\n\t" - ".word 0x80000000\n\t" - ".word 0x00000080\n\t" - ".word 0x80000000\n\t" - ".word 0x0000800A\n\t" - ".word 0x00000000\n\t" - ".word 0x8000000A\n\t" - ".word 0x80000000\n\t" - ".word 0x80008081\n\t" - ".word 0x80000000\n\t" - ".word 0x00008080\n\t" - ".word 0x80000000\n\t" - ".word 0x80000001\n\t" - ".word 0x00000000\n\t" - ".word 0x80008008\n\t" - ".word 0x80000000\n\t" - -#undef INVERT_WORDS -#undef KHI_LOAD -#undef KHI_STEP - - ); -} - -#else // yyyASM_CORTEXM4+0 - /* * Round constants. */ @@ -1069,8 +482,6 @@ process_block(uint64_t *A) A[20] = ~A[20]; } -#endif // yyyASM_CORTEXM4- - /* see inner.h */ void Zf(i_shake256_init)(inner_shake256_context *sc) diff --git a/sign.c b/sign.c index 24fa8d6..1dac567 100644 --- a/sign.c +++ b/sign.c @@ -328,7 +328,6 @@ typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma); * is written over (t0,t1). The Gram matrix is modified as well. The * tmp[] buffer must have room for four polynomials. 
*/ -TARGET_AVX2 static void ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx, fpr *restrict t0, fpr *restrict t1, @@ -421,7 +420,6 @@ ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx, * Perform Fast Fourier Sampling for target vector t and LDL tree T. * tmp[] must have size for at least two polynomials of size 2^logn. */ -TARGET_AVX2 static void ffSampling_fft(samplerZ samp, void *samp_ctx, fpr *restrict z0, fpr *restrict z1, @@ -436,123 +434,6 @@ ffSampling_fft(samplerZ samp, void *samp_ctx, * When logn == 2, we inline the last two recursion levels. */ if (logn == 2) { -#if FALCON_AVX2 // yyyAVX2+1 - fpr w0, w1, w2, w3, sigma; - __m128d ww0, ww1, wa, wb, wc, wd; - __m128d wy0, wy1, wz0, wz1; - __m128d half, invsqrt8, invsqrt2, neghi, neglo; - int si0, si1, si2, si3; - - tree0 = tree + 4; - tree1 = tree + 8; - - half = _mm_set1_pd(0.5); - invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052); - invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105); - neghi = _mm_set_pd(-0.0, 0.0); - neglo = _mm_set_pd(0.0, -0.0); - - /* - * We split t1 into w*, then do the recursive invocation, - * with output in w*. We finally merge back into z1. 
- */ - ww0 = _mm_loadu_pd(&t1[0].v); - ww1 = _mm_loadu_pd(&t1[2].v); - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree1[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree1[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree1[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z1[0].v, ww0); - _mm_storeu_pd(&z1[2].v, ww1); - - /* - * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. - */ - wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0); - wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1); - wz0 = _mm_loadu_pd(&tree[0].v); - wz1 = _mm_loadu_pd(&tree[2].v); - ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1)); - ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0)); - ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v)); - ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v)); - - /* - * Second recursive invocation. 
- */ - wa = _mm_unpacklo_pd(ww0, ww1); - wb = _mm_unpackhi_pd(ww0, ww1); - wc = _mm_add_pd(wa, wb); - ww0 = _mm_mul_pd(wc, half); - wc = _mm_sub_pd(wa, wb); - wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi); - ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8); - - w2.v = _mm_cvtsd_f64(ww1); - w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1)); - wa = ww1; - sigma = tree0[3]; - si2 = samp(samp_ctx, w2, sigma); - si3 = samp(samp_ctx, w3, sigma); - ww1 = _mm_set_pd((double)si3, (double)si2); - wa = _mm_sub_pd(wa, ww1); - wb = _mm_loadu_pd(&tree0[0].v); - wc = _mm_mul_pd(wa, wb); - wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1)); - wa = _mm_unpacklo_pd(wc, wd); - wb = _mm_unpackhi_pd(wc, wd); - ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo))); - w0.v = _mm_cvtsd_f64(ww0); - w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1)); - sigma = tree0[2]; - si0 = samp(samp_ctx, w0, sigma); - si1 = samp(samp_ctx, w1, sigma); - ww0 = _mm_set_pd((double)si1, (double)si0); - - wc = _mm_mul_pd( - _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)), - invsqrt2); - wa = _mm_add_pd(ww0, wc); - wb = _mm_sub_pd(ww0, wc); - ww0 = _mm_unpacklo_pd(wa, wb); - ww1 = _mm_unpackhi_pd(wa, wb); - _mm_storeu_pd(&z0[0].v, ww0); - _mm_storeu_pd(&z0[2].v, ww1); - - return; -#else // yyyAVX2+0 fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; fpr a_re, a_im, b_re, b_im, c_re, c_im; @@ -675,7 +556,6 @@ ffSampling_fft(samplerZ samp, void *samp_ctx, z0[3] = fpr_sub(a_im, c_im); return; -#endif // yyyAVX2- } /* @@ -1090,145 +970,9 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, * Sample an integer value along a half-gaussian distribution centered * on zero and standard deviation 1.8205, with a precision of 72 bits. */ -TARGET_AVX2 int Zf(gaussian0_sampler)(prng *p) { -#if FALCON_AVX2 // yyyAVX2+1 - - /* - * High words. 
- */ - static const union { - uint16_t u16[16]; - __m256i ymm[1]; - } rhi15 = { - { - 0x51FB, 0x2A69, 0x113E, 0x0568, - 0x014A, 0x003B, 0x0008, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000 - } - }; - - static const union { - uint64_t u64[20]; - __m256i ymm[5]; - } rlo57 = { - { - 0x1F42ED3AC391802, 0x12B181F3F7DDB82, - 0x1CDD0934829C1FF, 0x1754377C7994AE4, - 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F, - 0x024DD542B776AE4, 0x1A1FFDC65AD63DA, - 0x01F80D88A7B6428, 0x001C3FDB2040C69, - 0x00012CF24D031FB, 0x00000949F8B091F, - 0x0000003665DA998, 0x00000000EBF6EBB, - 0x0000000002F5D7E, 0x000000000007098, - 0x0000000000000C6, 0x000000000000001, - 0x000000000000000, 0x000000000000000 - } - }; - - uint64_t lo; - unsigned hi; - __m256i xhi, rhi, gthi, eqhi, eqm; - __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4; - __m128i t, zt; - int r; - - /* - * Get a 72-bit random value and split it into a low part - * (57 bits) and a high part (15 bits) - */ - lo = prng_get_u64(p); - hi = prng_get_u8(p); - hi = (hi << 7) | (unsigned)(lo >> 57); - lo &= 0x1FFFFFFFFFFFFFF; - - /* - * Broadcast the high part and compare it with the relevant - * values. We need both a "greater than" and an "equal" - * comparisons. - */ - xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(hi)); - rhi = _mm256_loadu_si256(&rhi15.ymm[0]); - gthi = _mm256_cmpgt_epi16(rhi, xhi); - eqhi = _mm256_cmpeq_epi16(rhi, xhi); - - /* - * The result is the number of 72-bit values (among the list of 19) - * which are greater than the 72-bit random value. We first count - * all non-zero 16-bit elements in the first eight of gthi. Such - * elements have value -1 or 0, so we first negate them. 
- */ - t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15); - zt = _mm_setzero_si128(); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - t = _mm_hadd_epi16(t, zt); - r = _mm_cvtsi128_si32(t); - - /* - * We must look at the low bits for all values for which the - * high bits are an "equal" match; values 8-18 all have the - * same high bits (0). - * On 32-bit systems, 'lo' really is two registers, requiring - * some extra code. - */ -#if defined(__x86_64__) || defined(_M_X64) - xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo)); -#else - { - uint32_t e0, e1; - int32_t f0, f1; - - e0 = (uint32_t)lo; - e1 = (uint32_t)(lo >> 32); - f0 = *(int32_t *)&e0; - f1 = *(int32_t *)&e1; - xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0); - } -#endif - gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo); - gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo); - gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo); - gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo); - gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo); - - /* - * Keep only comparison results that correspond to the non-zero - * elements in eqhi. - */ - gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(eqhi))); - gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64( - _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8)))); - eqm = _mm256_permute4x64_epi64(eqhi, 0xFF); - gtlo2 = _mm256_and_si256(gtlo2, eqm); - gtlo3 = _mm256_and_si256(gtlo3, eqm); - gtlo4 = _mm256_and_si256(gtlo4, eqm); - - /* - * Add all values to count the total number of "-1" elements. - * Since the first eight "high" words are all different, only - * one element (at most) in gtlo0:gtlo1 can be non-zero; however, - * if the high word of the random value is zero, then many - * elements of gtlo2:gtlo3:gtlo4 can be non-zero. 
- */ - gtlo0 = _mm256_or_si256(gtlo0, gtlo1); - gtlo0 = _mm256_add_epi64( - _mm256_add_epi64(gtlo0, gtlo2), - _mm256_add_epi64(gtlo3, gtlo4)); - t = _mm_add_epi64( - _mm256_castsi256_si128(gtlo0), - _mm256_extracti128_si256(gtlo0, 1)); - t = _mm_add_epi64(t, _mm_srli_si128(t, 8)); - r -= _mm_cvtsi128_si32(t); - - return r; - -#else // yyyAVX2+0 - static const uint32_t dist[] = { 10745844u, 3068844u, 3741698u, 5559083u, 1580863u, 8248194u, @@ -1281,14 +1025,11 @@ Zf(gaussian0_sampler)(prng *p) z += (int)cc; } return z; - -#endif // yyyAVX2- } /* * Sample a bit with probability exp(-x) for some x >= 0. */ -TARGET_AVX2 static int BerExp(prng *p, fpr x, fpr ccs) { @@ -1352,7 +1093,6 @@ BerExp(prng *p, fpr x, fpr ccs) * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9. */ -TARGET_AVX2 int Zf(sampler)(void *ctx, fpr mu, fpr isigma) { diff --git a/tests/test_falcon.c b/tests/test_falcon.c index efeb497..9274174 100644 --- a/tests/test_falcon.c +++ b/tests/test_falcon.c @@ -2561,7 +2561,6 @@ fpr_scaled(int64_t i, int sc) static inline fpr fpr_ldexp(fpr x, int e) { -#if FALCON_FPEMU uint32_t ex; /* @@ -2578,12 +2577,8 @@ fpr_ldexp(fpr x, int e) x = (x & (((uint64_t)1 << 63) + ((uint64_t)1 << 52) - (uint64_t)1)) | ((uint64_t)ex << 52); return x; -#else - return FPR(ldexp(x.v, e)); -#endif } -TARGET_AVX2 static void test_FP_block(void) {