diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index f2411e00..00000000 --- a/.travis.yml +++ /dev/null @@ -1,41 +0,0 @@ -language: rust - -rust: - - stable - - nightly - -env: - # Tests the u32 backend - - TEST_COMMAND=test EXTRA_FLAGS='--no-default-features' FEATURES='std u32_backend' - # Tests the u64 backend - - TEST_COMMAND=test EXTRA_FLAGS='--no-default-features' FEATURES='std u64_backend' - # Tests the fiat_u32 backend - - TEST_COMMAND=test EXTRA_FLAGS='--no-default-features' FEATURES='std fiat_u32_backend' - # Tests the fiat_u64 backend - - TEST_COMMAND=test EXTRA_FLAGS='--no-default-features' FEATURES='std fiat_u64_backend' - # Tests the simd backend - - TEST_COMMAND=test EXTRA_FLAGS='--no-default-features' FEATURES='std simd_backend' - # Tests serde support and default feature selection - - TEST_COMMAND=test EXTRA_FLAGS='' FEATURES='serde' - # Tests building without std. We have to select a backend, so we select the one - # most likely to be useful in an embedded environment. - - TEST_COMMAND=build EXTRA_FLAGS='--no-default-features' FEATURES='u32_backend' - # Tests no_std+alloc usage using the most embedded-friendly backend - - TEST_COMMAND=test EXTRA_FLAGS='--lib --no-default-features' FEATURES='alloc u32_backend' - -matrix: - exclude: - # Test the simd backend only on nightly - - rust: stable - env: TEST_COMMAND=test EXTRA_FLAGS='--no-default-features' FEATURES='std simd_backend' - # Test no_std+alloc only on nightly - - rust: stable - env: TEST_COMMAND=test EXTRA_FLAGS='--lib --no-default-features' FEATURES='alloc u32_backend' - -script: - - cargo $TEST_COMMAND --features="$FEATURES" $EXTRA_FLAGS - -notifications: - slack: - rooms: - - dalek-cryptography:Xxv9WotKYWdSoKlgKNqXiHoD#dalek-bots diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 000b9654..00000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,197 +0,0 @@ -# Changelog - -Entries are listed in reverse chronological order per undeprecated -major series. - -## 3.x series - -### 3.2.0 - -* Add support for getting the identity element for the Montgomery - form of curve25519, which is useful in certain protocols for - checking contributory behaviour in derivation of shared secrets. - -### 3.1.2 - -* Revert a commit which mistakenly removed support for `zeroize` traits - for some point types, as well as elligator2 support for Edwards points. - -### 3.1.1 - -* Fix documentation builds on nightly due to syntax changes to - `#![cfg_attr(feature = "nightly", doc = include_str!("../README.md"))]`. - -### 3.1.0 - -* Add support for the Elligator2 encoding for Edwards points. -* Add two optional formally-verified field arithmetic backends which - use the Fiat Crypto project's Rust code, which is generated from - proofs of functional correctness checked by the Coq theorem proving - system. -* Add support for additional sizes of precomputed tables for basepoint - scalar multiplication. -* Fix an unused import. -* Add support for using the `zeroize` traits with all point types. - Note that points are not automatically zeroized on Drop, but that - consumers of `curve25519-dalek` should call these methods manually - when needed. - -### 3.0.3 - -* Fix documentation builds on nightly due to syntax changes to - `#![cfg_attr(feature = "nightly", doc = include_str!("../README.md"))]`. - -### 3.0.2 - -* Multiple documentation typo fixes. -* Fixes to make using `alloc`+`no_std` possible for stable Rust. - -### 3.0.1 - -* Update the optional `packed-simd` dependency to rely on a newer, - maintained version of the `packed-simd-2` crate. - -### 3.0.0 - -* Update the `digest` dependency to `0.9`. This requires a major version - because the `digest` traits are part of the public API, but there are - otherwise no changes to the API. - -## 2.x series - -### 2.1.3 - -* Fix documentation builds on nightly due to syntax changes to - `#![fg_attr(feature = "nightly", doc = include_str!("../README.md"))]`. - -### 2.1.2 - -* Multiple documenation typo fixes. -* Fix `alloc` feature working with stable rust. - -### 2.1.1 - -* Update the optional `packed-simd` dependency to rely on a newer, - maintained version of the `packed-simd-2` crate. - -### 2.1.0 - -* Make `Scalar::from_bits` a `const fn`, allowing its use in `const` contexts. - -### 2.0.0 - -* Fix a data modeling error in the `serde` feature pointed out by Trevor Perrin - which caused points and scalars to be serialized with length fields rather - than as fixed-size 32-byte arrays. This is a breaking change, but it fixes - compatibility with `serde-json` and ensures that the `serde-bincode` encoding - matches the conventional encoding for X/Ed25519. -* Update `rand_core` to `0.5`, allowing use with new `rand` versions. -* Switch from `clear_on_drop` to `zeroize` (by Tony Arcieri). -* Require `subtle = ^2.2.1` and remove the note advising nightly Rust, which is - no longer required as of that version of `subtle`. See the `subtle` - changelog for more details. -* Update `README.md` for `2.x` series. -* Remove the `build.rs` hack which loaded the entire crate into its own - `build.rs` to generate constants, and keep the constants in the source code. - -The only significant change is the data model change to the `serde` feature; -besides the `rand_core` version bump, there are no other user-visible changes. - -## 1.x series - -### 1.2.6 - -* Fixes to make using alloc+no_std possible for stable Rust. - -### 1.2.5 - -* Update the optional `packed-simd` dependency to rely on a newer, - maintained version of the `packed-simd-2` crate. - -### 1.2.4 - -* Specify a semver bound for `clear_on_drop` rather than an exact version, - addressing an issue where changes to inline assembly in rustc prevented - `clear_on_drop` from working without an update. - -### 1.2.3 - -* Fix an issue identified by a Quarkslab audit (and Jack Grigg), where manually - constructing unreduced `Scalar` values, as needed for X/Ed25519, and then - performing scalar/scalar arithmetic could compute incorrect results. -* Switch to upstream Rust intrinsics for the IFMA backend now that they exist in - Rust and don't need to be defined locally. -* Ensure that the NAF computation works correctly, even for parameters never - used elsewhere in the codebase. -* Minor refactoring to EdwardsPoint decompression. -* Fix broken links in documentation. -* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path - root (not quite correctly done in 1.2.2). - -### 1.2.2 - -* Fix a typo in an internal doc-comment. -* Add the "crypto" tag to crate metadata. -* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path - root. - -### 1.2.1 - -* Fix a bug in bucket index calculations in the Pippenger multiscalar algorithm - for very large input sizes. -* Add a more extensive randomized multiscalar multiplication consistency check - to the test suite to prevent regressions. -* Ensure that that multiscalar and NAF computations work correctly on extremal - `Scalar` values constructed via `from_bits`. - -### 1.2.0 - -* New multiscalar multiplication algorithm with better performance for - large problem sizes. The backend algorithm is selected - transparently using the size hints of the input iterators, so no - changes are required for client crates to start using it. -* Equality of Edwards points is now checked in projective coordinates. -* Serde can now be used with `no_std`. - -### 1.1.4 - -* Fix typos in documentation comments. -* Remove unnecessary `Default` bound on `Scalar::from_hash`. - -### 1.1.3 - -* Reverts the change in 1.1.0 to allow owned and borrowed RNGs, which caused a breakage due to a subtle interaction with ownership rules. (The `RngCore` change is retained). - -### 1.1.2 - -* Disabled KaTeX on `docs.rs` pending proper [support upstream](https://github.com/rust-lang/docs.rs/issues/302). - -## 1.1.1 - -* Fixed an issue related to `#[cfg(rustdoc)]` which prevented documenting multiple backends. - -### 1.1.0 - -* Adds support for precomputation for multiscalar multiplication. -* Restructures the internal source tree into `serial` and `vector` backends (no change to external API). -* Adds a new IFMA backend which sets speed records. -* The `avx2_backend` feature is now an alias for the `simd_backend` feature, which autoselects an appropriate vector backend (currently AVX2 or IFMA). -* Replaces the `rand` dependency with `rand_core`. -* Generalizes trait bounds on `RistrettoPoint::random()` and `Scalar::random()` to allow owned and borrowed RNGs and to allow `RngCore` instead of `Rng`. - -### 1.0.3 - -* Adds `ConstantTimeEq` implementation for compressed points. - -### 1.0.2 - -* Fixes a typo in the naming of variables in Ristretto formulas (no change to functionality). - -### 1.0.1 - -* Depends on the stable `2.0` version of `subtle` instead of `2.0.0-pre.0`. - -### 1.0.0 - -Initial stable release. Yanked due to a dependency mistake (see above). - diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index a802fde5..00000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,8 +0,0 @@ -# Code of Conduct - -We follow the [Rust Code of Conduct](http://www.rust-lang.org/conduct.html), -with the following additional clauses: - -* We respect the rights to privacy and anonymity for contributors and people in - the community. If someone wishes to contribute under a pseudonym different to - their primary identity, that wish is to be respected by all contributors. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index d4e0ff8e..00000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,19 +0,0 @@ -# Contributing to curve25519-dalek - -If you have questions or comments, please feel free to email the -authors. - -For feature requests, suggestions, and bug reports, please open an issue on -[our Github](https://github.com/dalek-cryptography/curve25519-dalek). (Or, send us -an email if you're opposed to using Github for whatever reason.) - -Patches are welcomed as pull requests on -[our Github](https://github.com/dalek-cryptography/curve25519-dalek), as well as by -email (preferably sent to all of the authors listed in `Cargo.toml`). - -All issues on curve25519-dalek are mentored, if you want help with a bug just -ask @isislovecruft or @hdevalence. - -Some issues are easier than others. The `easy` label can be used to find the -easy issues. If you want to work on an issue, please leave a comment so that we -can assign it to you! diff --git a/Cargo.toml b/Cargo.toml index 3de9143d..c65bbc02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,25 +1,17 @@ [package] -name = "curve25519-dalek" -# Before incrementing: -# - update CHANGELOG -# - update html_root_url -# - update README if required by semver -# - if README was updated, also update module documentation in src/lib.rs -version = "3.2.0" +name = "noah-curve25519-dalek" +version = "4.0.0" authors = ["Isis Lovecruft ", "Henry de Valence "] readme = "README.md" license = "BSD-3-Clause" -repository = "https://github.com/dalek-cryptography/curve25519-dalek" -homepage = "https://dalek.rs/curve25519-dalek" -documentation = "https://docs.rs/curve25519-dalek" +repository = "https://github.com/FindoraNetwork/curve25519-dalek" categories = ["cryptography", "no-std"] keywords = ["cryptography", "crypto", "ristretto", "curve25519", "ristretto255"] description = "A pure-Rust implementation of group operations on ristretto255 and Curve25519" exclude = [ "**/.gitignore", - ".gitignore", - ".travis.yml", + ".gitignore" ] [package.metadata.docs.rs] @@ -27,9 +19,6 @@ exclude = [ # rustdoc-args = ["--html-in-header", ".cargo/registry/src/github.com-1ecc6299db9ec823/curve25519-dalek-0.13.2/rustdoc-include-katex-header.html"] features = ["nightly", "simd_backend"] -[badges] -travis-ci = { repository = "dalek-cryptography/curve25519-dalek", branch = "master"} - [dev-dependencies] sha2 = { version = "0.10", default-features = false } bincode = "1" diff --git a/Makefile b/Makefile deleted file mode 100644 index 7d870571..00000000 --- a/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -FEATURES := nightly simd_backend - -doc: - cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html - -doc-internal: - cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html --document-private-items - diff --git a/README.md b/README.md index 2600cce1..b1e96733 100644 --- a/README.md +++ b/README.md @@ -1,226 +1,6 @@ +# A fork of curve25519-dalek in Noah -# curve25519-dalek [![](https://img.shields.io/crates/v/curve25519-dalek.svg)](https://crates.io/crates/curve25519-dalek) [![](https://img.shields.io/badge/dynamic/json.svg?label=docs&uri=https%3A%2F%2Fcrates.io%2Fapi%2Fv1%2Fcrates%2Fcurve25519-dalek%2Fversions&query=%24.versions%5B0%5D.num&colorB=4F74A6)](https://doc.dalek.rs) [![](https://travis-ci.org/dalek-cryptography/curve25519-dalek.svg?branch=master)](https://travis-ci.org/dalek-cryptography/curve25519-dalek) +This crate is a fork of curve25519-dalek with some dependencies updates. - - -**A pure-Rust implementation of group operations on Ristretto and Curve25519.** - -`curve25519-dalek` is a library providing group operations on the Edwards and -Montgomery forms of Curve25519, and on the prime-order Ristretto group. - -`curve25519-dalek` is not intended to provide implementations of any particular -crypto protocol. Rather, implementations of those protocols (such as -[`x25519-dalek`][x25519-dalek] and [`ed25519-dalek`][ed25519-dalek]) should use -`curve25519-dalek` as a library. - -`curve25519-dalek` is intended to provide a clean and safe _mid-level_ API for use -implementing a wide range of ECC-based crypto protocols, such as key agreement, -signatures, anonymous credentials, rangeproofs, and zero-knowledge proof -systems. - -In particular, `curve25519-dalek` implements Ristretto, which constructs a -prime-order group from a non-prime-order Edwards curve. This provides the -speed and safety benefits of Edwards curve arithmetic, without the pitfalls of -cofactor-related abstraction mismatches. - -# Documentation - -The semver-stable, public-facing `curve25519-dalek` API is documented -[here][docs-external]. In addition, the unstable internal implementation -details are documented [here][docs-internal]. - -The `curve25519-dalek` documentation requires a custom HTML header to include -KaTeX for math support. Unfortunately `cargo doc` does not currently support -this, but docs can be built using -```sh -make doc -make doc-internal -``` - -# Use - -To import `curve25519-dalek`, add the following to the dependencies section of -your project's `Cargo.toml`: -```toml -curve25519-dalek = "3" -``` - -The sole breaking change in the `3.x` series was an update to the `digest` -version, and in terms of non-breaking changes it includes: - -* support for using `alloc` instead of `std` on stable Rust, -* the Elligator2 encoding for Edwards points, -* a fix to use `packed_simd2`, -* various documentation fixes and improvements, -* support for configurably-sized, precomputed lookup tables for basepoint scalar - multiplication, -* two new formally-verified field arithmetic backends which use the Fiat Crypto - Rust code, which is generated from proofs of functional correctness checked by - the Coq theorem proving system, and -* support for explicitly calling the `zeroize` traits for all point types. - -The `2.x` series has API almost entirely unchanged from the `1.x` series, -except that: - -* an error in the data modeling for the (optional) `serde` feature was - corrected, so that when the `2.x`-series `serde` implementation is used - with `serde-bincode`, the derived serialization matches the usual X/Ed25519 - formats; -* the `rand` version was updated. - -See `CHANGELOG.md` for more details. - -# Backends and Features - -The `nightly` feature enables features available only when using a Rust nightly -compiler. In particular, it is required for rendering documentation and for -the SIMD backends. - -Curve arithmetic is implemented using one of the following backends: - -* a `u32` backend using serial formulas and `u64` products; -* a `u64` backend using serial formulas and `u128` products; -* an `avx2` backend using [parallel formulas][parallel_doc] and `avx2` instructions (sets speed records); -* an `ifma` backend using [parallel formulas][parallel_doc] and `ifma` instructions (sets speed records); - -By default the `u64` backend is selected. To select a specific backend, use: -```sh -cargo build --no-default-features --features "std u32_backend" -cargo build --no-default-features --features "std u64_backend" -# Requires nightly, RUSTFLAGS="-C target_feature=+avx2" to use avx2 -cargo build --no-default-features --features "std simd_backend" -# Requires nightly, RUSTFLAGS="-C target_feature=+avx512ifma" to use ifma -cargo build --no-default-features --features "std simd_backend" -``` -Crates using `curve25519-dalek` can either select a backend on behalf of their -users, or expose feature flags that control the `curve25519-dalek` backend. - -The `std` feature is enabled by default, but it can be disabled for no-`std` -builds using `--no-default-features`. Note that this requires explicitly -selecting an arithmetic backend using one of the `_backend` features. -If no backend is selected, compilation will fail. - -# Safety - -The `curve25519-dalek` types are designed to make illegal states -unrepresentable. For example, any instance of an `EdwardsPoint` is -guaranteed to hold a point on the Edwards curve, and any instance of a -`RistrettoPoint` is guaranteed to hold a valid point in the Ristretto -group. - -All operations are implemented using constant-time logic (no -secret-dependent branches, no secret-dependent memory accesses), -unless specifically marked as being variable-time code. -We believe that our constant-time logic is lowered to constant-time -assembly, at least on `x86_64` targets. - -As an additional guard against possible future compiler optimizations, -the `subtle` crate places an optimization barrier before every -conditional move or assignment. More details can be found in [the -documentation for the `subtle` crate][subtle_doc]. - -Some functionality (e.g., multiscalar multiplication or batch -inversion) requires heap allocation for temporary buffers. All -heap-allocated buffers of potentially secret data are explicitly -zeroed before release. - -However, we do not attempt to zero stack data, for two reasons. -First, it's not possible to do so correctly: we don't have control -over stack allocations, so there's no way to know how much data to -wipe. Second, because `curve25519-dalek` provides a mid-level API, -the correct place to start zeroing stack data is likely not at the -entrypoints of `curve25519-dalek` functions, but at the entrypoints of -functions in other crates. - -The implementation is memory-safe, and contains no significant -`unsafe` code. The SIMD backend uses `unsafe` internally to call SIMD -intrinsics. These are marked `unsafe` only because invoking them on an -inappropriate CPU would cause `SIGILL`, but the entire backend is only -compiled with appropriate `target_feature`s, so this cannot occur. - -# Performance - -Benchmarks are run using [`criterion.rs`][criterion]: - -```sh -cargo bench --no-default-features --features "std u32_backend" -cargo bench --no-default-features --features "std u64_backend" -# Uses avx2 or ifma only if compiled for an appropriate target. -export RUSTFLAGS="-C target_cpu=native" -cargo bench --no-default-features --features "std simd_backend" -``` - -Performance is a secondary goal behind correctness, safety, and -clarity, but we aim to be competitive with other implementations. - -# FFI - -Unfortunately, we have no plans to add FFI to `curve25519-dalek` directly. The -reason is that we use Rust features to provide an API that maintains safety -invariants, which are not possible to maintain across an FFI boundary. For -instance, as described in the _Safety_ section above, invalid points are -impossible to construct, and this would not be the case if we exposed point -operations over FFI. - -However, `curve25519-dalek` is designed as a *mid-level* API, aimed at -implementing other, higher-level primitives. Instead of providing FFI at the -mid-level, our suggestion is to implement the higher-level primitive (a -signature, PAKE, ZKP, etc) in Rust, using `curve25519-dalek` as a dependency, -and have that crate provide a minimal, byte-buffer-oriented FFI specific to -that primitive. - -# Contributing - -Please see [CONTRIBUTING.md][contributing]. - -Patches and pull requests should be make against the `develop` -branch, **not** `master`. - -# About - -**SPOILER ALERT:** *The Twelfth Doctor's first encounter with the Daleks is in -his second full episode, "Into the Dalek". A beleaguered ship of the "Combined -Galactic Resistance" has discovered a broken Dalek that has turned "good", -desiring to kill all other Daleks. The Doctor, Clara and a team of soldiers -are miniaturized and enter the Dalek, which the Doctor names Rusty. They -repair the damage, but accidentally restore it to its original nature, causing -it to go on the rampage and alert the Dalek fleet to the whereabouts of the -rebel ship. However, the Doctor manages to return Rusty to its previous state -by linking his mind with the Dalek's: Rusty shares the Doctor's view of the -universe's beauty, but also his deep hatred of the Daleks. Rusty destroys the -other Daleks and departs the ship, determined to track down and bring an end -to the Dalek race.* - -`curve25519-dalek` is authored by Isis Agora Lovecruft and Henry de Valence. - -Portions of this library were originally a port of [Adam Langley's -Golang ed25519 library](https://github.com/agl/ed25519), which was in -turn a port of the reference `ref10` implementation. Most of this code, -including the 32-bit field arithmetic, has since been rewritten. - -The fast `u32` and `u64` scalar arithmetic was implemented by Andrew Moon, and -the addition chain for scalar inversion was provided by Brian Smith. The -optimised batch inversion was contributed by Sean Bowe and Daira Hopwood. - -The `no_std` and `zeroize` support was contributed by Tony Arcieri. - -The formally verified backends, `fiat_u32_backend` and `fiat_u64_backend`, which -integrate with the Rust generated by the -[Fiat Crypto project](https://github.com/mit-plv/fiat-crypto) were contributed -by François Garillot. - -Thanks also to Ashley Hauck, Lucas Salibian, Manish Goregaokar, Jack Grigg, -Pratyush Mishra, Michael Rosenberg, and countless others for their -contributions. - -[ed25519-dalek]: https://github.com/dalek-cryptography/ed25519-dalek -[x25519-dalek]: https://github.com/dalek-cryptography/x25519-dalek -[contributing]: https://github.com/dalek-cryptography/curve25519-dalek/blob/master/CONTRIBUTING.md -[docs-external]: https://doc.dalek.rs/curve25519_dalek/ -[docs-internal]: https://doc-internal.dalek.rs/curve25519_dalek/ -[criterion]: https://github.com/japaric/criterion.rs -[parallel_doc]: https://doc-internal.dalek.rs/curve25519_dalek/backend/vector/avx2/index.html -[subtle_doc]: https://doc.dalek.rs/subtle/ +Please refer to the [original repository](https://github.com/dalek-cryptography/curve25519-dalek) for more information. +We plan to switch to curve25519-dalek 4.0 when it is ready. \ No newline at end of file diff --git a/benches/dalek_benchmarks.rs b/benches/dalek_benchmarks.rs index 41339ac4..256c07e5 100644 --- a/benches/dalek_benchmarks.rs +++ b/benches/dalek_benchmarks.rs @@ -12,10 +12,10 @@ use criterion::BatchSize; use criterion::Criterion; use criterion::{BenchmarkGroup, BenchmarkId}; -extern crate curve25519_dalek; +extern crate noah_curve25519_dalek; -use curve25519_dalek::constants; -use curve25519_dalek::scalar::Scalar; +use noah_curve25519_dalek::constants; +use noah_curve25519_dalek::scalar::Scalar; static BATCH_SIZES: [usize; 5] = [1, 2, 4, 8, 16]; static MULTISCALAR_SIZES: [usize; 13] = [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 768, 1024]; @@ -23,7 +23,7 @@ static MULTISCALAR_SIZES: [usize; 13] = [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, mod edwards_benches { use super::*; - use curve25519_dalek::edwards::EdwardsPoint; + use noah_curve25519_dalek::edwards::EdwardsPoint; fn compress(c: &mut Criterion) { let B = &constants::ED25519_BASEPOINT_POINT; @@ -80,11 +80,11 @@ mod edwards_benches { mod multiscalar_benches { use super::*; - use curve25519_dalek::edwards::EdwardsPoint; - use curve25519_dalek::edwards::VartimeEdwardsPrecomputation; - use curve25519_dalek::traits::MultiscalarMul; - use curve25519_dalek::traits::VartimeMultiscalarMul; - use curve25519_dalek::traits::VartimePrecomputedMultiscalarMul; + use noah_curve25519_dalek::edwards::EdwardsPoint; + use noah_curve25519_dalek::edwards::VartimeEdwardsPrecomputation; + use noah_curve25519_dalek::traits::MultiscalarMul; + use noah_curve25519_dalek::traits::VartimeMultiscalarMul; + use noah_curve25519_dalek::traits::VartimePrecomputedMultiscalarMul; fn construct_scalars(n: usize) -> Vec { let mut rng = thread_rng(); @@ -98,10 +98,6 @@ mod multiscalar_benches { .collect() } - fn construct(n: usize) -> (Vec, Vec) { - (construct_scalars(n), construct_points(n)) - } - fn consttime_multiscalar_mul(c: &mut BenchmarkGroup) { for multiscalar_size in &MULTISCALAR_SIZES { c.bench_with_input( @@ -249,7 +245,7 @@ mod multiscalar_benches { mod ristretto_benches { use super::*; - use curve25519_dalek::ristretto::RistrettoPoint; + use noah_curve25519_dalek::ristretto::RistrettoPoint; fn compress(c: &mut Criterion) { c.bench_function("RistrettoPoint compression", |b| { diff --git a/docs/assets/dalek-logo-clear.png b/docs/assets/dalek-logo-clear.png deleted file mode 100644 index d3170d80..00000000 Binary files a/docs/assets/dalek-logo-clear.png and /dev/null differ diff --git a/docs/assets/dalek-logo.png b/docs/assets/dalek-logo.png deleted file mode 100644 index 83d6a0c5..00000000 Binary files a/docs/assets/dalek-logo.png and /dev/null differ diff --git a/docs/assets/dalek-logo.svg b/docs/assets/dalek-logo.svg deleted file mode 100644 index 3e87a44b..00000000 --- a/docs/assets/dalek-logo.svg +++ /dev/null @@ -1 +0,0 @@ -dalek \ No newline at end of file diff --git a/docs/assets/rustdoc-include-katex-header.html b/docs/assets/rustdoc-include-katex-header.html deleted file mode 100644 index bc4e3d8a..00000000 --- a/docs/assets/rustdoc-include-katex-header.html +++ /dev/null @@ -1,10 +0,0 @@ - - - - - diff --git a/docs/avx2-notes.md b/docs/avx2-notes.md deleted file mode 100644 index ccb50223..00000000 --- a/docs/avx2-notes.md +++ /dev/null @@ -1,140 +0,0 @@ -An AVX2 implementation of the vectorized point operation strategy. - -# Field element representation - -Our strategy is to implement 4-wide multiplication and squaring by -wordslicing, using one 64-bit AVX2 lane for each field element. Field -elements are represented in the usual way as 10 `u32` limbs in radix -\\(25.5\\) (i.e., alternating between \\(2\^{26}\\) for even limbs and -\\(2\^{25}\\) for odd limbs). This has the effect that passing between -the parallel 32-bit AVX2 representation and the serial 64-bit -representation (which uses radix \\(2^{51}\\)) amounts to regrouping -digits. - -The field element representation is oriented around the AVX2 -`vpmuludq` instruction, which multiplies the low 32 bits of each -64-bit lane of each operand to produce a 64-bit result. - -```text,no_run -(a1 ?? b1 ?? c1 ?? d1 ??) -(a2 ?? b2 ?? c2 ?? d2 ??) - -(a1*a2 b1*b2 c1*c2 d1*d2) -``` - -To unpack 32-bit values into 64-bit lanes for use in multiplication -it would be convenient to use the `vpunpck[lh]dq` instructions, -which unpack and interleave the low and high 32-bit lanes of two -source vectors. -However, the AVX2 versions of these instructions are designed to -operate only within 128-bit lanes of the 256-bit vectors, so that -interleaving the low lanes of `(a0 b0 c0 d0 a1 b1 c1 d1)` with zero -gives `(a0 00 b0 00 a1 00 b1 00)`. Instead, we pre-shuffle the data -layout as `(a0 b0 a1 b1 c0 d0 c1 d1)` so that we can unpack the -"low" and "high" parts as - -```text,no_run -(a0 00 b0 00 c0 00 d0 00) -(a1 00 b1 00 c1 00 d1 00) -``` - -The data layout for a vector of four field elements \\( (a,b,c,d) -\\) with limbs \\( a_0, a_1, \ldots, a_9 \\) is as `[u32x8; 5]` in -the form - -```text,no_run -(a0 b0 a1 b1 c0 d0 c1 d1) -(a2 b2 a3 b3 c2 d2 c3 d3) -(a4 b4 a5 b5 c4 d4 c5 d5) -(a6 b6 a7 b7 c6 d6 c7 d7) -(a8 b8 a9 b9 c8 d8 c9 d9) -``` - -Since this breaks cleanly into two 128-bit lanes, it may be possible -to adapt it to 128-bit vector instructions such as NEON without too -much difficulty. - -# Avoiding Overflow in Doubling - -To analyze the size of the field element coefficients during the -computations, we can parameterize the bounds on the limbs of each -field element by \\( b \in \mathbb R \\) representing the excess bits -above that limb's radix, so that each limb is bounded by either -\\(2\^{25+b} \\) or \\( 2\^{26+b} \\), as appropriate. - -The multiplication routine requires that its inputs are bounded with -\\( b < 1.75 \\), in order to fit a multiplication by \\( 19 \\) -into 32 bits. Since \\( \lg 19 < 4.25 \\), \\( 19x < 2\^{32} \\) -when \\( x < 2\^{27.75} = 2\^{26 + 1.75} \\). However, this is only -required for one of the inputs; the other can grow up to \\( b < 2.5 -\\). - -In addition, the multiplication and squaring routines do not -canonically reduce their outputs, but can leave some small uncarried -excesses, so that their reduced outputs are bounded with -\\( b < 0.007 \\). - -The non-parallel portion of the doubling formulas is -$$ -\begin{aligned} -(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 ) -&\gets -(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4) -\end{aligned} -$$ - -Computing \\( (S\_5, S\_6, S\_8, S\_9 ) \\) as -$$ -\begin{matrix} - & S\_1 & S\_1 & S\_1 & S\_1 \\\\ -+& S\_2 & & & S\_2 \\\\ -+& & & S\_3 & \\\\ -+& & & S\_3 & \\\\ -+& & 2p & 2p & 2p \\\\ --& & S\_2 & S\_2 & \\\\ --& & & & S\_4 \\\\ -=& S\_5 & S\_6 & S\_8 & S\_9 -\end{matrix} -$$ -results in bit-excesses \\( < (1.01, 1.60, 2.33, 2.01)\\) for -\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute -are then -$$ -\begin{aligned} -X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 2.01) \\\\ -Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\ -Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\ -T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 2.01) -\end{aligned} -$$ -which are too large: it's not possible to arrange the multiplicands so -that one vector has \\(b < 2.5\\) and the other has \\( b < 1.75 \\). -However, if we flip the sign of \\( S\_4 = S\_0\^2 \\) during -squaring, so that we output \\(S\_4' = -S\_4 \pmod p\\), then we can -compute -$$ -\begin{matrix} - & S\_1 & S\_1 & S\_1 & S\_1 \\\\ -+& S\_2 & & & S\_2 \\\\ -+& & & S\_3 & \\\\ -+& & & S\_3 & \\\\ -+& & & & S\_4' \\\\ -+& & 2p & 2p & \\\\ --& & S\_2 & S\_2 & \\\\ -=& S\_5 & S\_6 & S\_8 & S\_9 -\end{matrix} -$$ -resulting in bit-excesses \\( < (1.01, 1.60, 2.33, 1.60)\\) for -\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute -are then -$$ -\begin{aligned} -X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 1.60) \\\\ -Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\ -Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\ -T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 1.60) -\end{aligned} -$$ -whose right-hand sides are all bounded with \\( b < 1.75 \\) and -whose left-hand sides are all bounded with \\( b < 2.5 \\), -so that we can avoid any intermediate reductions. diff --git a/docs/ifma-notes.md b/docs/ifma-notes.md deleted file mode 100644 index c6fd3b3a..00000000 --- a/docs/ifma-notes.md +++ /dev/null @@ -1,580 +0,0 @@ -An AVX512-IFMA implementation of the vectorized point operation -strategy. - -# IFMA instructions - -AVX512-IFMA is an extension to AVX-512 consisting of two instructions: - -* `vpmadd52luq`: packed multiply of unsigned 52-bit integers and add - the low 52 product bits to 64-bit accumulators; -* `vpmadd52huq`: packed multiply of unsigned 52-bit integers and add - the high 52 product bits to 64-bit accumulators; - -These operate on 64-bit lanes of their source vectors, taking the low -52 bits of each lane of each source vector, computing the 104-bit -products of each pair, and then adding either the high or low 52 bits -of the 104-bit products to the 64-bit lanes of the destination vector. -The multiplication is performed internally by reusing circuitry for -floating-point arithmetic. Although these instructions are part of -AVX512, the AVX512VL (vector length) extension (present whenever IFMA -is) allows using them with 512, 256, or 128-bit operands. - -This provides a major advantage to vectorized integer operations: -previously, vector operations could only use a \\(32 \times 32 -\rightarrow 64\\)-bit multiplier, while serial code could use a -\\(64\times 64 \rightarrow 128\\)-bit multiplier. - -## IFMA for big-integer multiplications - -A detailed example of the intended use of the IFMA instructions can be -found in a 2016 paper by Gueron and Krasnov, [_Accelerating Big -Integer Arithmetic Using Intel IFMA Extensions_][2016_gueron_krasnov]. -The basic idea is that multiplication of large integers (such as 1024, -2048, or more bits) can be performed as follows. - -First, convert a “packed” 64-bit representation -\\[ -\begin{aligned} -x &= x'_0 + x'_1 2^{64} + x'_2 2^{128} + \cdots \\\\ -y &= y'_0 + y'_1 2^{64} + y'_2 2^{128} + \cdots -\end{aligned} -\\] -into a “redundant” 52-bit representation -\\[ -\begin{aligned} -x &= x_0 + x_1 2^{52} + x_2 2^{104} + \cdots \\\\ -y &= y_0 + y_1 2^{52} + y_2 2^{104} + \cdots -\end{aligned} -\\] -with each \\(x_i, y_j\\) in a 64-bit lane. - -Writing the product as \\(z = z_0 + z_1 2^{52} + z_2 2^{104} + \cdots\\), -the “schoolbook” multiplication strategy gives -\\[ -\begin{aligned} -&z_0 &&=& x_0 & y_0 & & & & & & & & \\\\ -&z_1 &&=& x_1 & y_0 &+ x_0 & y_1 & & & & & & \\\\ -&z_2 &&=& x_2 & y_0 &+ x_1 & y_1 &+ x_0 & y_2 & & & & \\\\ -&z_3 &&=& x_3 & y_0 &+ x_2 & y_1 &+ x_1 & y_2 &+ x_0 & y_3 & & \\\\ -&z_4 &&=& \vdots\\;&\\;\vdots &+ x_3 & y_1 &+ x_2 & y_2 &+ x_1 & y_3 &+ \cdots& \\\\ -&z_5 &&=& & & \vdots\\;&\\;\vdots &+ x_3 & y_2 &+ x_2 & y_3 &+ \cdots& \\\\ -&z_6 &&=& & & & & \vdots\\;&\\;\vdots &+ x_3 & y_3 &+ \cdots& \\\\ -&z_7 &&=& & & & & & & \vdots\\;&\\;\vdots &+ \cdots& \\\\ -&\vdots&&=& & & & & & & & & \ddots& \\\\ -\end{aligned} -\\] -Notice that the product coefficient \\(z_k\\), representing the value -\\(z_k 2^{52k}\\), is the sum of all product terms -\\( -(x_i 2^{52 i}) (y_j 2^{52 j}) -\\) -with \\(k = i + j\\). -Write the IFMA operators \\(\mathrm{lo}(a,b)\\), denoting the low -\\(52\\) bits of \\(ab\\), and -\\(\mathrm{hi}(a,b)\\), denoting the high \\(52\\) bits of -\\(ab\\). -Now we can rewrite the product terms as -\\[ -\begin{aligned} -(x_i 2^{52 i}) (y_j 2^{52 j}) -&= -2^{52 (i+j)}( -\mathrm{lo}(x_i, y_j) + -\mathrm{hi}(x_i, y_j) 2^{52} -) -\\\\ -&= -\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} + -\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)}. -\end{aligned} -\\] -This means that the low half of \\(x_i y_j\\) can be accumulated onto -the product limb \\(z_{i+j}\\) and the high half can be directly -accumulated onto the next-higher product limb \\(z_{i+j+1}\\) with no -additional operations. This allows rewriting the schoolbook -multiplication into the form -\\[ -\begin{aligned} -&z_0 &&=& \mathrm{lo}(x_0,&y_0) & & & & & & & & & & \\\\ -&z_1 &&=& \mathrm{lo}(x_1,&y_0) &+\mathrm{hi}(x_0,&y_0) &+\mathrm{lo}(x_0,&y_1) & & & & & & \\\\ -&z_2 &&=& \mathrm{lo}(x_2,&y_0) &+\mathrm{hi}(x_1,&y_0) &+\mathrm{lo}(x_1,&y_1) &+\mathrm{hi}(x_0,&y_1) &+\mathrm{lo}(x_0,&y_2) & & \\\\ -&z_3 &&=& \mathrm{lo}(x_3,&y_0) &+\mathrm{hi}(x_2,&y_0) &+\mathrm{lo}(x_2,&y_1) &+\mathrm{hi}(x_1,&y_1) &+\mathrm{lo}(x_1,&y_2) &+ \cdots& \\\\ -&z_4 &&=& \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_0) &+\mathrm{lo}(x_3,&y_1) &+\mathrm{hi}(x_2,&y_1) &+\mathrm{lo}(x_2,&y_2) &+ \cdots& \\\\ -&z_5 &&=& & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_1) &+\mathrm{lo}(x_3,&y_2) &+ \cdots& \\\\ -&z_6 &&=& & & & & & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+ \cdots& \\\\ -&\vdots&&=& & & & & & & & & & & \ddots& \\\\ -\end{aligned} -\\] -Gueron and Krasnov implement multiplication by constructing vectors -out of the columns of this diagram, so that the source operands for -the IFMA instructions are of the form \\((x_0, x_1, x_2, \ldots)\\) -and \\((y_i, y_i, y_i, \ldots)\\). -After performing the multiplication, -the product terms \\(z_i\\) are then repacked into a 64-bit representation. - -## An alternative strategy - -The strategy described above is aimed at big-integer multiplications, -such as 1024, 2048, or 4096 bits, which would be used for applications -like RSA. However, elliptic curve cryptography uses much smaller field -sizes, such as 256 or 384 bits, so a different strategy is needed. - -The parallel Edwards formulas provide parallelism at the level of the -formulas for curve operations. This means that instead of scanning -through the terms of the source operands and parallelizing *within* a -field element (as described above), we can arrange the computation in -product-scanning form and parallelize *across* field elements (as -described below). - -The parallel Edwards -formulas provide 4-way parallelism, so they can be implemented using -256-bit vectors using a single 64-bit lane for each element, or using -512-bit vectors using two 64-bit lanes. -The only available CPU supporting IFMA (the -i3-8121U) executes 512-bit IFMA instructions at half rate compared to -256-bit instructions, so for now there's no throughput advantage to -using 512-bit IFMA instructions, and this implementation uses 256-bit -vectors. - -To extend this to 512-bit vectors, it's only only necessary to achieve -2-way parallelism, and it's possible (with a small amount of overhead) -to create a hybrid strategy that operates entirely within 128-bit -lanes. This means that cross-lane operations can use the faster -`vpshufd` (1c latency) instead of a general shuffle instruction (3c -latency). - -# Choice of radix - -The inputs to IFMA instructions are 52 bits wide, so the radix \\(r\\) -used to represent a multiprecision integer must be \\( r \leq 52 \\). -The obvious choice is the "native" radix \\(r = 52\\). - -As described above, this choice -has the advantage that for \\(x_i, y_j \in [0,2^{52})\\), the product term -\\[ -\begin{aligned} -(x_i 2^{52 i}) (y_j 2^{52 j}) -&= -2^{52 (i+j)}( -\mathrm{lo}(x_i, y_j) + -\mathrm{hi}(x_i, y_j) 2^{52} -) -\\\\ -&= -\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} + -\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)}, -\end{aligned} -\\] -so that the low and high halves of the product can be directly accumulated -onto the product limbs. -In contrast, when using a smaller radix \\(r = 52 - k\\), -the product term has the form -\\[ -\begin{aligned} -(x_i 2^{r i}) (y_j 2^{r j}) -&= -2^{r (i+j)}( -\mathrm{lo}(x_i, y_j) + -\mathrm{hi}(x_i, y_j) 2^{52} -) -\\\\ -&= -\mathrm{lo}(x_i, y_j) 2^{r (i+j)} + -( -\mathrm{hi}(x_i, y_j) 2^k -) -2^{r (i+j+1)}. -\end{aligned} -\\] -What's happening is that the product \\(x_i y_j\\) of size \\(2r\\) -bits is split not at \\(r\\) but at \\(52\\), so \\(k\\) product bits -are placed into the low half instead of the high half. This means -that the high half of the product cannot be directly accumulated onto -\\(z_{i+j+1}\\), but must first be multiplied by \\(2^k\\) (i.e., left -shifted by \\(k\\)). In addition, the low half of the product is -\\(52\\) bits large instead of \\(r\\) bits. - -## Handling offset product terms - -[Drucker and Gueron][2018_drucker_gueron] analyze the choice of radix -in the context of big-integer squaring, outlining three ways to handle -the offset product terms, before concluding that all of them are -suboptimal: - -1. Shift the results after accumulation; -2. Shift the input operands before multiplication; -3. Split the MAC operation, accumulating into a zeroed register, - shifting the result, and then adding. - -The first option is rejected because it could double-shift some -previously accumulated terms, the second doesn't work because the -inputs could become larger than \\(52\\) bits, and the third requires -additional instructions to handle the shifting and adding. - -Based on an analysis of total number of instructions, they suggest an -addition to the instruction set, which they call `FMSA` (fused -multiply-shift-add). This would shift the result according to an 8-bit -immediate value before accumulating it into the destination register. - -However, this change to the instruction set doesn't seem to be -necessary. Instead, the product terms can be grouped according to -their coefficients, accumulated together, then shifted once before -adding them to the final sum. This uses an extra register, shift, and -add, but only once per product term (accumulation target), not once -per source term (as in the Drucker-Gueron paper). - -Moreover, because IFMA instructions execute only on two ports -(presumably 0 and 1), while adds and shifts can execute on three ports -(0, 1, and 5), the adds and shifts can execute independently of the -IFMA operations, as long as there is not too much pressure on port 5. -This means that, although the total number of instructions increases, -the shifts and adds do not necessarily increase the execution time, as -long as throughput is limited by IFMA operations. - -Finally, because IFMA instructions have 4 cycle latency and 0.5/1 -cycle throughput (for 256/512 bit vectors), maximizing IFMA throughput -requires either 8 (for 256) or 4 (for 512) independent operations. So -accumulating groups of terms independently before adding them at the -end may be necessary anyways, in order to prevent long chains of -dependent instructions. - -## Advantages of a smaller radix - -Using a smaller radix has other advantages. Although radix \\(52\\) -is an unsaturated representation from the point of view of the -\\(64\\)-bit accumulators (because up to 4096 product terms can be -accumulated without carries), it's a saturated representation from the -point of view of the multiplier (since \\(52\\)-bit values are the -maximum input size). - -Because the inputs to a multiplication must have all of their limbs -bounded by \\(2^{52}\\), limbs in excess of \\(2^{52}\\) must be -reduced before they can be used as an input. The -[Gueron-Krasnov][2016_gueron_krasnov] paper suggests normalizing -values using a standard, sequential carry chain: for each limb, add -the carryin from reducing the previous limb, compute the carryout and -reduce the current limb, then move to the next limb. - -However, when using a smaller radix, such as \\(51\\), each limb can -store a carry bit and still be used as the input to a multiplication. -This means that the inputs do not need to be normalized, and instead -of using a sequential carry chain, we can compute all carryouts in -parallel, reduce all limbs in parallel, and then add the carryins in -parallel (possibly growing the limb values by one bit). - -Because the output of this partial reduction is an acceptable -multiplication input, we can "close the loop" using partial reductions -and never have to normalize to a canonical representation through the -entire computation, in contrast to the Gueron-Krasnov approach, which -converts back to a packed representation after every operation. (This -idea seems to trace back to at least as early as [this 1999 -paper][1999_walter]). - -Using \\(r = 51\\) is enough to keep a carry bit in each limb and -avoid normalizations. What about an even smaller radix? One reason -to choose a smaller radix would be to align the limb boundaries with -an inline reduction (for instance, choosing \\(r = 43\\) for the -Mersenne field \\(p = 2^{127} - 1\\)), but for \\(p = 2^{255 - 19}\\), -\\(r = 51 = 255/5\\) is the natural choice. - -# Multiplication - -The inputs to a multiplication are two field elements -\\[ -\begin{aligned} -x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204} \\\\ -y &= y_0 + y_1 2^{51} + y_2 2^{102} + y_3 2^{153} + y_4 2^{204}, -\end{aligned} -\\] -with limbs in range \\([0,2^{52})\\). - -Writing the product terms as -\\[ -\begin{aligned} -z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\ - &+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459}, -\end{aligned} -\\] -a schoolbook multiplication in product scanning form takes the form -\\[ -\begin{aligned} -z_0 &= x_0 y_0 \\\\ -z_1 &= x_1 y_0 + x_0 y_1 \\\\ -z_2 &= x_2 y_0 + x_1 y_1 + x_0 y_2 \\\\ -z_3 &= x_3 y_0 + x_2 y_1 + x_1 y_2 + x_0 y_3 \\\\ -z_4 &= x_4 y_0 + x_3 y_1 + x_2 y_2 + x_1 y_3 + x_0 y_4 \\\\ -z_5 &= x_4 y_1 + x_3 y_2 + x_2 y_3 + x_1 y_4 \\\\ -z_6 &= x_4 y_2 + x_3 y_3 + x_2 y_4 \\\\ -z_7 &= x_4 y_3 + x_3 y_4 \\\\ -z_8 &= x_4 y_4 \\\\ -z_9 &= 0 \\\\ -\end{aligned} -\\] -Each term \\(x_i y_j\\) can be written in terms of IFMA operations as -\\[ -x_i y_j = \mathrm{lo}(x_i,y_j) + 2\mathrm{hi}(x_i,y_j)2^{51}. -\\] -Substituting this equation into the schoolbook multiplication, then -moving terms to eliminate the \\(2^{51}\\) factors gives -\\[ -\begin{aligned} -z_0 &= \mathrm{lo}(x_0, y_0) \\\\ - &+ \qquad 0 \\\\ -z_1 &= \mathrm{lo}(x_1, y_0) + \mathrm{lo}(x_0, y_1) \\\\ - &+ \qquad 2( \mathrm{hi}(x_0, y_0) )\\\\ -z_2 &= \mathrm{lo}(x_2, y_0) + \mathrm{lo}(x_1, y_1) + \mathrm{lo}(x_0, y_2) \\\\ - &+ \qquad 2( \mathrm{hi}(x_1, y_0) + \mathrm{hi}(x_0, y_1) )\\\\ -z_3 &= \mathrm{lo}(x_3, y_0) + \mathrm{lo}(x_2, y_1) + \mathrm{lo}(x_1, y_2) + \mathrm{lo}(x_0, y_3) \\\\ - &+ \qquad 2( \mathrm{hi}(x_2, y_0) + \mathrm{hi}(x_1, y_1) + \mathrm{hi}(x_0, y_2) )\\\\ -z_4 &= \mathrm{lo}(x_4, y_0) + \mathrm{lo}(x_3, y_1) + \mathrm{lo}(x_2, y_2) + \mathrm{lo}(x_1, y_3) + \mathrm{lo}(x_0, y_4) \\\\ - &+ \qquad 2( \mathrm{hi}(x_3, y_0) + \mathrm{hi}(x_2, y_1) + \mathrm{hi}(x_1, y_2) + \mathrm{hi}(x_0, y_3) )\\\\ -z_5 &= \mathrm{lo}(x_4, y_1) + \mathrm{lo}(x_3, y_2) + \mathrm{lo}(x_2, y_3) + \mathrm{lo}(x_1, y_4) \\\\ - &+ \qquad 2( \mathrm{hi}(x_4, y_0) + \mathrm{hi}(x_3, y_1) + \mathrm{hi}(x_2, y_2) + \mathrm{hi}(x_1, y_3) + \mathrm{hi}(x_0, y_4) )\\\\ -z_6 &= \mathrm{lo}(x_4, y_2) + \mathrm{lo}(x_3, y_3) + \mathrm{lo}(x_2, y_4) \\\\ - &+ \qquad 2( \mathrm{hi}(x_4, y_1) + \mathrm{hi}(x_3, y_2) + \mathrm{hi}(x_2, y_3) + \mathrm{hi}(x_1, y_4) )\\\\ -z_7 &= \mathrm{lo}(x_4, y_3) + \mathrm{lo}(x_3, y_4) \\\\ - &+ \qquad 2( \mathrm{hi}(x_4, y_2) + \mathrm{hi}(x_3, y_3) + \mathrm{hi}(x_2, y_4) )\\\\ -z_8 &= \mathrm{lo}(x_4, y_4) \\\\ - &+ \qquad 2( \mathrm{hi}(x_4, y_3) + \mathrm{hi}(x_3, y_4) )\\\\ -z_9 &= 0 \\\\ - &+ \qquad 2( \mathrm{hi}(x_4, y_4) )\\\\ -\end{aligned} -\\] -As noted above, our strategy will be to multiply and accumulate the -terms with coefficient \\(2\\) separately from those with coefficient -\\(1\\), before combining them at the end. This can alternately be -thought of as accumulating product terms into a *doubly-redundant* -representation, with two limbs for each digit, before collapsing -the doubly-redundant representation by shifts and adds. - -This computation requires 25 `vpmadd52luq` and 25 `vpmadd52huq` -operations. For 256-bit vectors, IFMA operations execute on an -i3-8121U with latency 4 cycles, throughput 0.5 cycles, so executing 50 -instructions requires 25 cycles' worth of throughput. Accumulating -terms with coefficient \\(1\\) and \\(2\\) seperately means that the -longest dependency chain has length 5, so the critical path has length -20 cycles and the bottleneck is throughput. - -# Reduction modulo \\(p\\) - -The next question is how to handle the reduction modulo \\(p\\). -Because \\(p = 2^{255} - 19\\), \\(2^{255} = 19 \pmod p\\), so we can -alternately write -\\[ -\begin{aligned} -z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\ - &+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459} -\end{aligned} -\\] -as -\\[ -\begin{aligned} -z &= (z_0 + 19z_5) + (z_1 + 19z_6) 2^{51} + (z_2 + 19z_7) 2^{102} + (z_3 + 19z_8) 2^{153} + (z_4 + 19z_9) 2^{204}. -\end{aligned} -\\] -When using a \\(64 \times 64 \rightarrow 128\\)-bit multiplier, this -can be handled (as in [Ed25519][ed25519_paper]) by premultiplying -source terms by \\(19\\). Since \\(\lg(19) < 4.25\\), this increases -their size by less than \\(4.25\\) bits, and the rest of the -multiplication can be shown to work out. - -Here, we have at most \\(1\\) bit of headroom. In order to allow -premultiplication, we would need to use radix \\(2^{47}\\), which -would require six limbs instead of five. Instead, we compute the high -terms \\(z_5, \ldots, z_9\\), each using two chains of IFMA -operations, then multiply by \\(19\\) and combine with the lower terms -\\(z_0, \ldots, z_4\\). There are two ways to perform the -multiplication by \\(19\\): using more IFMA operations, or using the -`vpmullq` instruction, which computes the low \\(64\\) bits of a \\(64 -\times 64\\)-bit product. However, `vpmullq` has 15c/1.5c -latency/throughput, in contrast to the 4c/0.5c latency/throughput of -IFMA operations, so it seems like a worse choice. - -The high terms \\(z_5, \ldots, z_9\\) are sums of \\(52\\)-bit terms, -so they are larger than \\(52\\) bits. Write these terms in radix \\(52\\) as -\\[ -z_{5+i} = z_{5+i}' + z_{5+i}'' 2^{52}, \qquad z_{5+i}' < 2^{52}. -\\] -Then the contribution of \\(z_{5+i}\\), taken modulo \\(p\\), is -\\[ -\begin{aligned} -z_{5+i} 2^{255} 2^{51 i} -&= -19 (z_{5+i}' + z_{5+i}'' 2^{52}) 2^{51 i} -\\\\ -&= -19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)} -\\\\ -\end{aligned} -\\] -The products \\(19 z_{5+i}', 19 z_{5+i}''\\) can be written in terms of IFMA operations as -\\[ -\begin{aligned} -19 z_{5+i}' &= \mathrm{lo}(19, z_{5+i}') + 2 \mathrm{hi}(19, z_{5+i}') 2^{51}, \\\\ -19 z_{5+i}'' &= \mathrm{lo}(19, z_{5+i}'') + 2 \mathrm{hi}(19, z_{5+i}'') 2^{51}. \\\\ -\end{aligned} -\\] -Because \\(z_{5+i} < 2^{64}\\), \\(z_{5+i}'' < 2^{12} \\), so \\(19 -z_{5+i}'' < 2^{17} < 2^{52} \\) and \\(\mathrm{hi}(19, z_{5+i}'') = 0\\). -Because IFMA operations ignore the high bits of their source -operands, we do not need to compute \\(z\_{5+i}'\\) explicitly: -the high bits will be ignored. -Combining these observations, we can write -\\[ -\begin{aligned} -z_{5+i} 2^{255} 2^{51 i} -&= -19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)} -\\\\ -&= -\mathrm{lo}(19, z_{5+i}) 2^{51 i} -\+ 2 \mathrm{hi}(19, z_{5+i}) 2^{51 (i+1)} -\+ 2 \mathrm{lo}(19, z_{5+i}/2^{52}) 2^{51 (i+1)}. -\end{aligned} -\\] - -For \\(i = 0,1,2,3\\), this allows reducing \\(z_{5+i}\\) onto -\\(z_{i}, z_{i+1}\\), and if the low terms are computed using a -doubly-redundant representation, no additional shifts are needed to -handle the \\(2\\) coefficients. For \\(i = 4\\), there's a -complication: the contribution becomes -\\[ -\begin{aligned} -z_{9} 2^{255} 2^{204} -&= -\mathrm{lo}(19, z_{9}) 2^{204} -\+ 2 \mathrm{hi}(19, z_{9}) 2^{255} -\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 2^{255} -\\\\ -&= -\mathrm{lo}(19, z_{9}) 2^{204} -\+ 2 \mathrm{hi}(19, z_{9}) 19 -\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 19 -\\\\ -&= -\mathrm{lo}(19, z_{9}) 2^{204} -\+ 2 -\mathrm{lo}(19, \mathrm{hi}(19, z_{9}) + \mathrm{lo}(19, z_{9}/2^{52})). -\\\\ -\end{aligned} -\\] - -It would be possible to cut the number of multiplications from 3 to 2 -by carrying the high part of each \\(z_i\\) onto \\(z_{i+1}\\). This -would eliminate 5 multiplications, clearing 2.5 cycles of port -pressure, at the cost of 5 additions, adding 1.66 cycles of port -pressure. But doing this would create a dependency between terms -(e.g., \\(z_{5}\\) must be computed before the reduction of -\\(z_{6}\\) can begin), whereas with the approach above, all -contributions to all terms are computed independently, to maximize ILP -and flexibility for the processor to schedule instructions. - -This strategy performs 16 IFMA operations, adding two IFMA operations -to each of the \\(2\\)-coefficient terms and one to each of the -\\(1\\)-coefficient terms. Considering the multiplication and -reduction together, we use 66 IFMA operations, requiring 33 cycles' -throughput, while the longest chain of IFMA operations is in the -reduction of \\(z_5\\) onto \\(z_1\\), of length 7 (so 28 cycles, plus -2 cycles to combine the two parts of \\(z_5\\), and the bottleneck is -again throughput. - -Once this is done, we have computed the product terms -\\[ -z = z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204}, -\\] -without reducing the \\(z_i\\) to fit in \\(52\\) bits. Because the -overall flow of operations alternates multiplications and additions or -subtractions, we would have to perform a reduction after an addition -but before the next multiplication anyways, so there's no benefit to -fully reducing the limbs at the end of a multiplication. Instead, we -leave them unreduced, and track the reduction state using the type -system to ensure that unreduced limbs are not accidentally used as an -input to a multiplication. - -# Squaring - -Squaring operates similarly to multiplication, but with the -possibility to combine identical terms. -As before, we write the input as -\\[ -\begin{aligned} -x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204} -\end{aligned} -\\] -with limbs in range \\([0,2^{52})\\). -Writing the product terms as -\\[ -\begin{aligned} -z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\ - &+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459}, -\end{aligned} -\\] -a schoolbook squaring in product scanning form takes the form -\\[ -\begin{aligned} -z_0 &= x_0 x_0 \\\\ -z_1 &= 2 x_1 x_0 \\\\ -z_2 &= 2 x_2 x_0 + x_1 x_1 \\\\ -z_3 &= 2 x_3 x_0 + 2 x_2 x_1 \\\\ -z_4 &= 2 x_4 x_0 + 2 x_3 x_1 + x_2 x_2 \\\\ -z_5 &= 2 x_4 x_1 + 2 x_3 x_2 \\\\ -z_6 &= 2 x_4 x_2 + x_3 x_3 \\\\ -z_7 &= 2 x_4 x_3 \\\\ -z_8 &= x_4 x_4 \\\\ -z_9 &= 0 \\\\ -\end{aligned} -\\] -As before, we write \\(x_i x_j\\) as -\\[ -x_i x_j = \mathrm{lo}(x_i,x_j) + 2\mathrm{hi}(x_i,x_j)2^{51}, -\\] -and substitute to obtain -\\[ -\begin{aligned} -z_0 &= \mathrm{lo}(x_0, x_0) + 0 \\\\ -z_1 &= 2 \mathrm{lo}(x_1, x_0) + 2 \mathrm{hi}(x_0, x_0) \\\\ -z_2 &= 2 \mathrm{lo}(x_2, x_0) + \mathrm{lo}(x_1, x_1) + 4 \mathrm{hi}(x_1, x_0) \\\\ -z_3 &= 2 \mathrm{lo}(x_3, x_0) + 2 \mathrm{lo}(x_2, x_1) + 4 \mathrm{hi}(x_2, x_0) + 2 \mathrm{hi}(x_1, x_1) \\\\ -z_4 &= 2 \mathrm{lo}(x_4, x_0) + 2 \mathrm{lo}(x_3, x_1) + \mathrm{lo}(x_2, x_2) + 4 \mathrm{hi}(x_3, x_0) + 4 \mathrm{hi}(x_2, x_1) \\\\ -z_5 &= 2 \mathrm{lo}(x_4, x_1) + 2 \mathrm{lo}(x_3, x_2) + 4 \mathrm{hi}(x_4, x_0) + 4 \mathrm{hi}(x_3, x_1) + 2 \mathrm{hi}(x_2, x_2) \\\\ -z_6 &= 2 \mathrm{lo}(x_4, x_2) + \mathrm{lo}(x_3, x_3) + 4 \mathrm{hi}(x_4, x_1) + 4 \mathrm{hi}(x_3, x_2) \\\\ -z_7 &= 2 \mathrm{lo}(x_4, x_3) + 4 \mathrm{hi}(x_4, x_2) + 2 \mathrm{hi}(x_3, x_3) \\\\ -z_8 &= \mathrm{lo}(x_4, x_4) + 4 \mathrm{hi}(x_4, x_3) \\\\ -z_9 &= 0 + 2 \mathrm{hi}(x_4, x_4) \\\\ -\end{aligned} -\\] -To implement these, we group terms by their coefficient, computing -those with coefficient \\(2\\) on set of IFMA chains, and on another -set of chains, we begin with coefficient-\\(4\\) terms, then shift -left before continuing with the coefficient-\\(1\\) terms. -The reduction strategy is the same as for multiplication. - -# Future improvements - -LLVM won't use blend operations on [256-bit vectors yet][llvm_blend], -so there's a bunch of blend instructions that could be omitted. - -Although the multiplications and squarings are much faster, there's no -speedup to the additions and subtractions, so there are diminishing -returns. In fact, the complications in the doubling formulas mean -that doubling is actually slower than readdition. This also suggests -that moving to 512-bit vectors won't be much help for a strategy aimed -at parallelism within a group operation, so to extract performance -gains from 512-bit vectors it will probably be necessary to create a -parallel-friendly multiscalar multiplication algorithm. This could -also help with reducing shuffle pressure. - -The squaring implementation could probably be optimized, but without -`perf` support on Cannonlake it's difficult to make actual -measurements. - -Another improvement would be to implement vectorized square root -computations, which would allow creating an iterator adaptor for point -decompression that bunched decompression operations and executed them -in parallel. This would accelerate batch verification. - -[2016_gueron_krasnov]: https://ieeexplore.ieee.org/document/7563269 -[2018_drucker_gueron]: https://eprint.iacr.org/2018/335 -[1999_walter]: https://pdfs.semanticscholar.org/0e6a/3e8f30b63b556679f5dff2cbfdfe9523f4fa.pdf -[ed25519_paper]: https://ed25519.cr.yp.to/ed25519-20110926.pdf -[llvm_blend]: https://bugs.llvm.org/show_bug.cgi?id=38343 diff --git a/docs/parallel-formulas.md b/docs/parallel-formulas.md deleted file mode 100644 index f84d1ccd..00000000 --- a/docs/parallel-formulas.md +++ /dev/null @@ -1,333 +0,0 @@ -Vectorized implementations of field and point operations, using a -modification of the 4-way parallel formulas of Hisil, Wong, Carter, -and Dawson. - -These notes explain the parallel formulas and our strategy for using -them with SIMD operations. There are two backend implementations: one -using AVX2, and the other using AVX512-IFMA. - -# Overview - -The 2008 paper [_Twisted Edwards Curves Revisited_][hwcd08] by Hisil, -Wong, Carter, and Dawson (HWCD) introduced the “extended coordinates” -and mixed-model representations which are used by most Edwards curve -implementations. - -However, they also describe 4-way parallel formulas for point addition -and doubling: a unified addition algorithm taking an effective -\\(2\mathbf M + 1\mathbf D\\), a doubling algorithm taking an -effective \\(1\mathbf M + 1\mathbf S\\), and a dedicated (i.e., for -distinct points) addition algorithm taking an effective \\(2 \mathbf M -\\). They compare these formulas with a 2-way parallel variant of the -Montgomery ladder. - -Unlike their serial formulas, which are used widely, their parallel -formulas do not seem to have been implemented in software before. The -2-way parallel Montgomery ladder was used in 2015 by Tung Chou's -`sandy2x` implementation. Curiously, however, although the [`sandy2x` -paper][sandy2x] also implements Edwards arithmetic, and cites HWCD08, -it doesn't mention their parallel Edwards formulas. -A 2015 paper by Hernández and López describes an AVX2 implementation -of X25519. Neither the paper nor the code are publicly available, but -it apparently gives only a [slight speedup][avx2trac], suggesting that -it uses a 4-way parallel Montgomery ladder rather than parallel -Edwards formulas. - -The reason may be that HWCD08 describe their formulas as operating on -four independent processors, which would make a software -implementation impractical: all of the operations are too low-latency -to effectively synchronize. But a closer inspection reveals that the -(more expensive) multiplication and squaring steps are uniform, while -the instruction divergence occurs in the (much cheaper) addition and -subtraction steps. This means that a SIMD implementation can perform -the expensive steps uniformly, and handle divergence in the -inexpensive steps using masking. - -These notes describe modifications to the original parallel formulas -to allow a SIMD implementation, and this module contains -implementations of the modified formulas targeting either AVX2 or -AVX512-IFMA. - -# Parallel formulas in HWCD'08 - -The doubling formula is presented in the HWCD paper as follows: - -| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 | -|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------| -| | idle | idle | idle | \\( R\_1 \gets X\_1 + Y\_1 \\) | -| \\(1\mathbf S\\) | \\( R\_2 \gets X\_1\^2 \\) | \\( R\_3 \gets Y\_1\^2 \\) | \\( R\_4 \gets Z\_1\^2 \\) | \\( R\_5 \gets R\_1\^2 \\) | -| | \\( R\_6 \gets R\_2 + R\_3 \\) | \\( R\_7 \gets R\_2 - R\_3 \\) | \\( R\_4 \gets 2 R\_4 \\) | idle | -| | idle | \\( R\_1 \gets R\_4 + R\_7 \\) | idle | \\( R\_2 \gets R\_6 - R\_5 \\) | -| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_6 R\_7 \\) | \\( T\_3 \gets R\_2 R\_6 \\) | \\( Z\_3 \gets R\_1 R\_7 \\) | - -and the unified addition algorithm is presented as follows: - -| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 | -|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------| -| | \\( R\_1 \gets Y\_1 - X\_1 \\) | \\( R\_2 \gets Y\_2 - X\_2 \\) | \\( R\_3 \gets Y\_1 + X\_1 \\) | \\( R\_4 \gets Y\_2 + X\_2 \\) | -| \\(1\mathbf M\\) | \\( R\_5 \gets R\_1 R\_2 \\) | \\( R\_6 \gets R\_3 R\_4 \\) | \\( R\_7 \gets T\_1 T\_2 \\) | \\( R\_8 \gets Z\_1 Z\_2 \\) | -| \\(1\mathbf D\\) | idle | idle | \\( R\_7 \gets k R\_7 \\) | \\( R\_8 \gets 2 R\_8 \\) | -| | \\( R\_1 \gets R\_6 - R\_5 \\) | \\( R\_2 \gets R\_8 - R\_7 \\) | \\( R\_3 \gets R\_8 + R\_7 \\) | \\( R\_4 \gets R\_6 + R\_5 \\) | -| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_3 R\_4 \\) | \\( T\_3 \gets R\_1 R\_4 \\) | \\( Z\_3 \gets R\_2 R\_3 \\) | - -Here \\(\mathbf M\\) and \\(\mathbf S\\) represent the cost of -multiplication and squaring of generic field elements, \\(\mathbf D\\) -represents the cost of multiplication by a curve constant (in this -case \\( k = 2d \\)). - -Notice that the \\(1\mathbf M\\) and \\(1\mathbf S\\) steps are -uniform. The non-uniform steps are all inexpensive additions or -subtractions, with the exception of the multiplication by the curve -constant \\(k = 2d\\): -$$ -R\_7 \gets 2 d R\_7. -$$ - -HWCD suggest parallelising this step by breaking \\(k = 2d\\) into four -parts as \\(k = k_0 + 2\^n k_1 + 2\^{2n} k_2 + 2\^{3n} k_3 \\) and -computing \\(k_i R_7 \\) in parallel. This is quite awkward, but if -the curve constant is a ratio \\( d = d\_1/d\_2 \\), then projective -coordinates allow us to instead compute -$$ -(R\_5, R\_6, R\_7, R\_8) \gets (d\_2 R\_5, d\_2 R\_6, 2d\_1 R\_7, d\_2 R\_8). -$$ -This can be performed as a uniform multiplication by a vector of -constants, and if \\(d\_1, d\_2\\) are small, it is relatively -inexpensive. (This trick was suggested by Mike Hamburg). -In the Curve25519 case, we have -$$ -d = \frac{d\_1}{d\_2} = \frac{-121665}{121666}; -$$ -Since \\(2 \cdot 121666 < 2\^{18}\\), all the constants above fit (up -to sign) in 32 bits, so this can be done in parallel as four -multiplications by small constants \\( (121666, 121666, 2\cdot 121665, -2\cdot 121666) \\), followed by a negation to compute \\( - 2\cdot 121665\\). - -# Modified parallel formulas - -Using the modifications sketched above, we can write SIMD-friendly -versions of the parallel formulas as follows. To avoid confusion with -the original formulas, temporary variables are named \\(S\\) instead -of \\(R\\) and are in static single-assignment form. - -## Addition - -To add points -\\(P_1 = (X_1 : Y_1 : Z_1 : T_1) \\) -and -\\(P_2 = (X_2 : Y_2 : Z_2 : T_2 ) \\), -we compute -$$ -\begin{aligned} -(S\_0 &&,&& S\_1 &&,&& S\_2 &&,&& S\_3 ) -&\gets -(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Y\_2 - X\_2&&,&& Y\_2 + X\_2) -\\\\ -(S\_4 &&,&& S\_5 &&,&& S\_6 &&,&& S\_7 ) -&\gets -(S\_0 \cdot S\_2&&,&& S\_1 \cdot S\_3&&,&& Z\_1 \cdot Z\_2&&,&& T\_1 \cdot T\_2) -\\\\ -(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} ) -&\gets -(d\_2 \cdot S\_4 &&,&& d\_2 \cdot S\_5 &&,&& 2 d\_2 \cdot S\_6 &&,&& 2 d\_1 \cdot S\_7 ) -\\\\ -(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15}) -&\gets -(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11}) -\\\\ -(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3) -&\gets -(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13}) -\end{aligned} -$$ -to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = P\_1 + P\_2 \\). -This costs \\( 2\mathbf M + 1 \mathbf D\\). - -## Readdition - -If the point \\( P_2 = (X\_2 : Y\_2 : Z\_2 : T\_2) \\) is fixed, we -can cache the multiplication of the curve constants by computing -$$ -\begin{aligned} -(S\_2' &&,&& S\_3' &&,&& Z\_2' &&,&& T\_2' ) -&\gets -(d\_2 \cdot (Y\_2 - X\_2)&&,&& d\_2 \cdot (Y\_1 + X\_1)&&,&& 2d\_2 \cdot Z\_2 &&,&& 2d\_1 \cdot T\_2). -\end{aligned} -$$ -This costs \\( 1\mathbf D\\); with \\( (S\_2', S\_3', Z\_2', T\_2')\\) -in hand, the addition formulas above become -$$ -\begin{aligned} -(S\_0 &&,&& S\_1 &&,&& Z\_1 &&,&& T\_1 ) -&\gets -(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Z\_1 &&,&& T\_1) -\\\\ -(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} ) -&\gets -(S\_0 \cdot S\_2' &&,&& S\_1 \cdot S\_3'&&,&& Z\_1 \cdot Z\_2' &&,&& T\_1 \cdot T\_2') -\\\\ -(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15}) -&\gets -(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11}) -\\\\ -(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3) -&\gets -(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13}) -\end{aligned} -$$ -which costs only \\( 2\mathbf M \\). This precomputation is -essentially similar to the precomputation that HWCD suggest for their -serial formulas. Because the cost of precomputation and then -readdition is the same as addition, it's sufficient to only -implement caching and readdition. - -## Doubling - -The non-uniform portions of the (re)addition formulas have a fairly -regular structure. Unfortunately, this is not the case for the -doubling formulas, which are much less nice. - -To double a point \\( P = (X\_1 : Y\_1 : Z\_1 : T\_1) \\), we compute -$$ -\begin{aligned} -(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& S\_0) -&\gets -(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& X\_1 + Y\_1) -\\\\ -(S\_1 &&,&& S\_2 &&,&& S\_3 &&,&& S\_4 ) -&\gets -(X\_1\^2 &&,&& Y\_1\^2&&,&& Z\_1\^2 &&,&& S\_0\^2) -\\\\ -(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 ) -&\gets -(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4) -\\\\ -(X\_3 &&,&& Y\_3 &&,&& Z\_3 &&,&& T\_3 ) -&\gets -(S\_8 \cdot S\_9 &&,&& S\_5 \cdot S\_6 &&,&& S\_8 \cdot S\_6 &&,&& S\_5 \cdot S\_9) -\end{aligned} -$$ -to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = [2]P\_1 \\). - -The intermediate step between the squaring and multiplication requires -a long chain of additions. For the IFMA-based implementation, this is not a problem; for the AVX2-based implementation, it is, but with some care and finesse, it's possible to arrange the computation without requiring an intermediate reduction. - -# Implementation - -These formulas aren't specific to a particular representation of field -element vectors, whose optimum choice is determined by the details of -the instruction set. However, it's not possible to perfectly separate -the implementation of the field element vectors from the -implementation of the point operations. Instead, the [`avx2`] and -[`ifma`] backends provide `ExtendedPoint` and `CachedPoint` types, and -the [`scalar_mul`] code uses one of the backend types by a type alias. - -# Comparison to non-vectorized formulas - -In theory, the parallel Edwards formulas seem to allow a \\(4\\)-way -speedup from parallelism. However, an actual vectorized -implementation has several slowdowns that cut into this speedup. - -First, the parallel formulas can only use the available vector -multiplier. For AVX2, this is a \\( 32 \times 32 \rightarrow 64 -\\)-bit integer multiplier, so the speedup from vectorization must -overcome the disadvantage of losing the \\( 64 \times 64 \rightarrow -128\\)-bit (serial) integer multiplier. The effect of this slowdown -is microarchitecture-dependent, since it requires accounting for the -total number of multiplications and additions and their relative -costs. IFMA allows using a \\( 52 \times 52 \rightarrow 104 \\)-bit -multiplier, but the high and low halves need to be computed -separately, and the reduction requires extra work because it's not -possible to pre-multiply by \\(19\\). - -Second, the parallel doubling formulas incur both a theoretical and -practical slowdown. The parallel formulas described above work on the -\\( \mathbb P\^3 \\) “extended” coordinates. The \\( \mathbb P\^2 \\) -model introduced earlier by [Bernstein, Birkner, Joye, Lange, and -Peters][bbjlp08] allows slightly faster doublings, so HWCD suggest -mixing coordinate systems while performing scalar multiplication -(attributing the idea to [a 1998 paper][cmo98] by Cohen, Miyagi, and -Ono). The \\( T \\) coordinate is not required for doublings, so when -doublings are followed by doublings, its computation can be skipped. -More details on this approach and the different coordinate systems can -be found in the [`curve_models` module documentation][curve_models]. - -Unfortunately, this optimization is not compatible with the parallel -formulas, which cannot save time by skipping a single variable, so the -parallel doubling formulas do slightly more work when counting the -total number of field multiplications and squarings. - -In addition, the parallel doubling formulas have a less regular -pattern of additions and subtractions than the parallel addition -formulas, so the vectorization overhead is proportionately greater. -Both the parallel addition and parallel doubling formulas also require -some shuffling to rearrange data within the vectors, which places more -pressure on the shuffle unit than is desirable. - -This means that the speedup from using a vectorized implementation of -parallel Edwards formulas is likely to be greatest in applications -that do fewer doublings and more additions (like a large multiscalar -multiplication) rather than applications that do fewer additions and -more doublings (like a double-base scalar multiplication). - -Third, Amdahl's law says that the speedup is limited to the portion -which can be parallelized. Normally, the field multiplications -dominate the cost of point operations, but with the IFMA backend, the -multiplications are so fast that the non-parallel additions end up as -a significant portion of the total time. - -Fourth, current Intel CPUs perform thermal throttling when using wide -vector instructions. A detailed description can be found in §15.26 of -[the Intel Optimization Manual][intel], but using wide vector -instructions prevents the core from operating at higher frequencies. -The core can return to the higher-frequency state after 2 -milliseconds, but this timer is reset every time high-power -instructions are used. - -Any speedup from vectorization therefore has to be weighed against a -slowdown for the next few million instructions. For a mixed workload, -where point operations are interspersed with other tasks, this can -reduce overall performance. This implementation is therefore probably -not suitable for basic applications, like signatures, but is -worthwhile for complex applications, like zero-knowledge proofs, which -do sustained work. - -# Future work - -There are several directions for future improvement: - -* Using the vectorized field arithmetic code to parallelize across - point operations rather than within a single point operation. This - is less flexible, but would give a speedup both from allowing use of - the faster mixed-model arithmetic and from reducing shuffle - pressure. One approach in this direction would be to implement - batched scalar-point operations using vectors of points (AoSoA - layout). This less generally useful but would give a speedup for - Bulletproofs. - -* Extending the IFMA implementation to use the full width of AVX512, - either handling the extra parallelism internally to a single point - operation (by using a 2-way parallel implementation of field - arithmetic instead of a wordsliced one), or externally, - parallelizing across point operations. Internal parallelism would - be preferable but might require too much shuffle pressure. For now, - the only available CPU which runs IFMA operations executes them at - 256-bits wide anyways, so this isn't yet important. - -* Generalizing the implementation to NEON instructions. The current - point arithmetic code is written in terms of field element vectors, - which are in turn implemented using platform SIMD vectors. It - should be possible to write an alternate implementation of the - `FieldElement2625x4` using NEON without changing the point - arithmetic. NEON has 128-bit vectors rather than 256-bit vectors, - but this may still be worthwhile compared to a serial - implementation. - - -[sandy2x]: https://eprint.iacr.org/2015/943.pdf -[avx2trac]: https://trac.torproject.org/projects/tor/ticket/8897#comment:28 -[hwcd08]: https://www.iacr.org/archive/asiacrypt2008/53500329/53500329.pdf -[curve_models]: https://doc-internal.dalek.rs/curve25519_dalek/backend/serial/curve_models/index.html -[bbjlp08]: https://eprint.iacr.org/2008/013 -[cmo98]: https://link.springer.com/content/pdf/10.1007%2F3-540-49649-1_6.pdf -[intel]: https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf diff --git a/src/constants.rs b/src/constants.rs index 90bf38ac..0d932a3f 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -16,8 +16,8 @@ //! scope using a `let` binding: //! //! ``` -//! use curve25519_dalek::constants; -//! use curve25519_dalek::traits::IsIdentity; +//! use noah_curve25519_dalek::constants; +//! use noah_curve25519_dalek::traits::IsIdentity; //! //! let B = &constants::RISTRETTO_BASEPOINT_TABLE; //! let l = &constants::BASEPOINT_ORDER; diff --git a/src/edwards.rs b/src/edwards.rs index 8583ea05..c5176987 100644 --- a/src/edwards.rs +++ b/src/edwards.rs @@ -1161,7 +1161,7 @@ impl EdwardsPoint { /// # Example /// /// ``` - /// use curve25519_dalek::constants; + /// use noah_curve25519_dalek::constants; /// /// // Generator of the prime-order subgroup /// let P = constants::ED25519_BASEPOINT_POINT; @@ -1191,7 +1191,7 @@ impl EdwardsPoint { /// # Example /// /// ``` - /// use curve25519_dalek::constants; + /// use noah_curve25519_dalek::constants; /// /// // Generator of the prime-order subgroup /// let P = constants::ED25519_BASEPOINT_POINT; diff --git a/src/field.rs b/src/field.rs index 95ea3ae1..385f899f 100644 --- a/src/field.rs +++ b/src/field.rs @@ -150,7 +150,7 @@ impl FieldElement { /// Given a slice of public `FieldElements`, replace each with its inverse. /// - /// All input `FieldElements` **MUST** be nonzero. + /// When an input `FieldElement` is zero, its value is unchanged. #[cfg(feature = "alloc")] pub fn batch_invert(inputs: &mut [FieldElement]) { // Montgomery’s Trick and Fast Implementation of Masked AES @@ -167,10 +167,11 @@ impl FieldElement { // products in the scratch space for (input, scratch) in inputs.iter().zip(scratch.iter_mut()) { *scratch = acc; - acc = &acc * input; + // acc <- acc * input, but skipping zeros (constant-time) + acc.conditional_assign(&(&acc * input), !input.is_zero()); } - // acc is nonzero iff all inputs are nonzero + // acc is nonzero because we skipped zeros in inputs assert_eq!(acc.is_zero().unwrap_u8(), 0); // Compute the inverse of all products @@ -180,8 +181,11 @@ impl FieldElement { // in place for (input, scratch) in inputs.iter_mut().rev().zip(scratch.into_iter().rev()) { let tmp = &acc * input; - *input = &acc * &scratch; - acc = tmp; + // input <- acc * scratch, then acc <- tmp + // Again, we skip zeros in a constant-time way + let nz = !input.is_zero(); + input.conditional_assign(&(&acc * &scratch), nz); + acc.conditional_assign(&tmp, nz); } } @@ -362,11 +366,12 @@ mod test { let ap58 = FieldElement::from_bytes(&AP58_BYTES); let asq = FieldElement::from_bytes(&ASQ_BYTES); let ainv = FieldElement::from_bytes(&AINV_BYTES); + let a0 = &a - &a; let a2 = &a + &a; - let a_list = vec![a, ap58, asq, ainv, a2]; + let a_list = vec![a, ap58, asq, ainv, a0, a2]; let mut ainv_list = a_list.clone(); FieldElement::batch_invert(&mut ainv_list[..]); - for i in 0..5 { + for i in 0..6 { assert_eq!(a_list[i].invert(), ainv_list[i]); } } diff --git a/src/lib.rs b/src/lib.rs index 5fa30c18..dd7ad5cc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,234 +15,13 @@ #![cfg_attr(feature = "simd_backend", feature(stdsimd))] // Refuse to compile if documentation is missing. #![deny(missing_docs)] -#![doc(html_logo_url = "https://doc.dalek.rs/assets/dalek-logo-clear.png")] -#![doc(html_root_url = "https://docs.rs/curve25519-dalek/3.2.0")] -//! # curve25519-dalek [![](https://img.shields.io/crates/v/curve25519-dalek.svg)](https://crates.io/crates/curve25519-dalek) [![](https://img.shields.io/badge/dynamic/json.svg?label=docs&uri=https%3A%2F%2Fcrates.io%2Fapi%2Fv1%2Fcrates%2Fcurve25519-dalek%2Fversions&query=%24.versions%5B0%5D.num&colorB=4F74A6)](https://doc.dalek.rs) [![](https://travis-ci.org/dalek-cryptography/curve25519-dalek.svg?branch=master)](https://travis-ci.org/dalek-cryptography/curve25519-dalek) -//! -//! +//! # curve25519-dalek //! //! **A pure-Rust implementation of group operations on Ristretto and Curve25519.** //! //! `curve25519-dalek` is a library providing group operations on the Edwards and //! Montgomery forms of Curve25519, and on the prime-order Ristretto group. -//! -//! `curve25519-dalek` is not intended to provide implementations of any particular -//! crypto protocol. Rather, implementations of those protocols (such as -//! [`x25519-dalek`][x25519-dalek] and [`ed25519-dalek`][ed25519-dalek]) should use -//! `curve25519-dalek` as a library. -//! -//! `curve25519-dalek` is intended to provide a clean and safe _mid-level_ API for use -//! implementing a wide range of ECC-based crypto protocols, such as key agreement, -//! signatures, anonymous credentials, rangeproofs, and zero-knowledge proof -//! systems. -//! -//! In particular, `curve25519-dalek` implements Ristretto, which constructs a -//! prime-order group from a non-prime-order Edwards curve. This provides the -//! speed and safety benefits of Edwards curve arithmetic, without the pitfalls of -//! cofactor-related abstraction mismatches. -//! -//! # Documentation -//! -//! The semver-stable, public-facing `curve25519-dalek` API is documented -//! [here][docs-external]. In addition, the unstable internal implementation -//! details are documented [here][docs-internal]. -//! -//! The `curve25519-dalek` documentation requires a custom HTML header to include -//! KaTeX for math support. Unfortunately `cargo doc` does not currently support -//! this, but docs can be built using -//! ```sh -//! make doc -//! make doc-internal -//! ``` -//! -//! # Use -//! -//! To import `curve25519-dalek`, add the following to the dependencies section of -//! your project's `Cargo.toml`: -//! ```toml -//! curve25519-dalek = "3" -//! ``` -//! -//! The sole breaking change in the `3.x` series was an update to the `digest` -//! version, and in terms of non-breaking changes it includes: -//! -//! * support for using `alloc` instead of `std` on stable Rust, -//! * the Elligator2 encoding for Edwards points, -//! * a fix to use `packed_simd2`, -//! * various documentation fixes and improvements, -//! * support for configurably-sized, precomputed lookup tables for basepoint scalar -//! multiplication, -//! * two new formally-verified field arithmetic backends which use the Fiat Crypto -//! Rust code, which is generated from proofs of functional correctness checked by -//! the Coq theorem proving system, and -//! * support for explicitly calling the `zeroize` traits for all point types. -//! -//! The `2.x` series has API almost entirely unchanged from the `1.x` series, -//! except that: -//! -//! * an error in the data modeling for the (optional) `serde` feature was -//! corrected, so that when the `2.x`-series `serde` implementation is used -//! with `serde-bincode`, the derived serialization matches the usual X/Ed25519 -//! formats; -//! * the `rand` version was updated. -//! -//! See `CHANGELOG.md` for more details. -//! -//! # Backends and Features -//! -//! The `nightly` feature enables features available only when using a Rust nightly -//! compiler. In particular, it is required for rendering documentation and for -//! the SIMD backends. -//! -//! Curve arithmetic is implemented using one of the following backends: -//! -//! * a `u32` backend using serial formulas and `u64` products; -//! * a `u64` backend using serial formulas and `u128` products; -//! * an `avx2` backend using [parallel formulas][parallel_doc] and `avx2` instructions (sets speed records); -//! * an `ifma` backend using [parallel formulas][parallel_doc] and `ifma` instructions (sets speed records); -//! -//! By default the `u64` backend is selected. To select a specific backend, use: -//! ```sh -//! cargo build --no-default-features --features "std u32_backend" -//! cargo build --no-default-features --features "std u64_backend" -//! # Requires nightly, RUSTFLAGS="-C target_feature=+avx2" to use avx2 -//! cargo build --no-default-features --features "std simd_backend" -//! # Requires nightly, RUSTFLAGS="-C target_feature=+avx512ifma" to use ifma -//! cargo build --no-default-features --features "std simd_backend" -//! ``` -//! Crates using `curve25519-dalek` can either select a backend on behalf of their -//! users, or expose feature flags that control the `curve25519-dalek` backend. -//! -//! The `std` feature is enabled by default, but it can be disabled for no-`std` -//! builds using `--no-default-features`. Note that this requires explicitly -//! selecting an arithmetic backend using one of the `_backend` features. -//! If no backend is selected, compilation will fail. -//! -//! # Safety -//! -//! The `curve25519-dalek` types are designed to make illegal states -//! unrepresentable. For example, any instance of an `EdwardsPoint` is -//! guaranteed to hold a point on the Edwards curve, and any instance of a -//! `RistrettoPoint` is guaranteed to hold a valid point in the Ristretto -//! group. -//! -//! All operations are implemented using constant-time logic (no -//! secret-dependent branches, no secret-dependent memory accesses), -//! unless specifically marked as being variable-time code. -//! We believe that our constant-time logic is lowered to constant-time -//! assembly, at least on `x86_64` targets. -//! -//! As an additional guard against possible future compiler optimizations, -//! the `subtle` crate places an optimization barrier before every -//! conditional move or assignment. More details can be found in [the -//! documentation for the `subtle` crate][subtle_doc]. -//! -//! Some functionality (e.g., multiscalar multiplication or batch -//! inversion) requires heap allocation for temporary buffers. All -//! heap-allocated buffers of potentially secret data are explicitly -//! zeroed before release. -//! -//! However, we do not attempt to zero stack data, for two reasons. -//! First, it's not possible to do so correctly: we don't have control -//! over stack allocations, so there's no way to know how much data to -//! wipe. Second, because `curve25519-dalek` provides a mid-level API, -//! the correct place to start zeroing stack data is likely not at the -//! entrypoints of `curve25519-dalek` functions, but at the entrypoints of -//! functions in other crates. -//! -//! The implementation is memory-safe, and contains no significant -//! `unsafe` code. The SIMD backend uses `unsafe` internally to call SIMD -//! intrinsics. These are marked `unsafe` only because invoking them on an -//! inappropriate CPU would cause `SIGILL`, but the entire backend is only -//! compiled with appropriate `target_feature`s, so this cannot occur. -//! -//! # Performance -//! -//! Benchmarks are run using [`criterion.rs`][criterion]: -//! -//! ```sh -//! cargo bench --no-default-features --features "std u32_backend" -//! cargo bench --no-default-features --features "std u64_backend" -//! # Uses avx2 or ifma only if compiled for an appropriate target. -//! export RUSTFLAGS="-C target_cpu=native" -//! cargo bench --no-default-features --features "std simd_backend" -//! ``` -//! -//! Performance is a secondary goal behind correctness, safety, and -//! clarity, but we aim to be competitive with other implementations. -//! -//! # FFI -//! -//! Unfortunately, we have no plans to add FFI to `curve25519-dalek` directly. The -//! reason is that we use Rust features to provide an API that maintains safety -//! invariants, which are not possible to maintain across an FFI boundary. For -//! instance, as described in the _Safety_ section above, invalid points are -//! impossible to construct, and this would not be the case if we exposed point -//! operations over FFI. -//! -//! However, `curve25519-dalek` is designed as a *mid-level* API, aimed at -//! implementing other, higher-level primitives. Instead of providing FFI at the -//! mid-level, our suggestion is to implement the higher-level primitive (a -//! signature, PAKE, ZKP, etc) in Rust, using `curve25519-dalek` as a dependency, -//! and have that crate provide a minimal, byte-buffer-oriented FFI specific to -//! that primitive. -//! -//! # Contributing -//! -//! Please see [CONTRIBUTING.md][contributing]. -//! -//! Patches and pull requests should be make against the `develop` -//! branch, **not** `master`. -//! -//! # About -//! -//! **SPOILER ALERT:** *The Twelfth Doctor's first encounter with the Daleks is in -//! his second full episode, "Into the Dalek". A beleaguered ship of the "Combined -//! Galactic Resistance" has discovered a broken Dalek that has turned "good", -//! desiring to kill all other Daleks. The Doctor, Clara and a team of soldiers -//! are miniaturized and enter the Dalek, which the Doctor names Rusty. They -//! repair the damage, but accidentally restore it to its original nature, causing -//! it to go on the rampage and alert the Dalek fleet to the whereabouts of the -//! rebel ship. However, the Doctor manages to return Rusty to its previous state -//! by linking his mind with the Dalek's: Rusty shares the Doctor's view of the -//! universe's beauty, but also his deep hatred of the Daleks. Rusty destroys the -//! other Daleks and departs the ship, determined to track down and bring an end -//! to the Dalek race.* -//! -//! `curve25519-dalek` is authored by Isis Agora Lovecruft and Henry de Valence. -//! -//! Portions of this library were originally a port of [Adam Langley's -//! Golang ed25519 library](https://!github.com/agl/ed25519), which was in -//! turn a port of the reference `ref10` implementation. Most of this code, -//! including the 32-bit field arithmetic, has since been rewritten. -//! -//! The fast `u32` and `u64` scalar arithmetic was implemented by Andrew Moon, and -//! the addition chain for scalar inversion was provided by Brian Smith. The -//! optimised batch inversion was contributed by Sean Bowe and Daira Hopwood. -//! -//! The `no_std` and `zeroize` support was contributed by Tony Arcieri. -//! -//! The formally verified backends, `fiat_u32_backend` and `fiat_u64_backend`, which -//! integrate with the Rust generated by the -//! [Fiat Crypto project](https://github.com/mit-plv/fiat-crypto) were contributed -//! by François Garillot. -//! -//! Thanks also to Ashley Hauck, Lucas Salibian, Manish Goregaokar, Jack Grigg, -//! Pratyush Mishra, Michael Rosenberg, and countless others for their -//! contributions. -//! -//! [ed25519-dalek]: https://github.com/dalek-cryptography/ed25519-dalek -//! [x25519-dalek]: https://github.com/dalek-cryptography/x25519-dalek -//! [contributing]: https://github.com/dalek-cryptography/curve25519-dalek/blob/master/CONTRIBUTING.md -//! [docs-external]: https://doc.dalek.rs/curve25519_dalek/ -//! [docs-internal]: https://doc-internal.dalek.rs/curve25519_dalek/ -//! [criterion]: https://github.com/japaric/criterion.rs -//! [parallel_doc]: https://doc-internal.dalek.rs/curve25519_dalek/backend/vector/avx2/index.html -//! [subtle_doc]: https://doc.dalek.rs/subtle/ //------------------------------------------------------------------------ // External dependencies: diff --git a/src/ristretto.rs b/src/ristretto.rs index 4f2b48fc..405dc676 100644 --- a/src/ristretto.rs +++ b/src/ristretto.rs @@ -503,8 +503,8 @@ impl RistrettoPoint { /// in a batch. /// /// ``` - /// # extern crate curve25519_dalek; - /// # use curve25519_dalek::ristretto::RistrettoPoint; + /// # extern crate noah_curve25519_dalek; + /// # use noah_curve25519_dalek::ristretto::RistrettoPoint; /// extern crate rand_core; /// use rand_core::OsRng; /// @@ -701,8 +701,8 @@ impl RistrettoPoint { /// # Example /// /// ``` - /// # extern crate curve25519_dalek; - /// # use curve25519_dalek::ristretto::RistrettoPoint; + /// # extern crate noah_curve25519_dalek; + /// # use noah_curve25519_dalek::ristretto::RistrettoPoint; /// extern crate sha2; /// use sha2::Sha512; /// @@ -1019,8 +1019,8 @@ impl RistrettoPoint { /// A precomputed table of multiples of the Ristretto basepoint is /// available in the `constants` module: /// ``` -/// use curve25519_dalek::constants; -/// use curve25519_dalek::scalar::Scalar; +/// use noah_curve25519_dalek::constants; +/// use noah_curve25519_dalek::scalar::Scalar; /// /// let a = Scalar::from(87329482u64); /// let P = &a * &constants::RISTRETTO_BASEPOINT_TABLE; @@ -1067,14 +1067,14 @@ impl ConditionallySelectable for RistrettoPoint { /// /// ``` /// # extern crate subtle; - /// # extern crate curve25519_dalek; + /// # extern crate noah_curve25519_dalek; /// # /// use subtle::ConditionallySelectable; /// use subtle::Choice; /// # - /// # use curve25519_dalek::traits::Identity; - /// # use curve25519_dalek::ristretto::RistrettoPoint; - /// # use curve25519_dalek::constants; + /// # use noah_curve25519_dalek::traits::Identity; + /// # use noah_curve25519_dalek::ristretto::RistrettoPoint; + /// # use noah_curve25519_dalek::constants; /// # fn main() { /// /// let A = RistrettoPoint::identity(); @@ -1510,9 +1510,10 @@ mod test { fn double_and_compress_1024_random_points() { let mut rng = OsRng; - let points: Vec = (0..1024) + let mut points: Vec = (0..1024) .map(|_| RistrettoPoint::random(&mut rng)) .collect(); + points[500] = RistrettoPoint::identity(); let compressed = RistrettoPoint::double_and_compress_batch(&points); diff --git a/src/scalar.rs b/src/scalar.rs index a09b6f6f..3983d8f1 100644 --- a/src/scalar.rs +++ b/src/scalar.rs @@ -34,7 +34,7 @@ //! `Some(Scalar)` in return: //! //! ``` -//! use curve25519_dalek::scalar::Scalar; +//! use noah_curve25519_dalek::scalar::Scalar; //! //! let one_as_bytes: [u8; 32] = Scalar::one().to_bytes(); //! let a: Option = Scalar::from_canonical_bytes(one_as_bytes); @@ -46,7 +46,7 @@ //! (in this case, \\( \ell + 2 \\)), we'll get `None` back: //! //! ``` -//! use curve25519_dalek::scalar::Scalar; +//! use noah_curve25519_dalek::scalar::Scalar; //! //! let l_plus_two_bytes: [u8; 32] = [ //! 0xef, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, @@ -66,7 +66,7 @@ //! resultant scalar \\( \mod \ell \\), producing \\( 2 \\): //! //! ``` -//! use curve25519_dalek::scalar::Scalar; +//! use noah_curve25519_dalek::scalar::Scalar; //! //! let l_plus_two_bytes: [u8; 32] = [ //! 0xef, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, @@ -91,12 +91,12 @@ //! which allows an IUF API. //! //! ``` -//! # extern crate curve25519_dalek; +//! # extern crate noah_curve25519_dalek; //! # extern crate sha2; //! # //! # fn main() { //! use sha2::{Digest, Sha512}; -//! use curve25519_dalek::scalar::Scalar; +//! use noah_curve25519_dalek::scalar::Scalar; //! //! // Hashing a single byte slice //! let a = Scalar::hash_from_bytes::(b"Abolish ICE"); @@ -119,7 +119,7 @@ //! assurances as to reduction modulo the group order: //! //! ``` -//! use curve25519_dalek::scalar::Scalar; +//! use noah_curve25519_dalek::scalar::Scalar; //! //! let l_plus_two_bytes: [u8; 32] = [ //! 0xef, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, @@ -514,7 +514,7 @@ impl From for Scalar { /// # Example /// /// ``` - /// use curve25519_dalek::scalar::Scalar; + /// use noah_curve25519_dalek::scalar::Scalar; /// /// let fourtytwo = Scalar::from(42u64); /// let six = Scalar::from(6u64); @@ -560,10 +560,10 @@ impl Scalar { /// /// ``` /// extern crate rand_core; - /// # extern crate curve25519_dalek; + /// # extern crate noah_curve25519_dalek; /// # /// # fn main() { - /// use curve25519_dalek::scalar::Scalar; + /// use noah_curve25519_dalek::scalar::Scalar; /// /// use rand_core::OsRng; /// @@ -586,8 +586,8 @@ impl Scalar { /// # Example /// /// ``` - /// # extern crate curve25519_dalek; - /// # use curve25519_dalek::scalar::Scalar; + /// # extern crate noah_curve25519_dalek; + /// # use noah_curve25519_dalek::scalar::Scalar; /// extern crate sha2; /// /// use sha2::Sha512; @@ -617,8 +617,8 @@ impl Scalar { /// # Example /// /// ``` - /// # extern crate curve25519_dalek; - /// # use curve25519_dalek::scalar::Scalar; + /// # extern crate noah_curve25519_dalek; + /// # use noah_curve25519_dalek::scalar::Scalar; /// extern crate sha2; /// extern crate digest; /// @@ -658,7 +658,7 @@ impl Scalar { /// # Example /// /// ``` - /// use curve25519_dalek::scalar::Scalar; + /// use noah_curve25519_dalek::scalar::Scalar; /// /// let s: Scalar = Scalar::zero(); /// @@ -673,7 +673,7 @@ impl Scalar { /// # Example /// /// ``` - /// use curve25519_dalek::scalar::Scalar; + /// use noah_curve25519_dalek::scalar::Scalar; /// /// let s: Scalar = Scalar::zero(); /// @@ -713,7 +713,7 @@ impl Scalar { /// # Example /// /// ``` - /// use curve25519_dalek::scalar::Scalar; + /// use noah_curve25519_dalek::scalar::Scalar; /// /// // x = 2238329342913194256032495932344128051776374960164957527413114840482143558222 /// let X: Scalar = Scalar::from_bytes_mod_order([ @@ -757,8 +757,8 @@ impl Scalar { /// # Example /// /// ``` - /// # extern crate curve25519_dalek; - /// # use curve25519_dalek::scalar::Scalar; + /// # extern crate noah_curve25519_dalek; + /// # use noah_curve25519_dalek::scalar::Scalar; /// # fn main() { /// let mut scalars = [ /// Scalar::from(3u64), @@ -1127,9 +1127,9 @@ impl Scalar { /// This is intended for uses like input validation, where variable-time code is acceptable. /// /// ``` - /// # extern crate curve25519_dalek; + /// # extern crate noah_curve25519_dalek; /// # extern crate subtle; - /// # use curve25519_dalek::scalar::Scalar; + /// # use noah_curve25519_dalek::scalar::Scalar; /// # use subtle::ConditionallySelectable; /// # fn main() { /// // 2^255 - 1, since `from_bits` clears the high bit diff --git a/src/traits.rs b/src/traits.rs index d127b3ed..9c43abb4 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -84,10 +84,10 @@ pub trait MultiscalarMul { /// iterators returning either `Scalar`s or `&Scalar`s. /// /// ``` - /// use curve25519_dalek::constants; - /// use curve25519_dalek::traits::MultiscalarMul; - /// use curve25519_dalek::ristretto::RistrettoPoint; - /// use curve25519_dalek::scalar::Scalar; + /// use noah_curve25519_dalek::constants; + /// use noah_curve25519_dalek::traits::MultiscalarMul; + /// use noah_curve25519_dalek::ristretto::RistrettoPoint; + /// use noah_curve25519_dalek::scalar::Scalar; /// /// // Some scalars /// let a = Scalar::from(87329482u64); @@ -136,10 +136,10 @@ pub trait VartimeMultiscalarMul { /// inlining point decompression into the multiscalar call, /// avoiding the need for temporary buffers. /// ``` - /// use curve25519_dalek::constants; - /// use curve25519_dalek::traits::VartimeMultiscalarMul; - /// use curve25519_dalek::ristretto::RistrettoPoint; - /// use curve25519_dalek::scalar::Scalar; + /// use noah_curve25519_dalek::constants; + /// use noah_curve25519_dalek::traits::VartimeMultiscalarMul; + /// use noah_curve25519_dalek::ristretto::RistrettoPoint; + /// use noah_curve25519_dalek::scalar::Scalar; /// /// // Some scalars /// let a = Scalar::from(87329482u64); @@ -199,10 +199,10 @@ pub trait VartimeMultiscalarMul { /// iterators returning either `Scalar`s or `&Scalar`s. /// /// ``` - /// use curve25519_dalek::constants; - /// use curve25519_dalek::traits::VartimeMultiscalarMul; - /// use curve25519_dalek::ristretto::RistrettoPoint; - /// use curve25519_dalek::scalar::Scalar; + /// use noah_curve25519_dalek::constants; + /// use noah_curve25519_dalek::traits::VartimeMultiscalarMul; + /// use noah_curve25519_dalek::ristretto::RistrettoPoint; + /// use noah_curve25519_dalek::scalar::Scalar; /// /// // Some scalars /// let a = Scalar::from(87329482u64); diff --git a/vendor/ristretto.sage b/vendor/ristretto.sage deleted file mode 100644 index 04cf4f92..00000000 --- a/vendor/ristretto.sage +++ /dev/null @@ -1,857 +0,0 @@ -import binascii -class InvalidEncodingException(Exception): pass -class NotOnCurveException(Exception): pass -class SpecException(Exception): pass - -def lobit(x): return int(x) & 1 -def hibit(x): return lobit(2*x) -def negative(x): return lobit(x) -def enc_le(x,n): return bytearray([int(x)>>(8*i) & 0xFF for i in xrange(n)]) -def dec_le(x): return sum(b<<(8*i) for i,b in enumerate(x)) -def randombytes(n): return bytearray([randint(0,255) for _ in range(n)]) - -def optimized_version_of(spec): - """Decorator: This function is an optimized version of some specification""" - def decorator(f): - def wrapper(self,*args,**kwargs): - def pr(x): - if isinstance(x,bytearray): return binascii.hexlify(x) - else: return str(x) - try: spec_ans = getattr(self,spec,spec)(*args,**kwargs),None - except Exception as e: spec_ans = None,e - try: opt_ans = f(self,*args,**kwargs),None - except Exception as e: opt_ans = None,e - if spec_ans[1] is None and opt_ans[1] is not None: - raise - #raise SpecException("Mismatch in %s: spec returned %s but opt threw %s" - # % (f.__name__,str(spec_ans[0]),str(opt_ans[1]))) - if spec_ans[1] is not None and opt_ans[1] is None: - raise - #raise SpecException("Mismatch in %s: spec threw %s but opt returned %s" - # % (f.__name__,str(spec_ans[1]),str(opt_ans[0]))) - if spec_ans[0] != opt_ans[0]: - raise SpecException("Mismatch in %s: %s != %s" - % (f.__name__,pr(spec_ans[0]),pr(opt_ans[0]))) - if opt_ans[1] is not None: raise - else: return opt_ans[0] - wrapper.__name__ = f.__name__ - return wrapper - return decorator - -def xsqrt(x,exn=InvalidEncodingException("Not on curve")): - """Return sqrt(x)""" - if not is_square(x): raise exn - s = sqrt(x) - if negative(s): s=-s - return s - -def isqrt(x,exn=InvalidEncodingException("Not on curve")): - """Return 1/sqrt(x)""" - if x==0: return 0 - if not is_square(x): raise exn - s = sqrt(x) - #if negative(s): s=-s - return 1/s - -def inv0(x): return 1/x if x != 0 else 0 - -def isqrt_i(x): - """Return 1/sqrt(x) or 1/sqrt(zeta * x)""" - if x==0: return True,0 - gen = x.parent(-1) - while is_square(gen): gen = sqrt(gen) - if is_square(x): return True,1/sqrt(x) - else: return False,1/sqrt(x*gen) - -class QuotientEdwardsPoint(object): - """Abstract class for point an a quotiented Edwards curve; needs F,a,d,cofactor to work""" - def __init__(self,x=0,y=1): - x = self.x = self.F(x) - y = self.y = self.F(y) - if y^2 + self.a*x^2 != 1 + self.d*x^2*y^2: - raise NotOnCurveException(str(self)) - - def __repr__(self): - return "%s(0x%x,0x%x)" % (self.__class__.__name__, self.x, self.y) - - def __iter__(self): - yield self.x - yield self.y - - def __add__(self,other): - x,y = self - X,Y = other - a,d = self.a,self.d - return self.__class__( - (x*Y+y*X)/(1+d*x*y*X*Y), - (y*Y-a*x*X)/(1-d*x*y*X*Y) - ) - - def __neg__(self): return self.__class__(-self.x,self.y) - def __sub__(self,other): return self + (-other) - def __rmul__(self,other): return self*other - def __eq__(self,other): - """NB: this is the only method that is different from the usual one""" - x,y = self - X,Y = other - return x*Y == X*y or (self.cofactor==8 and -self.a*x*X == y*Y) - def __ne__(self,other): return not (self==other) - - def __mul__(self,exp): - exp = int(exp) - if exp < 0: exp,self = -exp,-self - total = self.__class__() - work = self - while exp != 0: - if exp & 1: total += work - work += work - exp >>= 1 - return total - - def xyzt(self): - x,y = self - z = self.F.random_element() - return x*z,y*z,z,x*y*z - - def torque(self): - """Apply cofactor group, except keeping the point even""" - if self.cofactor == 8: - if self.a == -1: return self.__class__(self.y*self.i, self.x*self.i) - if self.a == 1: return self.__class__(-self.y, self.x) - else: - return self.__class__(-self.x, -self.y) - - def doubleAndEncodeSpec(self): - return (self+self).encode() - - # Utility functions - @classmethod - def bytesToGf(cls,bytes,mustBeProper=True,mustBePositive=False,maskHiBits=False): - """Convert little-endian bytes to field element, sanity check length""" - if len(bytes) != cls.encLen: - raise InvalidEncodingException("wrong length %d" % len(bytes)) - s = dec_le(bytes) - if mustBeProper and s >= cls.F.order(): - raise InvalidEncodingException("%d out of range!" % s) - bitlen = int(ceil(log(cls.F.order())/log(2))) - if maskHiBits: s &= 2^bitlen-1 - s = cls.F(s) - if mustBePositive and negative(s): - raise InvalidEncodingException("%d is negative!" % s) - return s - - @classmethod - def gfToBytes(cls,x,mustBePositive=False): - """Convert little-endian bytes to field element, sanity check length""" - if negative(x) and mustBePositive: x = -x - return enc_le(x,cls.encLen) - -class RistrettoPoint(QuotientEdwardsPoint): - """The new Ristretto group""" - def encodeSpec(self): - """Unoptimized specification for encoding""" - x,y = self - if self.cofactor==8 and (negative(x*y) or y==0): (x,y) = self.torque() - if y == -1: y = 1 # Avoid divide by 0; doesn't affect impl - - if negative(x): x,y = -x,-y - s = xsqrt(self.mneg*(1-y)/(1+y),exn=Exception("Unimplemented: point is odd: " + str(self))) - return self.gfToBytes(s) - - @classmethod - def decodeSpec(cls,s): - """Unoptimized specification for decoding""" - s = cls.bytesToGf(s,mustBePositive=True) - - a,d = cls.a,cls.d - x = xsqrt(4*s^2 / (a*d*(1+a*s^2)^2 - (1-a*s^2)^2)) - y = (1+a*s^2) / (1-a*s^2) - - if cls.cofactor==8 and (negative(x*y) or y==0): - raise InvalidEncodingException("x*y has high bit") - - return cls(x,y) - - @optimized_version_of("encodeSpec") - def encode(self): - """Encode, optimized version""" - a,d,mneg = self.a,self.d,self.mneg - x,y,z,t = self.xyzt() - - if self.cofactor==8: - u1 = mneg*(z+y)*(z-y) - u2 = x*y # = t*z - isr = isqrt(u1*u2^2) - i1 = isr*u1 # sqrt(mneg*(z+y)*(z-y))/(x*y) - i2 = isr*u2 # 1/sqrt(a*(y+z)*(y-z)) - z_inv = i1*i2*t # 1/z - - if negative(t*z_inv): - if a==-1: - x,y = y*self.i,x*self.i - den_inv = self.magic * i1 - else: - x,y = -y,x - den_inv = self.i * self.magic * i1 - - else: - den_inv = i2 - - if negative(x*z_inv): y = -y - s = (z-y) * den_inv - else: - num = mneg*(z+y)*(z-y) - isr = isqrt(num*y^2) - if negative(isr^2*num*y*t): y = -y - s = isr*y*(z-y) - - return self.gfToBytes(s,mustBePositive=True) - - @optimized_version_of("doubleAndEncodeSpec") - def doubleAndEncode(self): - X,Y,Z,T = self.xyzt() - a,d,mneg = self.a,self.d,self.mneg - - if self.cofactor==8: - e = 2*X*Y - f = Z^2+d*T^2 - g = Y^2-a*X^2 - h = Z^2-d*T^2 - - inv1 = 1/(e*f*g*h) - z_inv = inv1*e*g # 1 / (f*h) - t_inv = inv1*f*h - - if negative(e*g*z_inv): - if a==-1: sqrta = self.i - else: sqrta = -1 - e,f,g,h = g,h,-e,f*sqrta - factor = self.i - else: - factor = self.magic - - if negative(h*e*z_inv): g=-g - s = (h-g)*factor*g*t_inv - - else: - foo = Y^2+a*X^2 - bar = X*Y - den = 1/(foo*bar) - if negative(2*bar^2*den): tmp = a*X^2 - else: tmp = Y^2 - s = self.magic*(Z^2-tmp)*foo*den - - return self.gfToBytes(s,mustBePositive=True) - - @classmethod - @optimized_version_of("decodeSpec") - def decode(cls,s): - """Decode, optimized version""" - s = cls.bytesToGf(s,mustBePositive=True) - - a,d = cls.a,cls.d - yden = 1-a*s^2 - ynum = 1+a*s^2 - yden_sqr = yden^2 - xden_sqr = a*d*ynum^2 - yden_sqr - - isr = isqrt(xden_sqr * yden_sqr) - - xden_inv = isr * yden - yden_inv = xden_inv * isr * xden_sqr - - x = 2*s*xden_inv - if negative(x): x = -x - y = ynum * yden_inv - - if cls.cofactor==8 and (negative(x*y) or y==0): - raise InvalidEncodingException("x*y is invalid: %d, %d" % (x,y)) - - return cls(x,y) - - @classmethod - def fromJacobiQuartic(cls,s,t,sgn=1): - """Convert point from its Jacobi Quartic representation""" - a,d = cls.a,cls.d - assert s^4 - 2*cls.a*(1-2*d/(d-a))*s^2 + 1 == t^2 - x = 2*s*cls.magic / t - y = (1+a*s^2) / (1-a*s^2) - return cls(sgn*x,y) - - @classmethod - def elligatorSpec(cls,r0): - a,d = cls.a,cls.d - r = cls.qnr * cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)^2 - den = (d*r-a)*(a*r-d) - if den == 0: return cls() - n1 = cls.a*(r+1)*(a+d)*(d-a)/den - n2 = r*n1 - if is_square(n1): - sgn,s,t = 1, xsqrt(n1), -(r-1)*(a+d)^2 / den - 1 - else: - sgn,s,t = -1,-xsqrt(n2), r*(r-1)*(a+d)^2 / den - 1 - - return cls.fromJacobiQuartic(s,t) - - @classmethod - @optimized_version_of("elligatorSpec") - def elligator(cls,r0): - a,d = cls.a,cls.d - r0 = cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True) - r = cls.qnr * r0^2 - den = (d*r-a)*(a*r-d) - num = cls.a*(r+1)*(a+d)*(d-a) - - iss,isri = isqrt_i(num*den) - if iss: sgn,twiddle = 1,1 - else: sgn,twiddle = -1,r0*cls.qnr - isri *= twiddle - s = isri*num - t = -sgn*isri*s*(r-1)*(d+a)^2 - 1 - if negative(s) == iss: s = -s - return cls.fromJacobiQuartic(s,t) - - -class Decaf_1_1_Point(QuotientEdwardsPoint): - """Like current decaf but tweaked for simplicity""" - def encodeSpec(self): - """Unoptimized specification for encoding""" - a,d = self.a,self.d - x,y = self - if x==0 or y==0: return(self.gfToBytes(0)) - - if self.cofactor==8 and negative(x*y*self.isoMagic): - x,y = self.torque() - - sr = xsqrt(1-a*x^2) - altx = x*y*self.isoMagic / sr - if negative(altx): s = (1+sr)/x - else: s = (1-sr)/x - - return self.gfToBytes(s,mustBePositive=True) - - @classmethod - def decodeSpec(cls,s): - """Unoptimized specification for decoding""" - a,d = cls.a,cls.d - s = cls.bytesToGf(s,mustBePositive=True) - - if s==0: return cls() - t = xsqrt(s^4 + 2*(a-2*d)*s^2 + 1) - altx = 2*s*cls.isoMagic/t - if negative(altx): t = -t - x = 2*s / (1+a*s^2) - y = (1-a*s^2) / t - - if cls.cofactor==8 and (negative(x*y*cls.isoMagic) or y==0): - raise InvalidEncodingException("x*y is invalid: %d, %d" % (x,y)) - - return cls(x,y) - - def toJacobiQuartic(self,toggle_rotation=False,toggle_altx=False,toggle_s=False): - "Return s,t on jacobi curve" - a,d = self.a,self.d - x,y,z,t = self.xyzt() - - if self.cofactor == 8: - # Cofactor 8 version - # Simulate IMAGINE_TWIST because that's how libdecaf does it - x = self.i*x - t = self.i*t - a = -a - d = -d - - # OK, the actual libdecaf code should be here - num = (z+y)*(z-y) - den = x*y - isr = isqrt(num*(a-d)*den^2) - - iden = isr * den * self.isoMagic # 1/sqrt((z+y)(z-y)) = 1/sqrt(1-Y^2) / z - inum = isr * num # sqrt(1-Y^2) * z / xysqrt(a-d) ~ 1/sqrt(1-ax^2)/z - - if negative(iden*inum*self.i*t^2*(d-a)) != toggle_rotation: - iden,inum = inum,iden - fac = x*sqrt(a) - toggle=(a==-1) - else: - fac = y - toggle=False - - imi = self.isoMagic * self.i - altx = inum*t*imi - neg_altx = negative(altx) != toggle_altx - if neg_altx != toggle: inum =- inum - - tmp = fac*(inum*z + 1) - s = iden*tmp*imi - - negm1 = (negative(s) != toggle_s) != neg_altx - if negm1: m1 = a*fac + z - else: m1 = a*fac - z - - swap = toggle_s - - else: - # Much simpler cofactor 4 version - num = (x+t)*(x-t) - isr = isqrt(num*(a-d)*x^2) - ratio = isr*num - altx = ratio*self.isoMagic - - neg_altx = negative(altx) != toggle_altx - if neg_altx: ratio =- ratio - - tmp = ratio*z - t - s = (a-d)*isr*x*tmp - - negx = (negative(s) != toggle_s) != neg_altx - if negx: m1 = -a*t + x - else: m1 = -a*t - x - - swap = toggle_s - - if negative(s): s = -s - - return s,m1,a*tmp,swap - - def invertElligator(self,toggle_r=False,*args,**kwargs): - "Produce preimage of self under elligator, or None" - a,d = self.a,self.d - - rets = [] - - tr = [False,True] if self.cofactor == 8 else [False] - for toggle_rotation in tr: - for toggle_altx in [False,True]: - for toggle_s in [False,True]: - for toggle_r in [False,True]: - s,m1,m12,swap = self.toJacobiQuartic(toggle_rotation,toggle_altx,toggle_s) - - #print - #print toggle_rotation,toggle_altx,toggle_s - #print m1 - #print m12 - - - if self == self.__class__(): - if self.cofactor == 4: - # Hacks for identity! - if toggle_altx: m12 = 1 - elif toggle_s: m1 = 1 - elif toggle_r: continue - ## BOTH??? - - else: - m12 = 1 - imi = self.isoMagic * self.i - if toggle_rotation: - if toggle_altx: m1 = -imi - else: m1 = +imi - else: - if toggle_altx: m1 = 0 - else: m1 = a-d - - rnum = (d*a*m12-m1) - rden = ((d*a-1)*m12+m1) - if swap: rnum,rden = rden,rnum - - ok,sr = isqrt_i(rnum*rden*self.qnr) - if not ok: continue - sr *= rnum - #print "Works! %d %x" % (swap,sr) - - if negative(sr) != toggle_r: sr = -sr - ret = self.gfToBytes(sr) - if self.elligator(ret) != self and self.elligator(ret) != -self: - print "WRONG!",[toggle_rotation,toggle_altx,toggle_s] - if self.elligator(ret) == -self and self != -self: print "Negated!",[toggle_rotation,toggle_altx,toggle_s] - rets.append(bytes(ret)) - return rets - - @optimized_version_of("encodeSpec") - def encode(self): - """Encode, optimized version""" - return self.gfToBytes(self.toJacobiQuartic()[0]) - - @classmethod - @optimized_version_of("decodeSpec") - def decode(cls,s): - """Decode, optimized version""" - a,d = cls.a,cls.d - s = cls.bytesToGf(s,mustBePositive=True) - - #if s==0: return cls() - s2 = s^2 - den = 1+a*s2 - num = den^2 - 4*d*s2 - isr = isqrt(num*den^2) - altx = 2*s*isr*den*cls.isoMagic - if negative(altx): isr = -isr - x = 2*s *isr^2*den*num - y = (1-a*s^2) * isr*den - - if cls.cofactor==8 and (negative(x*y*cls.isoMagic) or y==0): - raise InvalidEncodingException("x*y is invalid: %d, %d" % (x,y)) - - return cls(x,y) - - @classmethod - def fromJacobiQuartic(cls,s,t,sgn=1): - """Convert point from its Jacobi Quartic representation""" - a,d = cls.a,cls.d - if s==0: return cls() - x = 2*s / (1+a*s^2) - y = (1-a*s^2) / t - return cls(x,sgn*y) - - @optimized_version_of("doubleAndEncodeSpec") - def doubleAndEncode(self): - X,Y,Z,T = self.xyzt() - a,d = self.a,self.d - - if self.cofactor == 8: - # Cofactor 8 version - # Simulate IMAGINE_TWIST because that's how libdecaf does it - X = self.i*X - T = self.i*T - a = -a - d = -d - # TODO: This is only being called for a=-1, so could - # be wrong for a=1 - - e = 2*X*Y - f = Y^2+a*X^2 - g = Y^2-a*X^2 - h = Z^2-d*T^2 - - eim = e*self.isoMagic - inv = 1/(eim*g*f*h) - fh_inv = eim*g*inv*self.i - - if negative(eim*g*fh_inv): - idf = g*self.isoMagic*self.i - bar = f - foo = g - test = eim*f - else: - idf = eim - bar = h - foo = -eim - test = g*h - - if negative(test*fh_inv): bar =- bar - s = idf*(foo+bar)*inv*f*h - - else: - xy = X*Y - h = Z^2-d*T^2 - inv = 1/(xy*h) - if negative(inv*2*xy^2*self.isoMagic): tmp = Y - else: tmp = X - s = tmp^2*h*inv # = X/Y or Y/X, interestingly - - return self.gfToBytes(s,mustBePositive=True) - - @classmethod - def elligatorSpec(cls,r0,fromR=False): - a,d = cls.a,cls.d - if fromR: r = r0 - else: r = cls.qnr * cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)^2 - - den = (d*r-(d-a))*((d-a)*r-d) - if den == 0: return cls() - n1 = (r+1)*(a-2*d)/den - n2 = r*n1 - if is_square(n1): - sgn,s,t = 1, xsqrt(n1), -(r-1)*(a-2*d)^2 / den - 1 - else: - sgn,s,t = -1, -xsqrt(n2), r*(r-1)*(a-2*d)^2 / den - 1 - - return cls.fromJacobiQuartic(s,t) - - @classmethod - @optimized_version_of("elligatorSpec") - def elligator(cls,r0): - a,d = cls.a,cls.d - r0 = cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True) - r = cls.qnr * r0^2 - den = (d*r-(d-a))*((d-a)*r-d) - num = (r+1)*(a-2*d) - - iss,isri = isqrt_i(num*den) - if iss: sgn,twiddle = 1,1 - else: sgn,twiddle = -1,r0*cls.qnr - isri *= twiddle - s = isri*num - t = -sgn*isri*s*(r-1)*(a-2*d)^2 - 1 - if negative(s) == iss: s = -s - return cls.fromJacobiQuartic(s,t) - - def elligatorInverseBruteForce(self): - """Invert Elligator using SAGE's polynomial solver""" - a,d = self.a,self.d - R. = self.F[] - r = self.qnr * r0^2 - den = (d*r-(d-a))*((d-a)*r-d) - n1 = (r+1)*(a-2*d)/den - n2 = r*n1 - ret = set() - for s2,t in [(n1, -(r-1)*(a-2*d)^2 / den - 1), - (n2,r*(r-1)*(a-2*d)^2 / den - 1)]: - x2 = 4*s2/(1+a*s2)^2 - y = (1-a*s2) / t - - selfT = self - for i in xrange(self.cofactor/2): - xT,yT = selfT - polyX = xT^2-x2 - polyY = yT-y - sx = set(r for r,_ in polyX.numerator().roots()) - sy = set(r for r,_ in polyY.numerator().roots()) - ret = ret.union(sx.intersection(sy)) - - selfT = selfT.torque() - - ret = [self.gfToBytes(r) for r in ret] - - for r in ret: - assert self.elligator(r) in [self,-self] - - ret = [r for r in ret if self.elligator(r) == self] - - return ret - -class Ed25519Point(RistrettoPoint): - F = GF(2^255-19) - d = F(-121665/121666) - a = F(-1) - i = sqrt(F(-1)) - mneg = F(1) - qnr = i - magic = isqrt(a*d-1) - cofactor = 8 - encLen = 32 - - @classmethod - def base(cls): - return cls( 15112221349535400772501151409588531511454012693041857206046113283949847762202, 46316835694926478169428394003475163141307993866256225615783033603165251855960 - ) - -class NegEd25519Point(RistrettoPoint): - F = GF(2^255-19) - d = F(121665/121666) - a = F(1) - i = sqrt(F(-1)) - mneg = F(-1) # TODO checkme vs 1-ad or whatever - qnr = i - magic = isqrt(a*d-1) - cofactor = 8 - encLen = 32 - - @classmethod - def base(cls): - y = cls.F(4/5) - x = sqrt((y^2-1)/(cls.d*y^2-cls.a)) - if negative(x): x = -x - return cls(x,y) - -class IsoEd448Point(RistrettoPoint): - F = GF(2^448-2^224-1) - d = F(39082/39081) - a = F(1) - mneg = F(-1) - qnr = -1 - magic = isqrt(a*d-1) - cofactor = 4 - encLen = 56 - - @classmethod - def base(cls): - return cls( # RFC has it wrong - 345397493039729516374008604150537410266655260075183290216406970281645695073672344430481787759340633221708391583424041788924124567700732, - -363419362147803445274661903944002267176820680343659030140745099590306164083365386343198191849338272965044442230921818680526749009182718 - ) - -class TwistedEd448GoldilocksPoint(Decaf_1_1_Point): - F = GF(2^448-2^224-1) - d = F(-39082) - a = F(-1) - qnr = -1 - cofactor = 4 - encLen = 56 - isoMagic = IsoEd448Point.magic - - @classmethod - def base(cls): - return cls.decodeSpec(Ed448GoldilocksPoint.base().encodeSpec()) - -class Ed448GoldilocksPoint(Decaf_1_1_Point): - F = GF(2^448-2^224-1) - d = F(-39081) - a = F(1) - qnr = -1 - cofactor = 4 - encLen = 56 - isoMagic = IsoEd448Point.magic - - @classmethod - def base(cls): - return 2*cls( - 224580040295924300187604334099896036246789641632564134246125461686950415467406032909029192869357953282578032075146446173674602635247710, 298819210078481492676017930443930673437544040154080242095928241372331506189835876003536878655418784733982303233503462500531545062832660 - ) - -class IsoEd25519Point(Decaf_1_1_Point): - # TODO: twisted iso too! - # TODO: twisted iso might have to IMAGINE_TWIST or whatever - F = GF(2^255-19) - d = F(-121665) - a = F(1) - i = sqrt(F(-1)) - qnr = i - magic = isqrt(a*d-1) - cofactor = 8 - encLen = 32 - isoMagic = Ed25519Point.magic - isoA = Ed25519Point.a - - @classmethod - def base(cls): - return cls.decodeSpec(Ed25519Point.base().encode()) - -class TestFailedException(Exception): pass - -def test(cls,n): - print "Testing curve %s" % cls.__name__ - - specials = [1] - ii = cls.F(-1) - while is_square(ii): - specials.append(ii) - ii = sqrt(ii) - specials.append(ii) - for i in specials: - if negative(cls.F(i)): i = -i - i = enc_le(i,cls.encLen) - try: - Q = cls.decode(i) - QE = Q.encode() - if QE != i: - raise TestFailedException("Round trip special %s != %s" % - (binascii.hexlify(QE),binascii.hexlify(i))) - except NotOnCurveException: pass - except InvalidEncodingException: pass - - - P = cls.base() - Q = cls() - for i in xrange(n): - #print binascii.hexlify(Q.encode()) - QE = Q.encode() - QQ = cls.decode(QE) - if QQ != Q: raise TestFailedException("Round trip %s != %s" % (str(QQ),str(Q))) - - # Testing s -> 1/s: encodes -point on cofactor - s = cls.bytesToGf(QE) - if s != 0: - ss = cls.gfToBytes(1/s,mustBePositive=True) - try: - QN = cls.decode(ss) - if cls.cofactor == 8: - raise TestFailedException("1/s shouldnt work for cofactor 8") - if QN != -Q: - raise TestFailedException("s -> 1/s should negate point for cofactor 4") - except InvalidEncodingException as e: - # Should be raised iff cofactor==8 - if cls.cofactor == 4: - raise TestFailedException("s -> 1/s should work for cofactor 4") - - QT = Q - for h in xrange(cls.cofactor): - QT = QT.torque() - if QT.encode() != QE: - raise TestFailedException("Can't torque %s,%d" % (str(Q),h+1)) - - Q0 = Q + P - if Q0 == Q: raise TestFailedException("Addition doesn't work") - if Q0-P != Q: raise TestFailedException("Subtraction doesn't work") - - r = randint(1,1000) - Q1 = Q0*r - Q2 = Q0*(r+1) - if Q1 + Q0 != Q2: raise TestFailedException("Scalarmul doesn't work") - Q = Q1 - -def testElligator(cls,n): - print "Testing elligator on %s" % cls.__name__ - for i in xrange(n): - r = randombytes(cls.encLen) - P = cls.elligator(r) - if hasattr(P,"invertElligator"): - iv = P.invertElligator() - modr = bytes(cls.gfToBytes(cls.bytesToGf(r,mustBeProper=False,maskHiBits=True))) - iv2 = P.torque().invertElligator() - if modr not in iv: print "Failed to invert Elligator!" - if len(iv) != len(set(iv)): - print "Elligator inverses not unique!", len(set(iv)), len(iv) - if iv != iv2: - print "Elligator is untorqueable!" - #print [binascii.hexlify(j) for j in iv] - #print [binascii.hexlify(j) for j in iv2] - #break - else: - pass # TODO - -def gangtest(classes,n): - print "Gang test",[cls.__name__ for cls in classes] - specials = [1] - ii = classes[0].F(-1) - while is_square(ii): - specials.append(ii) - ii = sqrt(ii) - specials.append(ii) - - for i in xrange(n): - rets = [bytes((cls.base()*i).encode()) for cls in classes] - if len(set(rets)) != 1: - print "Divergence in encode at %d" % i - for c,ret in zip(classes,rets): - print c,binascii.hexlify(ret) - print - - if i < len(specials): r0 = enc_le(specials[i],classes[0].encLen) - else: r0 = randombytes(classes[0].encLen) - - rets = [bytes((cls.elligator(r0)*i).encode()) for cls in classes] - if len(set(rets)) != 1: - print "Divergence in elligator at %d" % i - for c,ret in zip(classes,rets): - print c,binascii.hexlify(ret) - print - -def testDoubleAndEncode(cls,n): - print "Testing doubleAndEncode on %s" % cls.__name__ - for i in xrange(n): - r1 = randombytes(cls.encLen) - r2 = randombytes(cls.encLen) - u = cls.elligator(r1) + cls.elligator(r2) - u.doubleAndEncode() - -testDoubleAndEncode(Ed25519Point,100) -testDoubleAndEncode(NegEd25519Point,100) -testDoubleAndEncode(IsoEd25519Point,100) -testDoubleAndEncode(IsoEd448Point,100) -testDoubleAndEncode(TwistedEd448GoldilocksPoint,100) -#test(Ed25519Point,100) -#test(NegEd25519Point,100) -#test(IsoEd25519Point,100) -#test(IsoEd448Point,100) -#test(TwistedEd448GoldilocksPoint,100) -#test(Ed448GoldilocksPoint,100) -#testElligator(Ed25519Point,100) -#testElligator(NegEd25519Point,100) -#testElligator(IsoEd25519Point,100) -#testElligator(IsoEd448Point,100) -#testElligator(Ed448GoldilocksPoint,100) -#testElligator(TwistedEd448GoldilocksPoint,100) -#gangtest([IsoEd448Point,TwistedEd448GoldilocksPoint,Ed448GoldilocksPoint],100) -#gangtest([Ed25519Point,IsoEd25519Point],100)