diff --git a/src/aztec/gpu/headers/affine.cuh b/src/aztec/gpu/headers/affine.cuh
new file mode 100644
index 00000000..8bf0bc75
--- /dev/null
+++ b/src/aztec/gpu/headers/affine.cuh
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "./fields.cuh"
+
+template <typename FF>
+class Affine
+{
+public:
+  FF x;
+  FF y;
+
+  static HOST_DEVICE_INLINE Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
+
+  friend HOST_DEVICE_INLINE bool operator==(const Affine& xs, const Affine& ys)
+  {
+    return (xs.x == ys.x) && (xs.y == ys.y);
+  }
+
+  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Affine& point)
+  {
+    os << "x: " << point.x << "; y: " << point.y;
+    return os;
+  }
+};
diff --git a/src/aztec/gpu/headers/curve_config.cuh b/src/aztec/gpu/headers/curve_config.cuh
new file mode 100644
index 00000000..1e7b5bbb
--- /dev/null
+++ b/src/aztec/gpu/headers/curve_config.cuh
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "./fields.cuh"
+#include "./projective.cuh"
+#if defined(G2_DEFINED)
+#include "../../primitives/extension_field.cuh"
+#endif
+
+#include "params.cuh"
+
+namespace BN254 {
+  typedef Field<PARAMS_BN254::fp_config> scalar_field_t;
+  typedef scalar_field_t scalar_t;
+  typedef Field<PARAMS_BN254::fq_config> point_field_t;
+  static constexpr point_field_t b = point_field_t{PARAMS_BN254::weierstrass_b};
+  typedef Projective<point_field_t, scalar_field_t, b> projective_t;
+  typedef Affine<point_field_t> affine_t;
+#if defined(G2_DEFINED)
+  typedef ExtensionField<PARAMS_BN254::fq_config> g2_point_field_t;
+  static constexpr g2_point_field_t b_g2 = g2_point_field_t{
+    point_field_t{PARAMS_BN254::weierstrass_b_g2_re}, point_field_t{PARAMS_BN254::weierstrass_b_g2_im}};
+  typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
+  typedef Affine<g2_point_field_t> g2_affine_t;
+#endif
+} // namespace BN254
\ No newline at end of file
diff --git a/src/aztec/gpu/headers/extension_field.cuh b/src/aztec/gpu/headers/extension_field.cuh
new file mode 100644
index 00000000..3189469a
--- /dev/null
+++ b/src/aztec/gpu/headers/extension_field.cuh
@@ -0,0 +1,158 @@
+#pragma once
+
+#include "field.cuh"
+
+#define HOST_INLINE __host__ __forceinline__
+#define DEVICE_INLINE __device__ __forceinline__
+#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+
+template <typename CONFIG>
+class ExtensionField
+{
+private:
+  typedef typename Field<CONFIG>::Wide FWide;
+
+  struct ExtensionWide {
+    FWide real;
+    FWide imaginary;
+
+    friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real + ys.real, xs.imaginary + ys.imaginary};
+    }
+
+    friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real - ys.real, xs.imaginary - ys.imaginary};
+    }
+  };
+
+public:
+  typedef Field<CONFIG> FF;
+  static constexpr unsigned TLC = 2 * CONFIG::limbs_count;
+
+  FF real;
+  FF imaginary;
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField generator_x()
+  {
+    return ExtensionField{FF{CONFIG::g2_gen_x_re}, FF{CONFIG::g2_gen_x_im}};
+  }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField generator_y()
+  {
+    return ExtensionField{FF{CONFIG::g2_gen_y_re}, FF{CONFIG::g2_gen_y_im}};
+  }
+
+  static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
+
+  template <unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
+  {
+    return ExtensionField{FF::template sub_modulus<REDUCTION_SIZE>(xs.real),
+                          FF::template sub_modulus<REDUCTION_SIZE>(xs.imaginary)};
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const ExtensionField& xs)
+  {
+    os << "{ Real: " << xs.real << " }; { Imaginary: " << xs.imaginary << " }";
+    return os;
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real + ys.real, xs.imaginary + ys.imaginary};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real - ys.real, xs.imaginary - ys.imaginary};
+  }
+
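+  // Note on mul_wide below: it computes the degree-2 extension product with three base-field
+  // multiplications instead of four (a Karatsuba-style identity). For xs = a + b*i and
+  // ys = c + d*i, with i^2 given by CONFIG::i_squared (negated when i_squared_is_negative):
+  //   real part      = a*c + i^2 * (b*d)
+  //   imaginary part = (a + b) * (c + d) - a*c - b*d
+  // In the code, a*c is real_prod, b*d is imaginary_prod, and (a + b)(c + d) is prod_of_sums.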
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    FWide real_prod = FF::mul_wide(xs.real, ys.real);
+    FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
+    FWide prod_of_sums = FF::mul_wide(xs.real + xs.imaginary, ys.real + ys.imaginary);
+    FWide i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
+    i_sq_times_im = CONFIG::i_squared_is_negative ? FWide::neg(i_sq_times_im) : i_sq_times_im;
+    return ExtensionWide{real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
+  {
+    return ExtensionField{
+      FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary)};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator*(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    ExtensionWide xy = mul_wide(xs, ys);
+    return reduce(xy);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    return (xs.real == ys.real) && (xs.imaginary == ys.imaginary);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
+
+  template <const ExtensionField& multiplier>
+  static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField& xs)
+  {
+    static constexpr FF mul_real = multiplier.real;
+    static constexpr FF mul_imaginary = multiplier.imaginary;
+    const FF xs_real = xs.real;
+    const FF xs_imaginary = xs.imaginary;
+    FF real_prod = FF::template mul_const<mul_real>(xs_real);
+    FF imaginary_prod = FF::template mul_const<mul_imaginary>(xs_imaginary);
+    FF re_im = FF::template mul_const<mul_real>(xs_imaginary);
+    FF im_re = FF::template mul_const<mul_imaginary>(xs_real);
+    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
+    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
+    return ExtensionField{real_prod + i_sq_times_im, re_im + im_re};
+  }
+
+  template <unsigned multiplier>
+  static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
+  {
+    return {FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.imaginary)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return mul_wide(xs, xs);
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return xs * xs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
+  {
+    return ExtensionField{FF::neg(xs.real), FF::neg(xs.imaginary)};
+  }
+
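+  // The inverse below uses the conjugate trick: for nonzero xs = a + b*i,
+  //   xs^-1 = conjugate(xs) / norm(xs),
+  // where conjugate(xs) = a - b*i and norm(xs) = a^2 - i^2 * b^2 lies in the base field,
+  // so a single base-field inversion suffices.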
+  // inverse assumes that xs is nonzero
+  static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
+  {
+    ExtensionField xs_conjugate = {xs.real, FF::neg(xs.imaginary)};
+    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(FF::sqr(xs.imaginary));
+    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
+    // TODO: wide here
+    FF xs_norm_squared = FF::sqr(xs.real) - i_sq_times_im;
+    return xs_conjugate * ExtensionField{FF::inverse(xs_norm_squared), FF::zero()};
+  }
+};
diff --git a/src/aztec/gpu/headers/fields.cuh b/src/aztec/gpu/headers/fields.cuh
new file mode 100644
index 00000000..4cacc117
--- /dev/null
+++ b/src/aztec/gpu/headers/fields.cuh
@@ -0,0 +1,942 @@
+#pragma once
+
+#include "../utils/host_math.cuh"
+#include "../utils/ptx.cuh"
+#include "../utils/storage.cuh"
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+
+#define HOST_INLINE __host__ __forceinline__
+#define DEVICE_INLINE __device__ __forceinline__
+#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+
+template <class CONFIG>
+class Field
+{
+public:
+  static constexpr unsigned TLC = CONFIG::limbs_count;
+  static constexpr unsigned NBITS = CONFIG::modulus_bit_count;
+
+  static constexpr HOST_DEVICE_INLINE Field zero() { return Field{CONFIG::zero}; }
+
+  static constexpr HOST_DEVICE_INLINE Field one() { return Field{CONFIG::one}; }
+
+  static constexpr HOST_DEVICE_INLINE Field from(uint32_t value)
+  {
+    storage<TLC> scalar;
+    scalar.limbs[0] = value;
+    for (int i = 1; i < TLC; i++) {
+      scalar.limbs[i] = 0;
+    }
+    return Field{scalar};
+  }
+
+  static constexpr HOST_DEVICE_INLINE Field generator_x() { return Field{CONFIG::g1_gen_x}; }
+
+  static constexpr HOST_DEVICE_INLINE Field generator_y() { return Field{CONFIG::g1_gen_y}; }
+
+  static HOST_INLINE Field omega(uint32_t logn)
+  {
+    if (logn == 0) { return Field{CONFIG::one}; }
+
+    if (logn > CONFIG::omegas_count) { throw std::invalid_argument("Field: Invalid omega index"); }
+
+    storage_array<CONFIG::omegas_count, TLC> const omega = CONFIG::omega;
+    return Field{omega.storages[logn - 1]};
+  }
+
+  static HOST_INLINE Field omega_inv(uint32_t logn)
+  {
+    if (logn == 0) { return Field{CONFIG::one}; }
+
+    if (logn > CONFIG::omegas_count) { throw std::invalid_argument("Field: Invalid omega_inv index"); }
+
+    storage_array<CONFIG::omegas_count, TLC> const omega_inv = CONFIG::omega_inv;
+    return Field{omega_inv.storages[logn - 1]};
+  }
+
+  static HOST_INLINE Field inv_log_size(uint32_t logn)
+  {
+    if (logn == 0) { return Field{CONFIG::one}; }
+
+    if (logn > CONFIG::omegas_count) { throw std::invalid_argument("Field: Invalid inv index"); }
+    storage_array<CONFIG::omegas_count, TLC> const inv = CONFIG::inv;
+    return Field{inv.storages[logn - 1]};
+  }
+
+  static constexpr HOST_DEVICE_INLINE Field
modulus() { return Field{CONFIG::modulus}; } + + static constexpr HOST_DEVICE_INLINE Field montgomery_r() { return Field{CONFIG::montgomery_r}; } + + static constexpr HOST_DEVICE_INLINE Field montgomery_r_inv() { return Field{CONFIG::montgomery_r_inv}; } + + // private: + typedef storage ff_storage; + typedef storage<2 * TLC> ff_wide_storage; + + static constexpr unsigned slack_bits = 32 * TLC - NBITS; + + struct Wide { + ff_wide_storage limbs_storage; + + static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs) + { + Field out{}; +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (unsigned i = 0; i < TLC; i++) + out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i]; + return out; + } + + static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs) + { + Field out{}; +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (unsigned i = 0; i < TLC; i++) { +#ifdef __CUDA_ARCH__ + out.limbs_storage.limbs[i] = + __funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], slack_bits); +#else + out.limbs_storage.limbs[i] = + (xs.limbs_storage.limbs[i + TLC] << slack_bits) + (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - slack_bits)); +#endif + } + return out; + } + + template + static constexpr HOST_DEVICE_INLINE Wide sub_modulus_squared(const Wide& xs) + { + if (REDUCTION_SIZE == 0) return xs; + const ff_wide_storage modulus = get_modulus_squared(); + Wide rs = {}; + return sub_limbs(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs; + } + + template + static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs) + { + const ff_wide_storage modulus = get_modulus_squared(); + Wide rs = {}; + sub_limbs(modulus, xs.limbs_storage, rs.limbs_storage); + return rs; + } + + friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys) + { + Wide rs = {}; + add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); + return sub_modulus_squared<1>(rs); + } + + friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys) + { + Wide rs = {}; + uint32_t carry = sub_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); + if (carry == 0) return rs; + const ff_wide_storage modulus = get_modulus_squared<1>(); + add_limbs(rs.limbs_storage, modulus, rs.limbs_storage); + return rs; + } + }; + + // return modulus + template + static constexpr HOST_DEVICE_INLINE ff_storage get_modulus() + { + switch (MULTIPLIER) { + case 1: + return CONFIG::modulus; + case 2: + return CONFIG::modulus_2; + case 4: + return CONFIG::modulus_4; + default: + return {}; + } + } + + template + static constexpr HOST_DEVICE_INLINE ff_wide_storage modulus_wide() + { + return CONFIG::modulus_wide; + } + + // return m + static constexpr HOST_DEVICE_INLINE ff_storage get_m() { return CONFIG::m; } + + // return modulus^2, helpful for ab +/- cd + template + static constexpr HOST_DEVICE_INLINE ff_wide_storage get_modulus_squared() + { + switch (MULTIPLIER) { + case 1: + return CONFIG::modulus_squared; + case 2: + return CONFIG::modulus_squared_2; + case 4: + return CONFIG::modulus_squared_4; + default: + return {}; + } + } + + // add or subtract limbs + template + static constexpr DEVICE_INLINE uint32_t + add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs) + { + const uint32_t* x = xs.limbs; + const uint32_t* y = ys.limbs; + uint32_t* r = rs.limbs; + r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]); +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (unsigned i = 1; i < (CARRY_OUT ? 
TLC : TLC - 1); i++) + r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]); + if (!CARRY_OUT) { + r[TLC - 1] = SUBTRACT ? ptx::subc(x[TLC - 1], y[TLC - 1]) : ptx::addc(x[TLC - 1], y[TLC - 1]); + return 0; + } + return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0); + } + + template + static constexpr DEVICE_INLINE uint32_t + add_sub_limbs_device(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs) + { + const uint32_t* x = xs.limbs; + const uint32_t* y = ys.limbs; + uint32_t* r = rs.limbs; + r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]); +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (unsigned i = 1; i < (CARRY_OUT ? 2 * TLC : 2 * TLC - 1); i++) + r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]); + if (!CARRY_OUT) { + r[2 * TLC - 1] = SUBTRACT ? ptx::subc(x[2 * TLC - 1], y[2 * TLC - 1]) : ptx::addc(x[2 * TLC - 1], y[2 * TLC - 1]); + return 0; + } + return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0); + } + + template + static constexpr HOST_INLINE uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs) + { + const uint32_t* x = xs.limbs; + const uint32_t* y = ys.limbs; + uint32_t* r = rs.limbs; + uint32_t carry = 0; + host_math::carry_chain chain; + for (unsigned i = 0; i < TLC; i++) + r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry); + return CARRY_OUT ? carry : 0; + } + + template + static constexpr HOST_INLINE uint32_t + add_sub_limbs_host(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs) + { + const uint32_t* x = xs.limbs; + const uint32_t* y = ys.limbs; + uint32_t* r = rs.limbs; + uint32_t carry = 0; + host_math::carry_chain<2 * TLC, false, CARRY_OUT> chain; + for (unsigned i = 0; i < 2 * TLC; i++) + r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry); + return CARRY_OUT ? 
carry : 0; + } + + static constexpr HOST_INLINE uint32_t + sub_limbs_partial_host(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs) + { + uint32_t carry = 0; + host_math::carry_chain<2 * TLC, false, true> chain; + for (unsigned i = 0; i < num_limbs; i++) + r[i] = chain.sub(x[i], y[i], carry); + return carry; + } + + template + static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T& xs, const T& ys, T& rs) + { +#ifdef __CUDA_ARCH__ + return add_sub_limbs_device(xs, ys, rs); +#else + return add_sub_limbs_host(xs, ys, rs); +#endif + } + + template + static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs(const T& xs, const T& ys, T& rs) + { +#ifdef __CUDA_ARCH__ + return add_sub_limbs_device(xs, ys, rs); +#else + return add_sub_limbs_host(xs, ys, rs); +#endif + } + + static DEVICE_INLINE void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC) + { +#pragma unroll + for (size_t i = 0; i < n; i += 2) { + acc[i] = ptx::mul_lo(a[i], bi); + acc[i + 1] = ptx::mul_hi(a[i], bi); + } + } + + static DEVICE_INLINE void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0) + { +#pragma unroll + for (size_t i = start_i; i < n; i += 2) { + acc[i] = ptx::mul_lo(a[i], bi); + acc[i + 1] = ptx::mul_hi(a[i], bi); + } + } + + static DEVICE_INLINE void cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC) + { + // multiply scalar by vector + // acc = acc + bi*A[::2] + acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]); + acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]); +#pragma unroll + for (size_t i = 2; i < n; i += 2) { + acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]); + acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]); + } + } + + static DEVICE_INLINE void + cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) + { + // multiply scalar by vector + // acc = acc + bi*A[::2] + acc[a_start_idx] = ptx::mad_lo_cc(a[a_start_idx], bi, acc[a_start_idx]); + acc[a_start_idx + 1] = ptx::madc_hi_cc(a[a_start_idx], bi, acc[a_start_idx + 1]); +#pragma unroll + for (size_t i = a_start_idx + 2; i < n; i += 2) { + acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]); + acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]); + } + } + + static DEVICE_INLINE void mad_row(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC) + { + // odd = odd + bi*A + // even = even + bi*A + cmad_n(odd, a + 1, bi, n - 2); + odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); + odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); + cmad_n(even, a, bi, n); + odd[n - 1] = ptx::addc(odd[n - 1], 0); + } + + static DEVICE_INLINE void + mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) + { + // odd = odd + bi*A + // even = even + bi*A + cmad_n_msb(odd, a + 1, bi, n - 2, a_start_idx - 1); + odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); + odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); + cmad_n_msb(even, a, bi, n, a_start_idx); + odd[n - 1] = ptx::addc(odd[n - 1], 0); + } + + static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { + const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + mul_n(even, a, b[0]); + mul_n(odd, a + 1, b[0]); + mad_row(&even[2], &odd[0], a, b[1]); + size_t i; +#pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i]); + mad_row(&even[i + 2], &odd[i], a, b[i + 1]); + } + // merge |even| and 
|odd|
+    even[1] = ptx::add_cc(even[1], odd[0]);
+    for (i = 1; i < 2 * TLC - 2; i++)
+      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
+    even[i + 1] = ptx::addc(even[i + 1], 0);
+  }
+
+  static DEVICE_INLINE void mult_no_carry(uint32_t a, uint32_t b, uint32_t* r)
+  {
+    r[0] = ptx::mul_lo(a, b);
+    r[1] = ptx::mul_hi(a, b);
+  }
+
+  static DEVICE_INLINE void ingo_multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  {
+    const uint32_t* a = as.limbs;
+    const uint32_t* b = bs.limbs;
+    uint32_t* r = rs.limbs;
+    uint32_t i, j;
+    uint32_t* even = rs.limbs;
+    __align__(8) uint32_t odd[2 * TLC];
+    for (uint32_t i = 0; i < 2 * TLC; i++) {
+      even[i] = 0;
+      odd[i] = 0;
+    }
+    // first row is a special case: no carry in, no carry out. Split into two parts, even and odd.
+    for (i = 0; i < TLC - 1; i += 2) {
+      mult_no_carry(b[0], a[i], &even[i]);
+      mult_no_carry(b[0], a[i + 1], &odd[i]);
+    }
+
+    // process two rows per loop iteration
+    for (i = 1; i < TLC - 1; i += 2) {
+      // odd bi's
+      // multiply-accumulate the even part of the new row with the odd part of the previous row (needs a carry)
+      // j = 0, no carry in, only carry out
+      odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
+      odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
+      // for loop: carry in, carry out
+      for (j = 2; j < TLC; j += 2) // 2, 4, 6
+      {
+        odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
+        odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
+      }
+      odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
+
+      // multiply-accumulate the odd part of the new row with the even part of the previous row (doesn't need a carry)
+      // j = 1, no carry in, only carry out
+      even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
+      even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
+      // for loop: carry in, carry out
+      for (j = 3; j < TLC; j += 2) {
+        even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
+        even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
+      }
+
+      // even bi's
+      // multiply-accumulate the even part of the new row with the even part of the previous row (needs a carry)
+      // j = 0, no carry in, only carry out
+      even[i + 1] = ptx::mad_lo_cc(a[0], b[i + 1], even[i + 1]);
+      even[i + 2] = ptx::madc_hi_cc(a[0], b[i + 1], even[i + 2]);
+      // for loop: carry in, carry out
+      for (j = 2; j < TLC; j += 2) {
+        even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
+        even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
+      }
+      even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry
+
+      // multiply-accumulate the odd part of the new row with the odd part of the previous row
+      // j = 1, no carry in, only carry out
+      odd[i + 1] = ptx::mad_lo_cc(a[1], b[i + 1], odd[i + 1]);
+      odd[i + 2] = ptx::madc_hi_cc(a[1], b[i + 1], odd[i + 2]);
+      // for loop: carry in, carry out
+ for (j = 3; j < TLC; j += 2) { + odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]); + odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]); + } + } + + odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); + odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); + // for loop carry in carry out + for (j = 2; j < TLC; j += 2) { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row + // j = 1, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); + // for loop carry in carry out + for (j = 3; j < TLC; j += 2) { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // add even and odd parts + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void + ingo_msb_multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { + const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* r = rs.limbs; + uint32_t i, j; + uint32_t* even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC]; + for (uint32_t i = 0; i < 2 * TLC; i++) { + even[i] = 0; + odd[i] = 0; + } + // only last element from first row. + mult_no_carry(b[0], a[TLC - 1], &odd[TLC - 2]); + +// doing two rows at one loop +#pragma unroll + for (i = 1; i < TLC - 1; i += 2) { + const uint32_t first_active_j = TLC - 1 - i; + const uint32_t first_active_j_odd = first_active_j + (1 - (first_active_j % 2)); + const uint32_t first_active_j_even = first_active_j + first_active_j % 2; + // odd bi's + // multiply accumulate even part of new row with odd part prev row (needs a carry) + // j = 0, no carry in, only carry out + odd[first_active_j_even + i - 1] = ptx::mad_lo_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i - 1]); + odd[first_active_j_even + i] = ptx::madc_hi_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i]); +// for loop carry in carry out +#pragma unroll + for (j = first_active_j_even + 2; j < TLC; j += 2) { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row (doesnt need a carry) + // j = 1, no carry in, only carry out + even[i + first_active_j_odd] = ptx::mad_lo_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd]); + even[i + first_active_j_odd + 1] = ptx::madc_hi_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd + 1]); +// for loop carry in carry out +#pragma unroll + for (j = first_active_j_odd + 2; j < TLC; j += 2) { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // even bi's + uint32_t const first_active_j1 = TLC - 1 - (i + 1); + uint32_t const first_active_j_odd1 = first_active_j1 + (1 - (first_active_j1 % 2)); + uint32_t const first_active_j_even1 = first_active_j1 + first_active_j1 % 2; + // multiply accumulate even part of new row with even part of prev row // needs a carry + // j = 0, no carry in, only carry out + 
even[first_active_j_even1 + i + 1] =
+        ptx::mad_lo_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 1]);
+      even[first_active_j_even1 + i + 2] =
+        ptx::madc_hi_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 2]);
+// for loop, carry in, carry out.
+#pragma unroll
+      for (j = first_active_j_even1 + 2; j < TLC; j += 2) {
+        even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
+        even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
+      }
+      even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry
+
+      // multiply accumulate odd part of new row with odd part of prev row
+      // j = 1, no carry in, only carry out
+      odd[first_active_j_odd1 + i] = ptx::mad_lo_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i]);
+      odd[first_active_j_odd1 + i + 1] =
+        ptx::madc_hi_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i + 1]);
+// for loop, carry in, carry out.
+#pragma unroll
+      for (j = first_active_j_odd1 + 2; j < TLC; j += 2) {
+        odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]);
+        odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]);
+      }
+    }
+
+    // last round, i = TLC - 1
+    odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
+    odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
+// for loop carry in carry out
+#pragma unroll
+    for (j = 2; j < TLC; j += 2) {
+      odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
+      odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
+    }
+    odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
+
+    // multiply accumulate odd part of new row with even part prev row
+    // j = 1, no carry in, only carry out
+    even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
+    even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
+// for loop carry in carry out
+#pragma unroll
+    for (j = 3; j < TLC; j += 2) {
+      even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
+      even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
+    }
+
+    // add even and odd parts
+    even[1] = ptx::add_cc(even[1], odd[0]);
+#pragma unroll
+    for (i = 1; i < 2 * TLC - 2; i++)
+      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
+    even[i + 1] = ptx::addc(even[i + 1], 0);
+  }
+
+  static DEVICE_INLINE void multiply_lsb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  {
+    // r = a * b is correct for the first TLC + 1 digits (it does not compute digits TLC + 1 through 2*TLC - 2).
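+    // Only the low TLC + 1 limbs matter to the caller: reduce() subtracts this LSB product from a
+    // value it matches in the discarded high limbs, so truncating the upper partial products is
+    // safe. That is why the mad_row calls below shrink each row to TLC - i + 2 limbs.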
+ const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + mul_n(even, a, b[0]); + mul_n(odd, a + 1, b[0]); + mad_row(&even[2], &odd[0], a, b[1]); + size_t i; +#pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i], TLC - i + 2); + mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC - i + 2); + } + + // merge |even| and |odd| + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < TLC + 1; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { + const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + for (int i = 0; i < 2 * TLC - 1; i++) { + even[i] = 0; + odd[i] = 0; + } + uint32_t min_indexes_sum = TLC - 1; + // only diagonal + mul_n_msb(even, a, b[0], TLC, min_indexes_sum); + mul_n_msb(odd, a + 1, b[0], TLC, min_indexes_sum - 1); + mad_row_msb(&even[2], &odd[0], a, b[1], TLC, min_indexes_sum - 1); + size_t i; +#pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i]); + mad_row(&even[i + 2], &odd[i], a, b[i + 1]); + } + // merge |even| and |odd| + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { + const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* r = rs.limbs; + for (unsigned i = 0; i < TLC; i++) { + uint32_t carry = 0; + for (unsigned j = 0; j < TLC; j++) + r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry); + r[TLC + i] = carry; + } + } + + static HOST_DEVICE_INLINE void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { +#ifdef __CUDA_ARCH__ + return multiply_raw_device(as, bs, rs); +#else + return multiply_raw_host(as, bs, rs); +#endif + } + + static HOST_DEVICE_INLINE void multiply_raw_lsb(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { +#ifdef __CUDA_ARCH__ + return multiply_lsb_raw_device(as, bs, rs); +#else + return multiply_raw_host(as, bs, rs); +#endif + } + + static HOST_DEVICE_INLINE void multiply_raw_msb(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { +#ifdef __CUDA_ARCH__ + return multiply_raw_device(as, bs, rs); +#else + return multiply_raw_host(as, bs, rs); +#endif + } + +public: + ff_storage limbs_storage; + + HOST_DEVICE_INLINE uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; } + + HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) + { + const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32; + const uint32_t shift_bits = (digit_num * digit_width) % 32; + unsigned rv = limbs_storage.limbs[limb_lsb_idx] >> shift_bits; + if ((shift_bits + digit_width > 32) && (limb_lsb_idx + 1 < TLC)) { + rv += limbs_storage.limbs[limb_lsb_idx + 1] << (32 - shift_bits); + } + rv &= ((1 << digit_width) - 1); + return rv; + } + + static HOST_INLINE Field rand_host() + { + std::random_device rd; + std::mt19937_64 generator(rd()); + std::uniform_int_distribution distribution; + Field value{}; + for (unsigned i = 0; i < TLC; i++) + value.limbs_storage.limbs[i] = distribution(generator); + while (lt(modulus(), 
value))
+      value = value - modulus();
+    return value;
+  }
+
+  template <unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE Field sub_modulus(const Field& xs)
+  {
+    if (REDUCTION_SIZE == 0) return xs;
+    const ff_storage modulus = get_modulus<REDUCTION_SIZE>();
+    Field rs = {};
+    return sub_limbs(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const Field& xs)
+  {
+    std::stringstream hex_string;
+    hex_string << std::hex << std::setfill('0');
+
+    for (int i = 0; i < TLC; i++) {
+      hex_string << std::setw(8) << xs.limbs_storage.limbs[TLC - 1 - i]; // most-significant limb first
+    }
+
+    os << "0x" << hex_string.str();
+    return os;
+  }
+
+  friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys)
+  {
+    Field rs = {};
+    add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
+    return sub_modulus<1>(rs);
+  }
+
+  friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys)
+  {
+    Field rs = {};
+    uint32_t carry = sub_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
+    if (carry == 0) return rs;
+    const ff_storage modulus = get_modulus<1>();
+    add_limbs(rs.limbs_storage, modulus, rs.limbs_storage);
+    return rs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Wide mul_wide(const Field& xs, const Field& ys)
+  {
+    Wide rs = {};
+    multiply_raw(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
+    return rs;
+  }
+
+  static constexpr DEVICE_INLINE uint32_t
+  sub_limbs_partial_device(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs)
+  {
+    r[0] = ptx::sub_cc(x[0], y[0]);
+#pragma unroll
+    for (unsigned i = 1; i < num_limbs; i++)
+      r[i] = ptx::subc_cc(x[i], y[i]);
+    return ptx::subc(0, 0);
+  }
+
+  static constexpr HOST_DEVICE_INLINE uint32_t
+  sub_limbs_partial(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs)
+  {
+#ifdef __CUDA_ARCH__
+    return sub_limbs_partial_device(x, y, r, num_limbs);
+#else
+    return sub_limbs_partial_host(x, y, r, num_limbs);
+#endif
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs)
+  {
+    Field xs_hi = Wide::get_higher_with_slack(xs); // xy << slack_bits
+    Wide l = {};
+    multiply_raw_msb(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult
+    Field l_hi = Wide::get_higher_with_slack(l);
+    Wide lp = {};
+    multiply_raw_lsb(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); // LSB mult
+    Wide r_wide = xs - lp;
+    Wide r_wide_reduced = {};
+    for (unsigned i = 0; i < TLC + 1; i++) {
+      uint32_t carry = sub_limbs_partial(
+        r_wide.limbs_storage.limbs, modulus_wide().limbs, r_wide_reduced.limbs_storage.limbs, TLC + 1);
+      if (carry == 0) // continue to reduce
+        r_wide = r_wide_reduced;
+      else // done
+        break;
+    }
+
+    // the number of wrap-arounds is bounded by TLC + 1
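+    // This is a Barrett-style reduction: with N = NBITS and m = floor(2^(2N) / p) precomputed in
+    // CONFIG::m, the two truncated products above form an estimate l_hi of floor(xy / p) that can
+    // only undershoot by a small amount, so r_wide = xy - l_hi * p is congruent to xy mod p and
+    // only a few conditional subtractions of p are needed to land in [0, p).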
+    Field r = Wide::get_lower(r_wide);
+    return r;
+  }
+
+  friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
+  {
+    Wide xy = mul_wide(xs, ys); // full mult
+    return reduce(xy);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys)
+  {
+#ifdef __CUDA_ARCH__
+    const uint32_t* x = xs.limbs_storage.limbs;
+    const uint32_t* y = ys.limbs_storage.limbs;
+    uint32_t limbs_or = x[0] ^ y[0];
+#pragma unroll
+    for (unsigned i = 1; i < TLC; i++)
+      limbs_or |= x[i] ^ y[i];
+    return limbs_or == 0;
+#else
+    for (unsigned i = 0; i < TLC; i++)
+      if (xs.limbs_storage.limbs[i] != ys.limbs_storage.limbs[i]) return false;
+    return true;
+#endif
+  }
+
+  friend HOST_DEVICE_INLINE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
+
+  template <const Field& multiplier>
+  static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
+  {
+    Field mul = multiplier;
+    static bool is_u32 = true;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (unsigned i = 1; i < TLC; i++)
+      is_u32 &= (mul.limbs_storage.limbs[i] == 0);
+
+    if (is_u32) return mul_unsigned<multiplier.limbs_storage.limbs[0]>(xs);
+    return mul * xs;
+  }
+
+  template <unsigned multiplier, class T>
+  static constexpr HOST_DEVICE_INLINE T mul_unsigned(const T& xs)
+  {
+    T rs = {};
+    T temp = xs;
+    bool is_zero = true;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (unsigned i = 0; i < 32; i++) {
+      if (multiplier & (1 << i)) {
+        rs = is_zero ? temp : (rs + temp);
+        is_zero = false;
+      }
+      if ((multiplier >> i) <= 1) break; // no higher bits of the multiplier remain
+      temp = temp + temp;
+    }
+    return rs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Wide sqr_wide(const Field& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return mul_wide(xs, xs);
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Field sqr(const Field& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return xs * xs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Field neg(const Field& xs)
+  {
+    const ff_storage modulus = get_modulus<MODULUS_MULTIPLE>();
+    Field rs = {};
+    sub_limbs(modulus, xs.limbs_storage, rs.limbs_storage);
+    return rs;
+  }
+
+  template <unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE Field div2(const Field& xs)
+  {
+    const uint32_t* x = xs.limbs_storage.limbs;
+    Field rs = {};
+    uint32_t* r = rs.limbs_storage.limbs;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (unsigned i = 0; i < TLC - 1; i++) {
+#ifdef __CUDA_ARCH__
+      r[i] = __funnelshift_rc(x[i], x[i + 1], 1);
+#else
+      r[i] = (x[i] >> 1) | (x[i + 1] << 31);
+#endif
+    }
+    r[TLC - 1] = x[TLC - 1] >> 1;
+    return sub_modulus<REDUCTION_SIZE>(rs);
+  }
+
+  static constexpr HOST_DEVICE_INLINE bool lt(const Field& xs, const Field& ys)
+  {
+    ff_storage dummy = {};
+    uint32_t carry = sub_limbs(xs.limbs_storage, ys.limbs_storage, dummy);
+    return carry;
+  }
+
+  static constexpr HOST_DEVICE_INLINE bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }
+
+  static constexpr HOST_DEVICE_INLINE bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }
+
+  // inverse assumes that xs is nonzero
+  static constexpr HOST_DEVICE_INLINE Field inverse(const Field& xs)
+  {
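+    // Binary extended Euclidean algorithm: the loop keeps the invariants
+    //   b * xs ≡ u (mod p)  and  c * xs ≡ v (mod p).
+    // Halving an even u (or v) requires halving b (or c) mod p, adding p first when it is odd;
+    // once u or v reaches one, the matching b or c holds xs^-1.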
+    constexpr Field one = Field{CONFIG::one};
+    constexpr ff_storage modulus = CONFIG::modulus;
+    Field u = xs;
+    Field v = Field{modulus};
+    Field b = one;
+    Field c = {};
+    while (!(u == one) && !(v == one)) {
+      while (is_even(u)) {
+        u = div2(u);
+        if (is_odd(b)) add_limbs(b.limbs_storage, modulus, b.limbs_storage);
+        b = div2(b);
+      }
+      while (is_even(v)) {
+        v = div2(v);
+        if (is_odd(c)) add_limbs(c.limbs_storage, modulus, c.limbs_storage);
+        c = div2(c);
+      }
+      if (lt(v, u)) {
+        u = u - v;
+        b = b - c;
+      } else {
+        v = v - u;
+        c = c - b;
+      }
+    }
+    return (u == one) ? b : c;
+  }
+};
diff --git a/src/aztec/gpu/headers/msm.h b/src/aztec/gpu/headers/msm.h
new file mode 100644
index 00000000..88c4bf34
--- /dev/null
+++ b/src/aztec/gpu/headers/msm.h
@@ -0,0 +1,94 @@
+
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <cuda_runtime.h> // for cudaStream_t
+#include <stdbool.h>
+#include <stddef.h> // for size_t
+// msm.h
+
+#ifndef _BN254_MSM_H
+#define _BN254_MSM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Incomplete declaration of BN254 projective and affine structs
+typedef struct BN254_projective_t BN254_projective_t;
+typedef struct BN254_g2_projective_t BN254_g2_projective_t;
+typedef struct BN254_affine_t BN254_affine_t;
+typedef struct BN254_g2_affine_t BN254_g2_affine_t;
+typedef struct BN254_scalar_t BN254_scalar_t;
+typedef cudaStream_t CudaStream_t;
+
+int msm_cuda_bn254(
+  BN254_projective_t* out, BN254_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
+
+int msm_batch_cuda_bn254(
+  BN254_projective_t* out,
+  BN254_affine_t* points,
+  BN254_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+
+int commit_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+
+int commit_batch_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id);
+
+int msm_g2_cuda_bn254(
+  BN254_g2_projective_t* out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
+int msm_batch_g2_cuda_bn254(
+  BN254_g2_projective_t* out,
+  BN254_g2_affine_t* points,
+  BN254_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+int commit_g2_cuda_bn254(
+  BN254_g2_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+int commit_batch_g2_cuda_bn254(
+  BN254_g2_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id,
+  cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BN254_MSM_H */
diff --git a/src/aztec/gpu/headers/ntt.h b/src/aztec/gpu/headers/ntt.h
new file mode 100644
index 00000000..4ca148e3
--- /dev/null
+++ b/src/aztec/gpu/headers/ntt.h
@@ -0,0 +1,193 @@
+
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ntt.h + +#ifndef _BN254_NTT_H +#define _BN254_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BN254 projective and affine structs +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_affine_t BN254_affine_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +typedef struct BN254_g2_projective_t BN254_g2_projective_t; +typedef struct BN254_g2_affine_t BN254_g2_affine_t; + +int ntt_cuda_bn254(BN254_scalar_t* arr, uint32_t n, bool inverse, size_t device_id); +int ntt_batch_cuda_bn254(BN254_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_bn254(BN254_projective_t* arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_bn254( + BN254_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +BN254_scalar_t* +build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream); +int interpolate_scalars_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + unsigned device_id, + size_t stream); +int interpolate_scalars_batch_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int interpolate_points_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + size_t device_id, + size_t stream); +int interpolate_points_batch_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int interpolate_scalars_on_coset_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int interpolate_scalars_batch_on_coset_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + unsigned batch_size, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int evaluate_scalars_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned device_id, + size_t stream); +int evaluate_scalars_batch_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int evaluate_points_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + size_t device_id, + size_t stream); +int evaluate_points_batch_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int evaluate_scalars_on_coset_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + BN254_scalar_t* coset_powers, + unsigned device_id, + size_t stream); +int evaluate_scalars_on_coset_batch_cuda_bn254( 
+ BN254_scalar_t* d_out, + BN254_scalar_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int evaluate_points_on_coset_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int evaluate_points_on_coset_batch_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int reverse_order_scalars_cuda_bn254(BN254_scalar_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_scalars_batch_cuda_bn254(BN254_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int reverse_order_points_cuda_bn254(BN254_projective_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_points_batch_cuda_bn254( + BN254_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int add_scalars_cuda_bn254( + BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream); +int sub_scalars_cuda_bn254( + BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream); +int to_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream); +int from_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream); + +// points g1 +int to_montgomery_proj_points_cuda_bn254(BN254_projective_t* d_inout, unsigned n, size_t stream); +int from_montgomery_proj_points_cuda_bn254(BN254_projective_t* d_inout, unsigned n, size_t stream); +int to_montgomery_aff_points_cuda_bn254(BN254_affine_t* d_inout, unsigned n, size_t stream); +int from_montgomery_aff_points_cuda_bn254(BN254_affine_t* d_inout, unsigned n, size_t stream); + +// points g2 +int to_montgomery_proj_points_g2_cuda_bn254(BN254_g2_projective_t* d_inout, unsigned n, size_t stream); +int from_montgomery_proj_points_g2_cuda_bn254(BN254_g2_projective_t* d_inout, unsigned n, size_t stream); +int to_montgomery_aff_points_g2_cuda_bn254(BN254_g2_affine_t* d_inout, unsigned n, size_t stream); +int from_montgomery_aff_points_g2_cuda_bn254(BN254_g2_affine_t* d_inout, unsigned n, size_t stream); + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_NTT_H */ diff --git a/src/aztec/gpu/headers/params.cuh b/src/aztec/gpu/headers/params.cuh new file mode 100644 index 00000000..cf7f3e5f --- /dev/null +++ b/src/aztec/gpu/headers/params.cuh @@ -0,0 +1,188 @@ +#pragma once +#include "../utils/storage.cuh" + +namespace PARAMS_BN254 { + struct fp_config { + static constexpr unsigned limbs_count = 8; + static constexpr unsigned omegas_count = 28; + static constexpr unsigned modulus_bit_count = 254; + + static constexpr storage modulus = {0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, + 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; + static constexpr storage modulus_2 = {0xe0000002, 0x87c3eb27, 0xf372e122, 0x5067d090, + 0x0302b0ba, 0x70a08b6d, 0xc2634053, 0x60c89ce5}; + static constexpr storage modulus_4 = {0xc0000004, 0x0f87d64f, 0xe6e5c245, 0xa0cfa121, + 0x06056174, 0xe14116da, 0x84c680a6, 0xc19139cb}; + static constexpr storage<2 * limbs_count> modulus_wide = { + 0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72, + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage<2 * limbs_count> modulus_squared = { + 0xe0000001, 0x08c3eb27, 0xdcb34000, 0xc7f26223, 0x68c9bb7f, 0xffe9a62c, 0xe821ddb0, 0xa6ce1975, + 0x47b62fe7, 0x2c77527b, 0xd379d3df, 0x85f73bb0, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8}; + static constexpr storage<2 * limbs_count> modulus_squared_2 = { + 0xc0000002, 0x1187d64f, 0xb9668000, 0x8fe4c447, 0xd19376ff, 0xffd34c58, 0xd043bb61, 0x4d9c32eb, + 0x8f6c5fcf, 0x58eea4f6, 0xa6f3a7be, 0x0bee7761, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970}; + static constexpr storage<2 * limbs_count> modulus_squared_4 = { + 0x80000004, 0x230fac9f, 0x72cd0000, 0x1fc9888f, 0xa326edff, 0xffa698b1, 0xa08776c3, 0x9b3865d7, + 0x1ed8bf9e, 0xb1dd49ed, 0x4de74f7c, 0x17dceec3, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1}; + + static constexpr storage m = {0xbe1de925, 0x620703a6, 0x09e880ae, 0x71448520, + 0x68073014, 0xab074a58, 0x623a04a7, 0x54a47462}; + static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage montgomery_r = {0x4ffffffb, 0xac96341c, 0x9f60cd29, 0x36fc7695, + 0x7879462e, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; + static constexpr storage montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x90ef5a9, + 0xaeb85d5d, 0xc8260de4, 0x82c5551c, 0x15ebf951}; + + static constexpr storage_array omega = { + {{0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, + {0x8f703636, 0x23120470, 0xfd736bec, 0x5cea24f6, 0x3fd84104, 0x048b6e19, 0xe131a029, 0x30644e72}, + {0xc1bd5e80, 0x948dad4a, 0xf8170a0a, 0x52627366, 0x96afef36, 0xec9b9e2f, 0xc8c14f22, 0x2b337de1}, + {0xe306460b, 0xb11509c6, 0x174efb98, 0x996dfbe1, 0x94dd508c, 0x1c6e4f45, 0x16cbbf4e, 0x21082ca2}, + {0x3bb512d0, 0x3eed4c53, 0x838eeb1d, 0x9c18d51b, 0x47c0b2a9, 0x9678200d, 0x306b93d2, 0x09c532c6}, + {0x118f023a, 0xdb94fb05, 0x26e324be, 0x46a6cb24, 0x49bdadf2, 0xc24cdb76, 0x5b080fca, 0x1418144d}, + {0xba9d1811, 0x9d0e470c, 0xb6f24c79, 0x1dcb5564, 0xe85943e0, 0xdf5ce19c, 0xad310991, 0x16e73dfd}, + {0x74a57a76, 0xc8936191, 0x6750f230, 0x61794254, 0x9f36ffb0, 0xf086204a, 0xa6148404, 0x07b0c561}, + {0x470157ce, 0x893a7fa1, 0xfc782d75, 0xe8302a41, 0xdd9b0675, 0xffc02c0e, 0xf6e72f5b, 0x0f1ded1e}, + {0xbc2e5912, 0x11f995e1, 0xa8d2d7ab, 0x39ba79c0, 0xb08771e3, 0xebbebc2b, 0x7017a420, 0x06fd19c1}, + {0x769a2ee2, 0xd00a58f9, 0x7494f0ca, 0xb8c12c17, 0xa5355d71, 0xb4027fd7, 0x99c5042b, 0x027a3584}, + {0x0042d43a, 0x1c477572, 0x6f039bb9, 0x76f169c7, 0xfd5a90a9, 0x01ddd073, 0xde2fd10f, 0x0931d596}, + {0x9bbdd310, 0x4aa49b8d, 0x8e3a2d76, 0xd31bf3e2, 0x78b2667b, 0x001deac8, 0xb869ae62, 0x006fab49}, + {0x617c6e85, 0xadaa01c2, 0x7420aae6, 0xb4a93ee1, 0x0ddca8a8, 0x1f4e51b8, 0xcdd9e481, 0x2d965651}, + {0x4e26ecfb, 0xa93458fd, 0x4115a009, 0x022a2a2d, 0x69ec2bd0, 0x017171fa, 0x5941dc91, 0x2d1ba66f}, + {0xdaac43b7, 0xd1628ba2, 0xe4347e7d, 0x16c8601d, 0xe081dcff, 0x649abebd, 0x5981ed45, 0x00eeb2cb}, + {0xce8f58e5, 0x276e5858, 0x5655210e, 0x0512eca9, 0xe70e61f3, 0xc3708cc6, 0xa7d74902, 0x1bf82deb}, + {0x7dcdc0e0, 0x84c6bfa5, 0x13f4d1bd, 0xc57088ff, 0xb5b95e4d, 0x5c0176fb, 0x3a8d46c1, 0x19ddbcaf}, + {0x613f6cbd, 0x5c1d597f, 0x8357473a, 0x30525841, 0x968e4915, 0x51829353, 0x844bca52, 0x2260e724}, + {0x53337857, 0x53422da9, 0xdbed349f, 
0xac616632, 0x06d1e303, 0x27508aba, 0x0a0ed063, 0x26125da1}, + {0xfcd0b523, 0xb2c87885, 0xca5a5ce3, 0x58f50577, 0x8598fc8c, 0x4222150e, 0xae2bdd1a, 0x1ded8980}, + {0xa219447e, 0xa76dde56, 0x359eebbb, 0xec1a1f05, 0x8be08215, 0xcda0ceb6, 0xb1f8d9a7, 0x1ad92f46}, + {0xab80c59d, 0xb54d4506, 0x22dd991f, 0x5680c640, 0xbc23a139, 0x6b7bcf70, 0x5ab4c74d, 0x0210fe63}, + {0xe32b045b, 0x1c25f1e3, 0x2e832696, 0x145e0db8, 0x71c6441f, 0x852e2a03, 0x845d50d2, 0x0c9fabc7}, + {0xb878331a, 0xeccd4f3e, 0x8dc6d26e, 0x7b26b748, 0xd9130cd4, 0xa19b0361, 0x326341ef, 0x2a734ebb}, + {0x2f4e9212, 0x1c79bd57, 0x3d68f9ae, 0x605b52b6, 0xb8d89d4a, 0x0113eff9, 0xf1ff73b2, 0x1067569a}, + {0x80928c44, 0x034afc45, 0xf6437da2, 0xb4823532, 0x6dc6e364, 0x5f256a9f, 0xb363ebe8, 0x049ae702}, + {0x725b19f0, 0x9bd61b6e, 0x41112ed4, 0x402d111e, 0x8ef62abc, 0x00e0a7eb, 0xa58a7e85, 0x2a3c09f0}}}; + + static constexpr storage_array omega_inv = { + {{0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, + {0x608fc9cb, 0x20cff123, 0x7c4604a5, 0xcb49c351, 0x41a91758, 0xb3c4d79d, 0x00000000, 0x00000000}, + {0x07b95a9b, 0x8b11d9ab, 0x41671f56, 0x20710ead, 0x30f81dee, 0xfb3acaee, 0x9778465c, 0x130b1711}, + {0x373428de, 0xb85a71e6, 0xaeb0337e, 0x74954d30, 0x303402b7, 0x2bfc85eb, 0x409556c0, 0x02e40daf}, + {0xf210979d, 0x8c99980c, 0x34905b4d, 0xef8f3113, 0xdf25d8e7, 0x0aeaf3e7, 0x03bfbd79, 0x27247136}, + {0x763d698f, 0x78ce6a0b, 0x1d3213ee, 0xd80396ec, 0x67a8a676, 0x035cdc75, 0xb2a13d3a, 0x26177cf2}, + {0xc64427d7, 0xdddf985f, 0xa49e95bd, 0xaa4f964a, 0x5def8b04, 0x427c045f, 0x7969b732, 0x1641c053}, + {0x0329f5d6, 0x692c553d, 0x8712848a, 0xa54cf8c6, 0x38e2b5e6, 0x64751ad9, 0x7422fad3, 0x204bd327}, + {0xaf6b3e4e, 0x52f26c0f, 0xf0bcc0c8, 0x4c277a07, 0xe4fcfcab, 0x546875d5, 0xaa9995b3, 0x09d8f821}, + {0xb2e5cc71, 0xcaa2e1e9, 0x6e43404e, 0xed42b68e, 0x7a2c7f0a, 0x6ed80915, 0xde3c86d6, 0x1c4042c7}, + {0x579d71ae, 0x20a3a65d, 0x0adc4420, 0xfd7efed8, 0xfddabf54, 0x3bb6dcd7, 0xbc73d07b, 0x0fa9bb21}, + {0xc79e0e57, 0xb6f70f8d, 0xa04e05ac, 0x269d3fde, 0x2ba088d9, 0xcf2e371c, 0x11b88d9c, 0x1af864d2}, + {0xabd95dc9, 0x3b0b205a, 0x978188ca, 0xc8df74fa, 0x6a1cb6c8, 0x08e124db, 0xbfac6104, 0x1670ed58}, + {0x641c8410, 0xf8eee934, 0x677771c0, 0xf40976b0, 0x558e6e8c, 0x11680d42, 0x06e7e9e9, 0x281c036f}, + {0xb2dbc0b4, 0xc92a742f, 0x4d384e68, 0xc3f02842, 0x2fa43d0d, 0x22701b6f, 0xe4590b37, 0x05d33766}, + {0x02d842d4, 0x922d5ac8, 0xc830e4c6, 0x91126414, 0x082f37e0, 0xe92338c0, 0x7fe704e8, 0x0b5d56b7}, + {0xd96f0d22, 0x20e75251, 0x6bd4e8c9, 0xc01c7f08, 0xf9dd50c4, 0x37d8b00b, 0xc43ca872, 0x244cf010}, + {0x66c5174c, 0x7a823174, 0x22d5ad70, 0x7dbe118c, 0x111119c5, 0xf8d7c71d, 0x83780e87, 0x036853f0}, + {0xca535321, 0xd98f9924, 0xe66e6c81, 0x22dbc0ef, 0x664ae1b7, 0xa15cf806, 0xa314fb67, 0x06e402c0}, + {0xe26c91f3, 0x0852a8fd, 0x3baca626, 0x521f45cb, 0x2c51bfca, 0xab6473bc, 0x2100895f, 0x100c332d}, + {0xa376d0f0, 0xf5fac783, 0x940797d3, 0x50fd246e, 0x145f5278, 0xab14ecc1, 0x41091b14, 0x19c6dfb8}, + {0x7faa1396, 0x43dc52e2, 0x4beced23, 0xd437be9d, 0x6d3c38c3, 0xecc11e9c, 0x0c74a876, 0x2eb58439}, + {0xd69ca83b, 0x811b03e7, 0xa1a6eadf, 0x126a786b, 0x4e2b8e61, 0x1dd75c9f, 0xbda6792b, 0x2165a1a5}, + {0x110b737b, 0x02e1d4d1, 0xb323a164, 0x7be1488d, 0x9cd06163, 0xa334d317, 0xdb50e9cd, 0x2710c370}, + {0x9550fe47, 0x45d2f3cb, 0xf6a8efc4, 0x5f43327b, 0xe993ee18, 0x5bcd0d50, 0xb21de952, 0x27f035bd}, + {0x232e3983, 0x1d63cbae, 0xaa1b58e2, 0xac815161, 0x6aeb019e, 0x531f42a5, 0x03ca2ef5, 0x2dcd51d9}, + {0x980db869, 0xa8b64ba8, 0xc9718f6c, 
0x4c787f72, 0x15d27ced, 0x7746a25a, 0x435a46e9, 0x110bf78f}, + {0x9d18157e, 0x72394277, 0xfd399d5d, 0xec9d51f8, 0x49d5387f, 0x6117635d, 0x9c229cd5, 0x01b77519}}}; + + static constexpr storage_array inv = { + {{0xf8000001, 0xa1f0fac9, 0x3cdcb848, 0x9419f424, 0x40c0ac2e, 0xdc2822db, 0x7098d014, 0x18322739}, + {0xf4000001, 0xf2e9782e, 0x5b4b146c, 0xde26ee36, 0xe1210245, 0x4a3c3448, 0x28e5381f, 0x244b3ad6}, + {0x72000001, 0x1b65b6e1, 0x6a82427f, 0x832d6b3f, 0xb1512d51, 0x81463cff, 0x850b6c24, 0x2a57c4a4}, + {0xb1000001, 0x2fa3d63a, 0xf21dd988, 0x55b0a9c3, 0x196942d7, 0x1ccb415b, 0xb31e8627, 0x2d5e098b}, + {0x50800001, 0xb9c2e5e7, 0x35eba50c, 0x3ef24906, 0xcd754d9a, 0x6a8dc388, 0x4a281328, 0x2ee12bff}, + {0xa0400001, 0xfed26dbd, 0x57d28ace, 0xb39318a7, 0xa77b52fb, 0x116f049f, 0x15acd9a9, 0x2fa2bd39}, + {0xc8200001, 0x215a31a8, 0xe8c5fdb0, 0x6de38077, 0x147e55ac, 0x64dfa52b, 0xfb6f3ce9, 0x300385d5}, + {0x5c100001, 0xb29e139e, 0x313fb720, 0xcb0bb460, 0xcaffd704, 0x8e97f570, 0x6e506e89, 0x3033ea24}, + {0x26080001, 0xfb400499, 0x557c93d8, 0xf99fce54, 0xa64097b0, 0xa3741d93, 0xa7c10759, 0x304c1c4b}, + {0x8b040001, 0x1f90fd16, 0x679b0235, 0x10e9db4e, 0x13e0f807, 0xade231a5, 0x447953c1, 0x3058355f}, + {0x3d820001, 0x31b97955, 0x70aa3963, 0x1c8ee1cb, 0xcab12832, 0xb3193bad, 0x12d579f5, 0x305e41e9}, + {0x96c10001, 0x3acdb774, 0xf531d4fa, 0xa2616509, 0x26194047, 0xb5b4c0b2, 0xfa038d0f, 0x3061482d}, + {0x43608001, 0xbf57d684, 0x3775a2c5, 0x654aa6a9, 0x53cd4c52, 0xb7028334, 0x6d9a969c, 0x3062cb50}, + {0x19b04001, 0x819ce60c, 0xd89789ab, 0xc6bf4778, 0x6aa75257, 0x37a96475, 0xa7661b63, 0x30638ce1}, + {0x04d82001, 0x62bf6dd0, 0xa9287d1e, 0x777997e0, 0xf614555a, 0x77fcd515, 0x444bddc6, 0x3063edaa}, + {0xfa6c1001, 0xd350b1b1, 0x9170f6d7, 0xcfd6c014, 0x3bcad6db, 0x18268d66, 0x92bebef8, 0x30641e0e}, + {0xf5360801, 0x8b9953a2, 0x859533b4, 0x7c05542e, 0x5ea6179c, 0xe83b698e, 0xb9f82f90, 0x30643640}, + {0x729b0401, 0xe7bda49b, 0x7fa75222, 0xd21c9e3b, 0x7013b7fc, 0x5045d7a2, 0xcd94e7dd, 0x30644259}, + {0xb14d8201, 0x15cfcd17, 0xfcb0615a, 0xfd284341, 0x78ca882c, 0x844b0eac, 0x57634403, 0x30644866}, + {0xd0a6c101, 0xacd8e155, 0x3b34e8f5, 0x12ae15c5, 0x7d25f045, 0x9e4daa31, 0x9c4a7216, 0x30644b6c}, + {0xe0536081, 0x785d6b74, 0xda772cc3, 0x1d70ff06, 0xff53a451, 0x2b4ef7f3, 0xbebe0920, 0x30644cef}, + {0x6829b041, 0x5e1fb084, 0xaa184eaa, 0x22d273a7, 0x406a7e57, 0xf1cf9ed5, 0x4ff7d4a4, 0x30644db1}, + {0x2c14d821, 0xd100d30c, 0x11e8df9d, 0x25832df8, 0xe0f5eb5a, 0x550ff245, 0x1894ba67, 0x30644e12}, + {0x0e0a6c11, 0x8a716450, 0x45d12817, 0xa6db8b20, 0x313ba1db, 0x86b01bfe, 0x7ce32d48, 0x30644e42}, + {0xff053609, 0x6729acf1, 0x5fc54c54, 0x6787b9b4, 0x595e7d1c, 0x1f8030da, 0xaf0a66b9, 0x30644e5a}, + {0xf7829b05, 0xd585d142, 0x6cbf5e72, 0xc7ddd0fe, 0x6d6feabc, 0x6be83b48, 0xc81e0371, 0x30644e66}, + {0x73c14d83, 0x0cb3e36b, 0x733c6782, 0xf808dca3, 0x7778a18c, 0x921c407f, 0xd4a7d1cd, 0x30644e6c}, + {0xb1e0a6c2, 0xa84aec7f, 0xf67aec09, 0x101e6275, 0xfc7cfcf5, 0xa536431a, 0xdaecb8fb, 0x30644e6f}}}; + }; + + struct fq_config { + static constexpr unsigned limbs_count = 8; + static constexpr unsigned modulus_bit_count = 254; + static constexpr storage modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, + 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; + static constexpr storage modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522, + 0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5}; + static constexpr storage modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45, + 0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb}; + 
static constexpr storage<2 * limbs_count> modulus_wide = { + 0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage<2 * limbs_count> modulus_squared = { + 0x275d69b1, 0x3b5458a2, 0x09eac101, 0xa602072d, 0x6d96cadc, 0x4a50189c, 0x7a1242c8, 0x04689e95, + 0x34c6b38d, 0x26edfa5c, 0x16375606, 0xb00b8551, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8}; + static constexpr storage<2 * limbs_count> modulus_squared_2 = { + 0x4ebad362, 0x76a8b144, 0x13d58202, 0x4c040e5a, 0xdb2d95b9, 0x94a03138, 0xf4248590, 0x08d13d2a, + 0x698d671a, 0x4ddbf4b8, 0x2c6eac0c, 0x60170aa2, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970}; + static constexpr storage<2 * limbs_count> modulus_squared_4 = { + 0x9d75a6c4, 0xed516288, 0x27ab0404, 0x98081cb4, 0xb65b2b72, 0x29406271, 0xe8490b21, 0x11a27a55, + 0xd31ace34, 0x9bb7e970, 0x58dd5818, 0xc02e1544, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1}; + static constexpr storage m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17, + 0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462}; + static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0xa78eb28, + 0x7879462c, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; + static constexpr storage montgomery_r_inv = {0x14afa37, 0xed84884a, 0x278edf8, 0xeb202285, + 0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571}; + + // i^2, the square of the imaginary unit for the extension field + static constexpr uint32_t i_squared = 1; + // true if i^2 is negative + static constexpr bool i_squared_is_negative = true; + // G1 and G2 generators + static constexpr storage g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4, + 0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef}; + static constexpr storage g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933, + 0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393}; + static constexpr storage g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769, + 0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5}; + static constexpr storage g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133, + 0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0}; + }; + + static constexpr storage weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage weierstrass_b_g2_re = { + 0x24a138e5, 0x3267e6dc, 0x59dbefa3, 0xb5b4c5e5, 0x1be06ac3, 0x81be1899, 0xceb8aaae, 0x2b149d40}; + static constexpr storage weierstrass_b_g2_im = { + 0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 0xcd2cafad, 0x3af0fed4, 0x009713b0}; +} // namespace PARAMS_BN254 diff --git a/src/aztec/gpu/headers/projective.cuh b/src/aztec/gpu/headers/projective.cuh new file mode 100644 index 00000000..41cfb77a --- /dev/null +++ b/src/aztec/gpu/headers/projective.cuh @@ -0,0 +1,166 @@ +#pragma once + +#include "affine.cuh" + +template +class Projective +{ + 
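// Homogeneous projective coordinates: (X, Y, Z) represents the affine point +
// (X/Z, Y/Z), with (0, 1, 0) as the identity (see zero() below). The addition +
// operators appear to follow the complete, branch-free formulas of Renes, +
// Costello and Batina for a = 0 short Weierstrass curves, so they are valid +
// for all inputs, including doublings and the identity. +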
friend Affine; + +public: + FF x; + FF y; + FF z; + + static HOST_DEVICE_INLINE Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; } + + static HOST_DEVICE_INLINE Affine to_affine(const Projective& point) + { + FF denom = FF::inverse(point.z); + return {point.x * denom, point.y * denom}; + } + + static HOST_DEVICE_INLINE Projective from_affine(const Affine& point) { return {point.x, point.y, FF::one()}; } + + static HOST_DEVICE_INLINE Projective generator() { return {FF::generator_x(), FF::generator_y(), FF::one()}; } + + static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; } + + friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2) + { + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF Z2 = p2.z; // < 2 + const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 + const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 + const FF t02 = Z1 * Z2; // t02 ← Z1 · Z2 < 2 + const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 + const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 + const FF t08 = Y1 + Z1; // t08 ← Y1 + Z1 < 4 + const FF t09 = Y2 + Z2; // t09 ← Y2 + Z2 < 4 + const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 + const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 + const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 + const FF t14 = X2 + Z2; // t14 ← X2 + Z2 < 4 + const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 + const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 + const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 + const FF t20 = FF::template mul_unsigned<3>(FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 + const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 + const FF t23 = FF::template mul_unsigned<3>(FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 ← t12 · t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 ← t07 · t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 ← t25 − t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 ← t23 · t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 ← t22 · t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 ← t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 ← t19 · t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 ← t21 · t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 ← t31 + t30 < 2 + return {X3, Y3, Z3}; + } + + friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); } + + friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine& p2) + { + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 + const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 + const FF t02 = Z1; // t02 ← Z1 < 2 + const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 + const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 + const FF t08 = Y1 
+ Z1; // t08 ← Y1 + Z1 < 4 + const FF t09 = Y2 + FF::one(); // t09 ← Y2 + 1 < 4 + const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 + const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 + const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 + const FF t14 = X2 + FF::one(); // t14 ← X2 + 1 < 4 + const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 + const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 + const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 + const FF t20 = FF::template mul_unsigned<3>(FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 + const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 + const FF t23 = FF::template mul_unsigned<3>(FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 ← t12 · t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 ← t07 · t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 ← t25 − t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 ← t23 · t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 ← t22 · t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 ← t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 ← t19 · t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 ← t21 · t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 ← t31 + t30 < 2 + return {X3, Y3, Z3}; + } + + friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine& p2) + { + return p1 + Affine::neg(p2); + } + + friend HOST_DEVICE_INLINE Projective operator*(SCALAR_FF scalar, const Projective& point) + { + Projective res = zero(); +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (int i = 0; i < SCALAR_FF::NBITS; i++) { + if (i > 0) { res = res + res; } + if (scalar.get_scalar_digit(SCALAR_FF::NBITS - i - 1, 1)) { res = res + point; } + } + return res; + } + + friend HOST_DEVICE_INLINE bool operator==(const Projective& p1, const Projective& p2) + { + return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z); + } + + friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point) + { + os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }"; + return os; + } + + static HOST_DEVICE_INLINE bool is_zero(const Projective& point) + { + return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero(); + } + + static HOST_DEVICE_INLINE bool is_on_curve(const Projective& point) + { + if (is_zero(point)) return true; + bool eq_holds = + (FF::template mul_const(FF::sqr(point.z) * point.z) + FF::sqr(point.x) * point.x == + point.z * FF::sqr(point.y)); + return point.z != FF::zero() && eq_holds; + } + + static HOST_INLINE Projective rand_host() + { + SCALAR_FF rand_scalar = SCALAR_FF::rand_host(); + return rand_scalar * generator(); + } +}; diff --git a/src/aztec/gpu/headers/projective.h b/src/aztec/gpu/headers/projective.h new file mode 100644 index 00000000..0a5ec74f --- /dev/null +++ b/src/aztec/gpu/headers/projective.h @@ -0,0 +1,50 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// projective.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BN254_projective_t BN254_projective_t; +// typedef struct BN254_g2_projective_t BN254_g2_projective_t; +typedef struct BN254_affine_t BN254_affine_t; +typedef struct BN254_g2_affine_t BN254_g2_affine_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +bool projective_is_on_curve_bn254(BN254_projective_t* point1); + +int random_scalar_bn254(BN254_scalar_t* out); +int random_projective_bn254(BN254_projective_t* out); +BN254_projective_t* projective_zero_bn254(); +int projective_to_affine_bn254(BN254_affine_t* out, BN254_projective_t* point1); +int projective_from_affine_bn254(BN254_projective_t* out, BN254_affine_t* point1); + +int random_g2_projective_bn254(BN254_g2_projective_t* out); +int g2_projective_to_affine_bn254(BN254_g2_affine_t* out, BN254_g2_projective_t* point1); +int g2_projective_from_affine_bn254(BN254_g2_projective_t* out, BN254_g2_affine_t* point1); +bool g2_projective_is_on_curve_bn254(BN254_g2_projective_t* point1); + +bool eq_bn254(BN254_projective_t* point1, BN254_projective_t* point2); +bool eq_g2_bn254(BN254_g2_projective_t* point1, BN254_g2_projective_t* point2); + +#ifdef __cplusplus +} +#endif diff --git a/src/aztec/gpu/headers/ve_mod_mult.cuh b/src/aztec/gpu/headers/ve_mod_mult.cuh new file mode 100644 index 00000000..36859c49 --- /dev/null +++ b/src/aztec/gpu/headers/ve_mod_mult.cuh @@ -0,0 +1,135 @@ +#ifndef VEC_MULT +#define VEC_MULT +#pragma once +#include +#include + +#define MAX_THREADS_PER_BLOCK 256 + +/** + * Multiply the elements of an input array by a scalar in-place. + * @param arr input array. + * @param n size of arr. + * @param n_inv scalar of type S (scalar). 
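+ * @note The kernel parameter is actually named "scalar"; "n_inv" presumably
+ * refers to inverse-NTT normalization by n^{-1}. mont.cuh also routes the
+ * Montgomery conversion constants through this same kernel.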
+ */
+template <typename E, typename S>
+__global__ void template_normalize_kernel(E* arr, uint32_t n, S scalar)
+{
+  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (tid < n) { arr[tid] = scalar * arr[tid]; }
+}
+
+// TODO: headers for prototypes and .c .cpp .cu files for implementations
+template <typename E, typename S>
+__global__ void vectorModMult(S* scalar_vec, E* element_vec, E* result, size_t n_elements)
+{
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < n_elements) { result[tid] = scalar_vec[tid] * element_vec[tid]; }
+}
+
+template <typename E, typename S>
+int vector_mod_mult(S* vec_a, E* vec_b, E* result, size_t n_elements, cudaStream_t stream) // TODO: in place so no need for third result vector
+{
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)n_elements / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;
+
+  // Allocate memory on the device for the input vectors and the output vector
+  S* d_vec_a;
+  E *d_vec_b, *d_result;
+  cudaMallocAsync(&d_vec_a, n_elements * sizeof(S), stream);
+  cudaMallocAsync(&d_vec_b, n_elements * sizeof(E), stream);
+  cudaMallocAsync(&d_result, n_elements * sizeof(E), stream);
+
+  // Copy the input vectors from the host to the device
+  cudaMemcpyAsync(d_vec_a, vec_a, n_elements * sizeof(S), cudaMemcpyHostToDevice, stream);
+  cudaMemcpyAsync(d_vec_b, vec_b, n_elements * sizeof(E), cudaMemcpyHostToDevice, stream);
+
+  // Call the kernel to perform element-wise modular multiplication
+  vectorModMult<<<num_blocks, threads_per_block, 0, stream>>>(d_vec_a, d_vec_b, d_result, n_elements);
+
+  cudaMemcpyAsync(result, d_result, n_elements * sizeof(E), cudaMemcpyDeviceToHost, stream);
+
+  cudaFreeAsync(d_vec_a, stream);
+  cudaFreeAsync(d_vec_b, stream);
+  cudaFreeAsync(d_result, stream);
+
+  cudaStreamSynchronize(stream);
+  return 0;
+}
+
+template <typename E, typename S>
+int vector_mod_mult_device(
+  S* d_vec_a, E* d_vec_b, E* d_result, size_t n_elements) // TODO: in place so no need for third result vector
+{
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)n_elements / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;
+
+  // Call the kernel to perform element-wise modular multiplication
+  vectorModMult<<<num_blocks, threads_per_block>>>(d_vec_a, d_vec_b, d_result, n_elements);
+  return 0;
+}
+
+template <typename E, typename S>
+__global__ void batchVectorMult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size)
+{
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < n_scalars * batch_size) {
+    int scalar_id = tid % n_scalars;
+    element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];
+  }
+}
+
+template <typename E, typename S>
+int batch_vector_mult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size, cudaStream_t stream)
+{
+  // Set the grid and block dimensions
+  int NUM_THREADS = MAX_THREADS_PER_BLOCK;
+  int NUM_BLOCKS = (n_scalars * batch_size + NUM_THREADS - 1) / NUM_THREADS;
+  batchVectorMult<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(scalar_vec, element_vec, n_scalars, batch_size);
+  return 0;
+}
+
+template <typename E>
+__global__ void matrixVectorMult(E* matrix_elements, E* vector_elements, E* result, size_t dim)
+{
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < dim) {
+    result[tid] = E::zero();
+    for (int i = 0; i < dim; i++)
+      result[tid] = result[tid] + matrix_elements[tid * dim + i] * vector_elements[i];
+  }
+}
+
+template <typename E>
+int matrix_mod_mult(E* matrix_elements, E* vector_elements, E* result, size_t dim, cudaStream_t stream)
+{
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;
+
+  // Allocate
memory on the device for the input vectors, the output vector, and the modulus + E *d_matrix, *d_vector, *d_result; + cudaMallocAsync(&d_matrix, (dim * dim) * sizeof(E), stream); + cudaMallocAsync(&d_vector, dim * sizeof(E), stream); + cudaMallocAsync(&d_result, dim * sizeof(E), stream); + + // Copy the input vectors and the modulus from the host to the device + cudaMemcpyAsync(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice, stream); + + // Call the kernel to perform element-wise modular multiplication + matrixVectorMult<<>>(d_matrix, d_vector, d_result, dim); + + cudaMemcpyAsync(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost, stream); + + cudaFreeAsync(d_matrix, stream); + cudaFreeAsync(d_vector, stream); + cudaFreeAsync(d_result, stream); + + cudaStreamSynchronize(stream); + return 0; +} +#endif \ No newline at end of file diff --git a/src/aztec/gpu/headers/ve_mod_mult.h b/src/aztec/gpu/headers/ve_mod_mult.h new file mode 100644 index 00000000..fcd5753c --- /dev/null +++ b/src/aztec/gpu/headers/ve_mod_mult.h @@ -0,0 +1,45 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ve_mod_mult.h + +#ifndef _BN254_VEC_MULT_H +#define _BN254_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +int32_t +vec_mod_mult_point_bn254(BN254_projective_t* inout, BN254_scalar_t* scalar_vec, size_t n_elments, size_t device_id); +int32_t +vec_mod_mult_scalar_bn254(BN254_scalar_t* inout, BN254_scalar_t* scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_device_scalar_bn254( + BN254_scalar_t* inout, BN254_scalar_t* scalar_vec, size_t n_elements, size_t device_id); +int32_t matrix_vec_mod_mult_bn254( + BN254_scalar_t* matrix_flattened, BN254_scalar_t* input, BN254_scalar_t* output, size_t n_elments, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_VEC_MULT_H */ diff --git a/src/aztec/gpu/libbn254.so b/src/aztec/gpu/libbn254.so new file mode 100755 index 00000000..b9d7beb7 Binary files /dev/null and b/src/aztec/gpu/libbn254.so differ diff --git a/src/aztec/gpu/libbn254_old.so b/src/aztec/gpu/libbn254_old.so new file mode 100755 index 00000000..a42d2493 Binary files /dev/null and b/src/aztec/gpu/libbn254_old.so differ diff --git a/src/aztec/gpu/msm/CMakeLists.txt b/src/aztec/gpu/msm/CMakeLists.txt index 8c04f9ca..e7a2f0b4 100644 --- a/src/aztec/gpu/msm/CMakeLists.txt +++ b/src/aztec/gpu/msm/CMakeLists.txt @@ -3,10 +3,13 @@ project(cuda-barretenberg CUDA) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) +link_directories(/home/ubuntu/Cuda-Barretenberg/src/aztec/gpu) + include_directories(../bignum) include_directories(../fields) include_directories(../curves) include_directories(../srs) +include_directories(../headers) # 
Pippenger's bucket method add_executable( @@ -23,6 +26,8 @@ target_link_libraries( reference_string_library stdlib_primitives env + bn254 + dl ) add_custom_target( @@ -42,4 +47,5 @@ target_link_libraries( field_library curve_library reference_string_library -) \ No newline at end of file +) + diff --git a/src/aztec/gpu/msm/common.cuh b/src/aztec/gpu/msm/common.cuh index 3375abee..312830de 100644 --- a/src/aztec/gpu/msm/common.cuh +++ b/src/aztec/gpu/msm/common.cuh @@ -1,5 +1,5 @@ #include "reference_string.cu" -#include "util/thread_pool.hpp" +// #include "util/thread_pool.hpp" #include "error.cuh" #include #include diff --git a/src/aztec/gpu/msm/main.cu b/src/aztec/gpu/msm/main.cu index 747e74bb..3d0ccdca 100644 --- a/src/aztec/gpu/msm/main.cu +++ b/src/aztec/gpu/msm/main.cu @@ -1,48 +1,110 @@ -#include "pippenger.cu" +// #include "pippenger.cu" #include #include #include -#include -#include -#include +// #include +// #include +// #include #include #include +#include "msm.h" +#include "ntt.h" +#include "projective.h" +#include "ve_mod_mult.h" + +#include "affine.cuh" +#include "curve_config.cuh" +#include "fields.cuh" +#include "params.cuh" +#include "../utils/storage.cuh" +#include "../utils/sharedmem.cuh" +#include "../utils/ptx.cuh" +#include "../utils/objects.cuh" +#include "../utils/mont.cuh" +#include "../utils/host_math.cuh" +#include "../utils/cuda_utils.cuh" + +#include + using namespace std; -using namespace pippenger_common; -using namespace waffle; -using namespace barretenberg; +// using namespace pippenger_common; +// using namespace waffle; +// using namespace barretenberg; + +// std::ostream& operator<<(std::ostream& os, const BN254_projective_t& point) +// { +// os << "Point { x: " << point << "; y: " << point << "; z: " << point << " }"; +// return os; +// } int main(int, char**) { - // Initialize dynamic 'msm_t' object - msm_t *msm = new msm_t(); - // Construct elliptic curve points from SRS - auto reference_string = std::make_shared(NUM_POINTS, "../srs_db/ignition"); - g1::affine_element* points = reference_string->get_monomials(); - - // Construct random scalars - std::vector scalars; - scalars.reserve(NUM_POINTS); - for (size_t i = 0; i < NUM_POINTS; ++i) { - scalars.emplace_back(fr::random_element()); + void* lib_handle = dlopen("/home/ubuntu/Cuda-Barretenberg/src/aztec/gpu/libbn254.so", RTLD_LAZY); + + if (!lib_handle) { + std::cerr << "Error loading shared library: " << dlerror() << std::endl; + return 1; } + // Projective t {[1,2,3,4,5,6,7,8], [1,2,3,4,5,6,7,8], [1,2,3,4,5,6,7,8]}; + + typedef int (*AddFunction)(BN254_projective_t *); + AddFunction add_func = reinterpret_cast(dlsym(lib_handle, "projective_is_on_curve_bn254")); + + if (!add_func) { + std::cerr << "Error loading function: " << dlerror() << std::endl; + dlclose(lib_handle); // Close the library + return 1; + } + + BN254_projective_t *t; + std::cout << "Result of add(): " << add_func(t) << std::endl; + + dlclose(lib_handle); + + exit(0); + + // Initialize dynamic 'msm_t' object + // msm_t *msm = new msm_t(); + + // // Construct elliptic curve points from SRS + // auto reference_string = std::make_shared(NUM_POINTS, "../srs_db/ignition"); + // g1::affine_element* points = reference_string->get_monomials(); + + // // Construct random scalars + // std::vector scalars; + // scalars.reserve(NUM_POINTS); + // for (size_t i = 0; i < NUM_POINTS; ++i) { + // scalars.emplace_back(fr::random_element()); + // } + + // g1::affine_element* icicle_result; + // msm_cuda_bn254(icicle_result, points, &scalars[0], 
scalars.size(), 0); + // Number of streams - int num_streams = 1; + // int num_streams = 1; + + // // Initialize dynamic pippenger 'context' object + // Context *context = msm->pippenger_initialize(points, &scalars[0], num_streams, NUM_POINTS); + + // // Execute "Double-And-Add" reference kernel + // g1_gpu::element *result_1 = msm->msm_double_and_add(context, NUM_POINTS, points, &scalars[0]); + + // // Execute "Pippenger's Bucket Method" kernel + // g1_gpu::element **result_2 = msm->msm_bucket_method(context, points, &scalars[0], num_streams); - // Initialize dynamic pippenger 'context' object - Context *context = msm->pippenger_initialize(points, &scalars[0], num_streams, NUM_POINTS); + // // Print results + // context->pipp.print_result(result_1, result_2); - // Execute "Double-And-Add" reference kernel - g1_gpu::element *result_1 = msm->msm_double_and_add(context, NUM_POINTS, points, &scalars[0]); + // // Verify the final results are equal + // context->pipp.verify_result(result_1, result_2); +} - // Execute "Pippenger's Bucket Method" kernel - g1_gpu::element **result_2 = msm->msm_bucket_method(context, points, &scalars[0], num_streams); - // Print results - context->pipp.print_result(result_1, result_2); +// TODO: figure out how to create an instance of object (x, y, z) +// TODO: Figure out if problem is with shared library or something else leading to type resolving problems +// TODO: Once that works, see if allocating memory solves our problems - // Verify the final results are equal - context->pipp.verify_result(result_1, result_2); -} \ No newline at end of file +// TODO: Any changes to polynoial IOP wrapper that calls the MSM code +// TODO: Correctness checks diff --git a/src/aztec/gpu/msm/util/thread_pool.hpp b/src/aztec/gpu/msm/util/thread_pool.hpp deleted file mode 100644 index 52a15881..00000000 --- a/src/aztec/gpu/msm/util/thread_pool.hpp +++ /dev/null @@ -1,171 +0,0 @@ -#ifndef __THREAD_POOL_T_HPP__ -#define __THREAD_POOL_T_HPP__ - -#if __cplusplus < 201103L && !(defined(_MSVC_LANG) && _MSVC_LANG >= 201103L) -# error C++11 or later is required. 
-#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _GNU_SOURCE -# include -#endif - -using namespace std; - -class thread_pool_t { -private: - std::vector threads; - - std::mutex mtx; // Inter-thread synchronization - std::condition_variable cvar; - std::atomic done; - - typedef std::function job_t; - std::deque fifo; - -public: - thread_pool_t(unsigned int num_threads = 0) : done(false) - { - if (num_threads == 0) { - num_threads = std::thread::hardware_concurrency(); -#ifdef _GNU_SOURCE - cpu_set_t set; - if (sched_getaffinity(0, sizeof(set), &set) == 0) { - size_t i, n; - for (n = 0, i = num_threads; i--;) - n += CPU_ISSET(i, &set); - if (n != 0) - num_threads = n; - } -#endif - } - - threads.reserve(num_threads); - - for (unsigned int i = 0; i < num_threads; i++) - threads.push_back(std::thread([this]() { while (execute()); })); - } - - virtual ~thread_pool_t() - { - done = true; - cvar.notify_all(); - for (auto& tid : threads) - tid.join(); - } - - size_t size() { return threads.size(); } - - template void spawn(Workable work) - { - std::unique_lock lock(mtx); - fifo.emplace_back(job_t(work)); - cvar.notify_one(); // wake up a worker thread - } - -private: - bool execute() - { - job_t work; - { - std::unique_lock lock(mtx); - - while (!done && fifo.empty()) - cvar.wait(lock); - - if (done && fifo.empty()) - return false; - - work = fifo.front(); fifo.pop_front(); - } - work(); - - return true; - } - -public: - // call work(size_t idx) with idx=[0..num_items) in parallel, e.g. - // pool.par_map(20, [&](size_t i) { std::cout << i << std::endl; }); - template - void par_map(size_t num_items, Workable work, size_t max_workers = 0) - { - size_t num_workers = std::min(size(), num_items); - if (max_workers > 0) - num_workers = std::min(num_workers, max_workers); - - std::atomic counter(0); - std::atomic done(num_workers); - std::mutex b_mtx; - std::condition_variable barrier; - - while (num_workers--) { - spawn([&, num_items]() { - size_t idx; - while ((idx = counter.fetch_add(1, std::memory_order_relaxed)) - < num_items) - work(idx); - if (--done == 0) { - std::unique_lock lock(b_mtx); - barrier.notify_one(); - } - }); - } - - std::unique_lock lock(b_mtx); - barrier.wait(lock, [&] { return done == 0; }); - } -}; - -template class channel_t { -private: - std::deque fifo; - std::mutex mtx; - std::condition_variable cvar; - -public: - void send(const T& msg) - { - std::unique_lock lock(mtx); - fifo.push_back(msg); - cvar.notify_one(); - } - - T recv() - { - std::unique_lock lock(mtx); - cvar.wait(lock, [&] { return !fifo.empty(); }); - auto msg = fifo.front(); fifo.pop_front(); - return msg; - } -}; - -template class counter_t { - struct inner { - std::atomic val; - std::atomic ref_cnt; - inline inner(T v) { val = v, ref_cnt = 1; }; - }; - inner *ptr; -public: - counter_t(T v=0) { ptr = new inner(v); } - counter_t(const counter_t& r) - { (ptr = r.ptr)->ref_cnt.fetch_add(1, std::memory_order_relaxed); } - ~counter_t() - { - if (ptr->ref_cnt.fetch_sub(1, std::memory_order_seq_cst) == 1) - delete ptr; - } - size_t ref_cnt() const { return ptr->ref_cnt; } - T operator++(int) const { return ptr->val.fetch_add(1, std::memory_order_relaxed); } - T operator++() const { return ptr->val++ + 1; } - T operator--(int) const { return ptr->val.fetch_sub(1, std::memory_order_relaxed); } - T operator--() const { return ptr->val-- - 1; } -}; -#endif // __THREAD_POOL_T_HPP__ \ No newline at end of file diff --git a/src/aztec/gpu/plonk/plonk.cu 
b/src/aztec/gpu/plonk/plonk.cu index 94f83bc8..b494fd9c 100644 --- a/src/aztec/gpu/plonk/plonk.cu +++ b/src/aztec/gpu/plonk/plonk.cu @@ -40,4 +40,6 @@ int main(int, char**) { // Generate and verify proof plonk_proof proof = prover->construct_proof(); verifier.verify_proof(proof); + + cout << "verified proof!" << endl; } \ No newline at end of file diff --git a/src/aztec/gpu/plonk/queue_wrapper.cu b/src/aztec/gpu/plonk/queue_wrapper.cu index a7b1c347..637225be 100644 --- a/src/aztec/gpu/plonk/queue_wrapper.cu +++ b/src/aztec/gpu/plonk/queue_wrapper.cu @@ -19,23 +19,23 @@ void QueueWrapper::process_queue() { auto reference_string = std::make_shared(NUM_POINTS, "../srs_db"); g1::affine_element* pointss = reference_string->get_monomials(); - cout << "points is: " << pointss[0].x.data[0] << endl; - cout << "points is: " << pointss[0].x.data[1] << endl; - cout << "points is: " << pointss[0].x.data[2] << endl; - cout << "points is: " << pointss[0].x.data[3] << endl; - cout << "points is: " << pointss[0].y.data[0] << endl; - cout << "points is: " << pointss[0].y.data[1] << endl; - cout << "points is: " << pointss[0].y.data[2] << endl; - cout << "points is: " << pointss[0].y.data[3] << endl; + // cout << "points is: " << pointss[0].x.data[0] << endl; + // cout << "points is: " << pointss[0].x.data[1] << endl; + // cout << "points is: " << pointss[0].x.data[2] << endl; + // cout << "points is: " << pointss[0].x.data[3] << endl; + // cout << "points is: " << pointss[0].y.data[0] << endl; + // cout << "points is: " << pointss[0].y.data[1] << endl; + // cout << "points is: " << pointss[0].y.data[2] << endl; + // cout << "points is: " << pointss[0].y.data[3] << endl; - cout << "points is: " << pointss[1].x.data[0] << endl; - cout << "points is: " << pointss[1].x.data[1] << endl; - cout << "points is: " << pointss[1].x.data[2] << endl; - cout << "points is: " << pointss[1].x.data[3] << endl; - cout << "points is: " << pointss[1].y.data[0] << endl; - cout << "points is: " << pointss[1].y.data[1] << endl; - cout << "points is: " << pointss[1].y.data[2] << endl; - cout << "points is: " << pointss[1].y.data[3] << endl; + // cout << "points is: " << pointss[1].x.data[0] << endl; + // cout << "points is: " << pointss[1].x.data[1] << endl; + // cout << "points is: " << pointss[1].x.data[2] << endl; + // cout << "points is: " << pointss[1].x.data[3] << endl; + // cout << "points is: " << pointss[1].y.data[0] << endl; + // cout << "points is: " << pointss[1].y.data[1] << endl; + // cout << "points is: " << pointss[1].y.data[2] << endl; + // cout << "points is: " << pointss[1].y.data[3] << endl; // Naive implementation for quick testing // msm_t *msm = new msm_t(); @@ -45,7 +45,7 @@ void QueueWrapper::process_queue() { // Context *context = msm->pippenger_initialize(points); // g1_gpu::element *final_result_2 = msm->msm_bucket_method(context, item.mul_scalars, key->reference_string->get_monomials(), 1 << 10); - exit(0); + // exit(0); auto runtime_state = barretenberg::scalar_multiplication::pippenger_runtime_state(key->small_domain.size + 1); diff --git a/src/aztec/gpu/utils/cuda_utils.cuh b/src/aztec/gpu/utils/cuda_utils.cuh new file mode 100644 index 00000000..071e8d89 --- /dev/null +++ b/src/aztec/gpu/utils/cuda_utils.cuh @@ -0,0 +1,32 @@ +#pragma once +#include + +struct cuda_ctx { + int device_id; + cudaMemPool_t mempool; + cudaStream_t stream; + + cuda_ctx(int gpu_id) + { + gpu_id = gpu_id; + cudaMemPoolProps pool_props; + pool_props.allocType = cudaMemAllocationTypePinned; + pool_props.handleTypes = 
cudaMemHandleTypePosixFileDescriptor; + pool_props.location.type = cudaMemLocationTypeDevice; + pool_props.location.id = device_id; + + cudaMemPoolCreate(&mempool, &pool_props); + cudaStreamCreate(&stream); + } + + void set_device() { cudaSetDevice(device_id); } + + void sync_stream() { cudaStreamSynchronize(stream); } + + void malloc(void* ptr, size_t bytesize) { cudaMallocFromPoolAsync(&ptr, bytesize, mempool, stream); } + + void free(void* ptr) { cudaFreeAsync(ptr, stream); } +}; + +// -- Proposed Function Tops -------------------------------------------------- +// ---------------------------------------------------------------------------- diff --git a/src/aztec/gpu/utils/host_math.cuh b/src/aztec/gpu/utils/host_math.cuh new file mode 100644 index 00000000..73922b92 --- /dev/null +++ b/src/aztec/gpu/utils/host_math.cuh @@ -0,0 +1,96 @@ +#pragma once + +#include +#include + +namespace host_math { + + // return x + y with uint32_t operands + static __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; } + + // return x + y + carry with uint32_t operands + static __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; } + + // return x + y and carry out with uint32_t operands + static __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry) + { + uint32_t result; + result = x + y; + carry = x > result; + return result; + } + + // return x + y + carry and carry out with uint32_t operands + static __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry) + { + const uint32_t result = x + y + carry; + carry = carry && x >= result || !carry && x > result; + return result; + } + + // return x - y with uint32_t operands + static __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; } + + // return x - y - borrow with uint32_t operands + static __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; } + + // return x - y and borrow out with uint32_t operands + static __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow) + { + uint32_t result; + result = x - y; + borrow = x < result; + return result; + } + + // return x - y - borrow and borrow out with uint32_t operands + static __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow) + { + const uint32_t result = x - y - borrow; + borrow = borrow && x <= result || !borrow && x < result; + return result; + } + + // return x * y + z + carry and carry out with uint32_t operands + static __host__ uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry) + { + uint32_t result; + uint64_t r = static_cast(x) * y + z + carry; + carry = r >> 32; + result = r & 0xffffffff; + return result; + } + + template + struct carry_chain { + unsigned index; + + constexpr __host__ __forceinline__ carry_chain() : index(0) {} + + __host__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry) + { + index++; + if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT) + return host_math::add(x, y); + else if (index == 1 && !CARRY_IN) + return host_math::add_cc(x, y, carry); + else if (index < OPS_COUNT || CARRY_OUT) + return host_math::addc_cc(x, y, carry); + else + return host_math::addc(x, y, carry); + } + + __host__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry) + { + index++; + if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT) + return 
host_math::sub(x, y); + else if (index == 1 && !CARRY_IN) + return host_math::sub_cc(x, y, carry); + else if (index < OPS_COUNT || CARRY_OUT) + return host_math::subc_cc(x, y, carry); + else + return host_math::subc(x, y, carry); + } + }; +} // namespace host_math diff --git a/src/aztec/gpu/utils/mont.cuh b/src/aztec/gpu/utils/mont.cuh new file mode 100644 index 00000000..de90b604 --- /dev/null +++ b/src/aztec/gpu/utils/mont.cuh @@ -0,0 +1,27 @@ +#pragma once + +#include "../headers/ve_mod_mult.cuh" + +template +int convert_montgomery(E* d_inout, size_t n_elments, bool is_into, cudaStream_t stream) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n_elments + num_threads - 1) / num_threads; + E mont = is_into ? E::montgomery_r() : E::montgomery_r_inv(); + template_normalize_kernel<<>>(d_inout, n_elments, mont); + + return 0; // TODO: void with propper error handling +} + +template +int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream) +{ + return convert_montgomery(d_inout, n, true, stream); +} + +template +int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream) +{ + return convert_montgomery(d_inout, n, false, stream); +} \ No newline at end of file diff --git a/src/aztec/gpu/utils/objects.cuh b/src/aztec/gpu/utils/objects.cuh new file mode 100644 index 00000000..f3a5a655 --- /dev/null +++ b/src/aztec/gpu/utils/objects.cuh @@ -0,0 +1,63 @@ +#pragma once +template +class Element +{ +public: + int v; + __device__ __host__ Element() { v = 0; } + __device__ __host__ Element(int r) + { + v = r % F::q; + if (r == F::q) v = F::q; + } + __device__ __host__ Element operator+(Element const& obj) + { + Element res; + res.v = (v + obj.v) % F::q; + return res; + } + __device__ __host__ Element operator-(Element const& obj) + { + Element res; + res.v = (v - obj.v) % F::q; + if (res.v < 0) { res.v = F::q + res.v; } + return res; + } +}; + +template +class Scalar +{ +public: + int v; + __device__ __host__ Scalar() { v = 0; } + __device__ __host__ Scalar(int r) { v = r % F::q; } + __device__ __host__ Scalar operator+(Scalar const& obj) + { + Scalar res; + res.v = (v + obj.v) % F::q; + return res; + } + __device__ __host__ Scalar operator*(Scalar const& obj) + { + Scalar res; + res.v = (v * obj.v) % F::q; + return res; + } + __device__ __host__ Element operator*(Element const& obj) + { + Element res; + res.v = (v * obj.v) % F::q; + return res; + } + Scalar operator-(Scalar const& obj) + { + Scalar res; + res.v = (v - obj.v) % F::q; + if (res.v < 0) { res.v = F::q + res.v; } + return res; + } + bool operator<(Scalar const& obj) { return v < obj.v; } + static Scalar one() { return Scalar(1); } + static Scalar zero() { return Scalar(0); } +}; \ No newline at end of file diff --git a/src/aztec/gpu/utils/ptx.cuh b/src/aztec/gpu/utils/ptx.cuh new file mode 100644 index 00000000..7625bd92 --- /dev/null +++ b/src/aztec/gpu/utils/ptx.cuh @@ -0,0 +1,282 @@ +#pragma once +#include +#include + +namespace ptx { + + __device__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm("add.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t add_cc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t addc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("addc.u32 %0, %1, %2;" : "=r"(result) : 
"r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t addc_cc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm("sub.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t sub_cc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("sub.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t subc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("subc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t subc_cc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("subc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t mul_lo(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm("mul.lo.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t mul_hi(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm("mul.hi.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm("mad.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm("mad.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("madc.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint64_t mov_b64(uint32_t lo, uint32_t hi) + { + uint64_t result; + asm("mov.b64 %0, {%1,%2};" : "=l"(result) : "r"(lo), "r"(hi)); + return result; + } + + // Gives u64 overloads a dedicated namespace. 
+ // Callers should know exactly what they're calling (no implicit conversions). + namespace u64 { + + __device__ __forceinline__ uint64_t add(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm("add.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t add_cc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t addc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("addc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t addc_cc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t sub(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm("sub.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t sub_cc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t subc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("subc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t subc_cc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t mul_lo(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm("mul.lo.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t mul_hi(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm("mul.hi.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm("mad.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm("mad.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("mad.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("mad.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("madc.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("madc.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z) + { + 
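// Computes (x * y).lo + z + carry-in; the carry-out remains in the condition code. +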
uint64_t result; + asm volatile("madc.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("madc.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + } // namespace u64 + + __device__ __forceinline__ void bar_arrive(const unsigned name, const unsigned count) + { + asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(count) : "memory"); + } + + __device__ __forceinline__ void bar_sync(const unsigned name, const unsigned count) + { + asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(count) : "memory"); + } + +} // namespace ptx \ No newline at end of file diff --git a/src/aztec/gpu/utils/sharedmem.cuh b/src/aztec/gpu/utils/sharedmem.cuh new file mode 100644 index 00000000..9c2639b2 --- /dev/null +++ b/src/aztec/gpu/utils/sharedmem.cuh @@ -0,0 +1,278 @@ +// based on https://leimao.github.io/blog/CUDA-Shared-Memory-Templated-Kernel/ +// may be outdated, but only worked like that + +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 5636 $ +// $Date: 2009-07-02 13:39:38 +1000 (Thu, 02 Jul 2009) $ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * sharedmem.h + * + * @brief Shared memory declaration struct for templatized types. + * + * Because dynamically sized shared memory arrays are declared "extern" in CUDA, + * we can't templatize their types directly. To get around this, we declare a + * simple wrapper struct that will declare the extern array with a different + * name depending on the type. This avoids linker errors about multiple + * definitions. + * + * To use dynamically allocated shared memory in a templatized __global__ or + * __device__ function, just replace code like this: + * + *
+ * <pre>
+ *  template<class T>
+ *  __global__ void
+ *  foo( T* d_out, T* d_in)
+ *  {
+ *      // Shared mem size is determined by the host app at run time
+ *      extern __shared__  T sdata[];
+ *      ...
+ *      doStuff(sdata);
+ *      ...
+ *  }
+ * </pre>
+ *
+ * With this
+ * <pre>
+ *  template<class T>
+ *  __global__ void
+ *  foo( T* d_out, T* d_in)
+ *  {
+ *      // Shared mem size is determined by the host app at run time
+ *      SharedMemory<T> smem;
+ *      T* sdata = smem.getPointer();
+ *      ...
+ *      doStuff(sdata);
+ *      ...
+ *  }
+ * </pre>
+ */ + +#ifndef _SHAREDMEM_H_ +#define _SHAREDMEM_H_ + +#include "../headers/curve_config.cuh" +// #include "../curves/bls12_381/curve_config.cuh" +// #include "../curves/bn254/curve_config.cuh" + +/** @brief Wrapper class for templatized dynamic shared memory arrays. + * + * This struct uses template specialization on the type \a T to declare + * a differently named dynamic shared memory array for each type + * (\code extern __shared__ T s_type[] \endcode). + * + * Currently there are specializations for the following types: + * \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long, + * \c unsigned long, \c bool, \c float, and \c double. One can also specialize it + * for user defined types. + */ +template +struct SharedMemory { + //! @brief Return a pointer to the runtime-sized shared memory array. + //! @returns Pointer to runtime-sized shared memory array + __device__ T* getPointer() + { + extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types + Error_UnsupportedType(); + return (T*)0; + } + // TODO: Use operator overloading to make this class look like a regular array +}; + +// Following are the specializations for the following types. +// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double +// One could also specialize it for user-defined types. + +template <> +struct SharedMemory { + __device__ int* getPointer() + { + extern __shared__ int s_int[]; + return s_int; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned int* getPointer() + { + extern __shared__ unsigned int s_uint[]; + return s_uint; + } +}; + +template <> +struct SharedMemory { + __device__ char* getPointer() + { + extern __shared__ char s_char[]; + return s_char; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned char* getPointer() + { + extern __shared__ unsigned char s_uchar[]; + return s_uchar; + } +}; + +template <> +struct SharedMemory { + __device__ short* getPointer() + { + extern __shared__ short s_short[]; + return s_short; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned short* getPointer() + { + extern __shared__ unsigned short s_ushort[]; + return s_ushort; + } +}; + +template <> +struct SharedMemory { + __device__ long* getPointer() + { + extern __shared__ long s_long[]; + return s_long; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned long* getPointer() + { + extern __shared__ unsigned long s_ulong[]; + return s_ulong; + } +}; + +template <> +struct SharedMemory { + __device__ long long* getPointer() + { + extern __shared__ long long s_longlong[]; + return s_longlong; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned long long* getPointer() + { + extern __shared__ unsigned long long s_ulonglong[]; + return s_ulonglong; + } +}; + +template <> +struct SharedMemory { + __device__ bool* getPointer() + { + extern __shared__ bool s_bool[]; + return s_bool; + } +}; + +template <> +struct SharedMemory { + __device__ float* getPointer() + { + extern __shared__ float s_float[]; + return s_float; + } +}; + +template <> +struct SharedMemory { + __device__ double* getPointer() + { + extern __shared__ double s_double[]; + return s_double; + } +}; + +template <> +struct SharedMemory { + __device__ uchar4* getPointer() + { + extern __shared__ uchar4 s_uchar4[]; + return s_uchar4; + } +}; + +// template <> +// struct SharedMemory { +// __device__ BLS12_381::scalar_t* getPointer() +// { +// extern __shared__ BLS12_381::scalar_t s_scalar_t_bls12_381[]; 
+// return s_scalar_t_bls12_381;
+// }
+// };
+
+// template <>
+// struct SharedMemory<BLS12_381::projective_t> {
+// __device__ BLS12_381::projective_t* getPointer()
+// {
+// extern __shared__ BLS12_381::projective_t s_projective_t_bls12_381[];
+// return s_projective_t_bls12_381;
+// }
+// };
+
+// template <>
+// struct SharedMemory<BLS12_377::scalar_t> {
+// __device__ BLS12_377::scalar_t* getPointer()
+// {
+// extern __shared__ BLS12_377::scalar_t s_scalar_t_bls12_377[];
+// return s_scalar_t_bls12_377;
+// }
+// };
+
+// template <>
+// struct SharedMemory<BLS12_377::projective_t> {
+// __device__ BLS12_377::projective_t* getPointer()
+// {
+// extern __shared__ BLS12_377::projective_t s_projective_t_bls12_377[];
+// return s_projective_t_bls12_377;
+// }
+// };
+
+template <>
+struct SharedMemory<BN254::scalar_t> {
+  __device__ BN254::scalar_t* getPointer()
+  {
+    extern __shared__ BN254::scalar_t s_scalar_t_bn254[];
+    return s_scalar_t_bn254;
+  }
+};
+
+template <>
+struct SharedMemory<BN254::projective_t> {
+  __device__ BN254::projective_t* getPointer()
+  {
+    extern __shared__ BN254::projective_t s_projective_t_bn254[];
+    return s_projective_t_bn254;
+  }
+};
+#endif //_SHAREDMEM_H_
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
\ No newline at end of file
diff --git a/src/aztec/gpu/utils/storage.cuh b/src/aztec/gpu/utils/storage.cuh
new file mode 100644
index 00000000..c4a56c49
--- /dev/null
+++ b/src/aztec/gpu/utils/storage.cuh
@@ -0,0 +1,17 @@
+#pragma once
+#include <cstdint>
+
+#define LIMBS_ALIGNMENT(x) ((x) % 4 == 0 ? 16 : ((x) % 2 == 0 ? 8 : 4))
+
+template <unsigned LIMBS_COUNT>
+struct __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) storage
+{
+  static constexpr unsigned LC = LIMBS_COUNT;
+  uint32_t limbs[LIMBS_COUNT];
+};
+
+template <unsigned LIMBS_COUNT, unsigned OMEGAS_COUNT>
+struct __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) storage_array
+{
+  storage<LIMBS_COUNT> storages[OMEGAS_COUNT];
+};
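
For reference, the storage structs hold field elements as little-endian arrays of 32-bit limbs: fq_config::one is {0x00000001, 0, ...}, and the most significant word of the BN254 modulus (0x30644e72) sits in the last limb. Below is a minimal host-only sketch of that layout, with storage re-declared without __align__ so it compiles off-device, and a hypothetical print_be_hex helper that is not part of the diff.

#include <cstdint>
#include <cstdio>

template <unsigned LIMBS_COUNT>
struct storage {
    uint32_t limbs[LIMBS_COUNT]; // limbs[0] is the least significant 32-bit word
};

// Hypothetical helper: print a 256-bit storage<8> as big-endian hex.
static void print_be_hex(const storage<8>& s)
{
    for (int i = 7; i >= 0; i--)
        printf("%08x", s.limbs[i]);
    printf("\n");
}

int main()
{
    // BN254 base-field modulus, copied from fq_config, least significant limb first.
    storage<8> p = {{0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
                     0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}};
    print_be_hex(p); // 30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47
    return 0;
}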