diff --git a/src/aztec/gpu/headers/affine.cuh b/src/aztec/gpu/headers/affine.cuh
new file mode 100644
index 00000000..8bf0bc75
--- /dev/null
+++ b/src/aztec/gpu/headers/affine.cuh
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "./fields.cuh"
+
+template <typename FF>
+class Affine
+{
+public:
+  FF x;
+  FF y;
+
+  static HOST_DEVICE_INLINE Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
+
+  friend HOST_DEVICE_INLINE bool operator==(const Affine& xs, const Affine& ys)
+  {
+    return (xs.x == ys.x) && (xs.y == ys.y);
+  }
+
+  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Affine& point)
+  {
+    os << "x: " << point.x << "; y: " << point.y;
+    return os;
+  }
+};
diff --git a/src/aztec/gpu/headers/curve_config.cuh b/src/aztec/gpu/headers/curve_config.cuh
new file mode 100644
index 00000000..1e7b5bbb
--- /dev/null
+++ b/src/aztec/gpu/headers/curve_config.cuh
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "./fields.cuh"
+#include "./projective.cuh"
+#if defined(G2_DEFINED)
+#include "../../primitives/extension_field.cuh"
+#endif
+
+#include "params.cuh"
+
+namespace BN254 {
+  typedef Field<PARAMS_BN254::fp_config> scalar_field_t;
+  typedef scalar_field_t scalar_t;
+  typedef Field<PARAMS_BN254::fq_config> point_field_t;
+  static constexpr point_field_t b = point_field_t{PARAMS_BN254::weierstrass_b};
+  typedef Projective<point_field_t, scalar_field_t, b> projective_t;
+  typedef Affine<point_field_t> affine_t;
+#if defined(G2_DEFINED)
+  typedef ExtensionField<PARAMS_BN254::fq_config> g2_point_field_t;
+  static constexpr g2_point_field_t b_g2 = g2_point_field_t{
+    point_field_t{PARAMS_BN254::weierstrass_b_g2_re}, point_field_t{PARAMS_BN254::weierstrass_b_g2_im}};
+  typedef Projective<g2_point_field_t, scalar_field_t, b_g2> g2_projective_t;
+  typedef Affine<g2_point_field_t> g2_affine_t;
+#endif
+} // namespace BN254
\ No newline at end of file
diff --git a/src/aztec/gpu/headers/extension_field.cuh b/src/aztec/gpu/headers/extension_field.cuh
new file mode 100644
index 00000000..3189469a
--- /dev/null
+++ b/src/aztec/gpu/headers/extension_field.cuh
@@ -0,0 +1,158 @@
+#pragma once
+
+#include "field.cuh"
+
+#define HOST_INLINE __host__ __forceinline__
+#define DEVICE_INLINE __device__ __forceinline__
+#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+
+template <typename CONFIG>
+class ExtensionField
+{
+private:
+  typedef typename Field<CONFIG>::Wide FWide;
+
+  struct ExtensionWide {
+    FWide real;
+    FWide imaginary;
+
+    friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real + ys.real, xs.imaginary + ys.imaginary};
+    }
+
+    friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real - ys.real, xs.imaginary - ys.imaginary};
+    }
+  };
+
+public:
+  typedef Field<CONFIG> FF;
+  static constexpr unsigned TLC = 2 * CONFIG::limbs_count;
+
+  FF real;
+  FF imaginary;
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField generator_x()
+  {
+    return ExtensionField{FF{CONFIG::g2_gen_x_re}, FF{CONFIG::g2_gen_x_im}};
+  }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField generator_y()
+  {
+    return ExtensionField{FF{CONFIG::g2_gen_y_re}, FF{CONFIG::g2_gen_y_im}};
+  }
+
+  static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
+
+  template <unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
+  {
+    return ExtensionField{FF::template sub_modulus<REDUCTION_SIZE>(xs.real),
+                          FF::template sub_modulus<REDUCTION_SIZE>(xs.imaginary)};
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const ExtensionField& xs)
+  {
+    os << "{ Real: " << xs.real << " }; { Imaginary: " << xs.imaginary << " }";
+    return os;
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real + ys.real, xs.imaginary + ys.imaginary};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real - ys.real, xs.imaginary - ys.imaginary};
+  }
+
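+  // Note on mul_wide below: it computes the degree-2 extension product with three base-field
+  // multiplications instead of four (a Karatsuba-style identity). For xs = a + b*i and
+  // ys = c + d*i, with i^2 given by CONFIG::i_squared (negated when i_squared_is_negative):
+  //   real part      = a*c + i^2 * (b*d)
+  //   imaginary part = (a + b) * (c + d) - a*c - b*d
+  // In the code, a*c is real_prod, b*d is imaginary_prod, and (a + b)(c + d) is prod_of_sums.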
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    FWide real_prod = FF::mul_wide(xs.real, ys.real);
+    FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
+    FWide prod_of_sums = FF::mul_wide(xs.real + xs.imaginary, ys.real + ys.imaginary);
+    FWide i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
+    i_sq_times_im = CONFIG::i_squared_is_negative ? FWide::neg(i_sq_times_im) : i_sq_times_im;
+    return ExtensionWide{real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
+  {
+    return ExtensionField{
+      FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary)};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator*(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    ExtensionWide xy = mul_wide(xs, ys);
+    return reduce(xy);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    return (xs.real == ys.real) && (xs.imaginary == ys.imaginary);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
+
+  template <const ExtensionField& multiplier>
+  static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField& xs)
+  {
+    static constexpr FF mul_real = multiplier.real;
+    static constexpr FF mul_imaginary = multiplier.imaginary;
+    const FF xs_real = xs.real;
+    const FF xs_imaginary = xs.imaginary;
+    FF real_prod = FF::template mul_const<mul_real>(xs_real);
+    FF imaginary_prod = FF::template mul_const<mul_imaginary>(xs_imaginary);
+    FF re_im = FF::template mul_const<mul_real>(xs_imaginary);
+    FF im_re = FF::template mul_const<mul_imaginary>(xs_real);
+    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
+    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
+    return ExtensionField{real_prod + i_sq_times_im, re_im + im_re};
+  }
+
+  template <unsigned multiplier>
+  static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
+  {
+    return {FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.imaginary)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return mul_wide(xs, xs);
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return xs * xs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
+  {
+    return ExtensionField{FF::neg(xs.real), FF::neg(xs.imaginary)};
+  }
+
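+  // The inverse below uses the conjugate trick: for nonzero xs = a + b*i,
+  //   xs^-1 = conjugate(xs) / norm(xs),
+  // where conjugate(xs) = a - b*i and norm(xs) = a^2 - i^2 * b^2 lies in the base field,
+  // so a single base-field inversion suffices.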
+  // inverse assumes that xs is nonzero
+  static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
+  {
+    ExtensionField xs_conjugate = {xs.real, FF::neg(xs.imaginary)};
+    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(FF::sqr(xs.imaginary));
+    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
+    // TODO: wide here
+    FF xs_norm_squared = FF::sqr(xs.real) - i_sq_times_im;
+    return xs_conjugate * ExtensionField{FF::inverse(xs_norm_squared), FF::zero()};
+  }
+};
diff --git a/src/aztec/gpu/headers/fields.cuh b/src/aztec/gpu/headers/fields.cuh
new file mode 100644
index 00000000..4cacc117
--- /dev/null
+++ b/src/aztec/gpu/headers/fields.cuh
@@ -0,0 +1,942 @@
+#pragma once
+
+#include "../utils/host_math.cuh"
+#include "../utils/ptx.cuh"
+#include "../utils/storage.cuh"
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+
+#define HOST_INLINE __host__ __forceinline__
+#define DEVICE_INLINE __device__ __forceinline__
+#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+
+template <class CONFIG>
+class Field
+{
+public:
+  static constexpr unsigned TLC = CONFIG::limbs_count;
+  static constexpr unsigned NBITS = CONFIG::modulus_bit_count;
+
+  static constexpr HOST_DEVICE_INLINE Field zero() { return Field{CONFIG::zero}; }
+
+  static constexpr HOST_DEVICE_INLINE Field one() { return Field{CONFIG::one}; }
+
+  static constexpr HOST_DEVICE_INLINE Field from(uint32_t value)
+  {
+    storage<TLC> scalar;
+    scalar.limbs[0] = value;
+    for (int i = 1; i < TLC; i++) {
+      scalar.limbs[i] = 0;
+    }
+    return Field{scalar};
+  }
+
+  static constexpr HOST_DEVICE_INLINE Field generator_x() { return Field{CONFIG::g1_gen_x}; }
+
+  static constexpr HOST_DEVICE_INLINE Field generator_y() { return Field{CONFIG::g1_gen_y}; }
+
+  static HOST_INLINE Field omega(uint32_t logn)
+  {
+    if (logn == 0) { return Field{CONFIG::one}; }
+
+    if (logn > CONFIG::omegas_count) { throw std::invalid_argument("Field: Invalid omega index"); }
+
+    storage_array<CONFIG::omegas_count, TLC> const omega = CONFIG::omega;
+    return Field{omega.storages[logn - 1]};
+  }
+
+  static HOST_INLINE Field omega_inv(uint32_t logn)
+  {
+    if (logn == 0) { return Field{CONFIG::one}; }
+
+    if (logn > CONFIG::omegas_count) { throw std::invalid_argument("Field: Invalid omega_inv index"); }
+
+    storage_array<CONFIG::omegas_count, TLC> const omega_inv = CONFIG::omega_inv;
+    return Field{omega_inv.storages[logn - 1]};
+  }
+
+  static HOST_INLINE Field inv_log_size(uint32_t logn)
+  {
+    if (logn == 0) { return Field{CONFIG::one}; }
+
+    if (logn > CONFIG::omegas_count) { throw std::invalid_argument("Field: Invalid inv index"); }
+    storage_array<CONFIG::omegas_count, TLC> const inv = CONFIG::inv;
+    return Field{inv.storages[logn - 1]};
+  }
+
+  static constexpr HOST_DEVICE_INLINE Field
modulus() { return Field{CONFIG::modulus}; } + + static constexpr HOST_DEVICE_INLINE Field montgomery_r() { return Field{CONFIG::montgomery_r}; } + + static constexpr HOST_DEVICE_INLINE Field montgomery_r_inv() { return Field{CONFIG::montgomery_r_inv}; } + + // private: + typedef storage ff_storage; + typedef storage<2 * TLC> ff_wide_storage; + + static constexpr unsigned slack_bits = 32 * TLC - NBITS; + + struct Wide { + ff_wide_storage limbs_storage; + + static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs) + { + Field out{}; +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (unsigned i = 0; i < TLC; i++) + out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i]; + return out; + } + + static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs) + { + Field out{}; +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (unsigned i = 0; i < TLC; i++) { +#ifdef __CUDA_ARCH__ + out.limbs_storage.limbs[i] = + __funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], slack_bits); +#else + out.limbs_storage.limbs[i] = + (xs.limbs_storage.limbs[i + TLC] << slack_bits) + (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - slack_bits)); +#endif + } + return out; + } + + template + static constexpr HOST_DEVICE_INLINE Wide sub_modulus_squared(const Wide& xs) + { + if (REDUCTION_SIZE == 0) return xs; + const ff_wide_storage modulus = get_modulus_squared(); + Wide rs = {}; + return sub_limbs(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs; + } + + template + static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs) + { + const ff_wide_storage modulus = get_modulus_squared(); + Wide rs = {}; + sub_limbs(modulus, xs.limbs_storage, rs.limbs_storage); + return rs; + } + + friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys) + { + Wide rs = {}; + add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); + return sub_modulus_squared<1>(rs); + } + + friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys) + { + Wide rs = {}; + uint32_t carry = sub_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); + if (carry == 0) return rs; + const ff_wide_storage modulus = get_modulus_squared<1>(); + add_limbs(rs.limbs_storage, modulus, rs.limbs_storage); + return rs; + } + }; + + // return modulus + template + static constexpr HOST_DEVICE_INLINE ff_storage get_modulus() + { + switch (MULTIPLIER) { + case 1: + return CONFIG::modulus; + case 2: + return CONFIG::modulus_2; + case 4: + return CONFIG::modulus_4; + default: + return {}; + } + } + + template + static constexpr HOST_DEVICE_INLINE ff_wide_storage modulus_wide() + { + return CONFIG::modulus_wide; + } + + // return m + static constexpr HOST_DEVICE_INLINE ff_storage get_m() { return CONFIG::m; } + + // return modulus^2, helpful for ab +/- cd + template + static constexpr HOST_DEVICE_INLINE ff_wide_storage get_modulus_squared() + { + switch (MULTIPLIER) { + case 1: + return CONFIG::modulus_squared; + case 2: + return CONFIG::modulus_squared_2; + case 4: + return CONFIG::modulus_squared_4; + default: + return {}; + } + } + + // add or subtract limbs + template + static constexpr DEVICE_INLINE uint32_t + add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs) + { + const uint32_t* x = xs.limbs; + const uint32_t* y = ys.limbs; + uint32_t* r = rs.limbs; + r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]); +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (unsigned i = 1; i < (CARRY_OUT ? 
TLC : TLC - 1); i++) + r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]); + if (!CARRY_OUT) { + r[TLC - 1] = SUBTRACT ? ptx::subc(x[TLC - 1], y[TLC - 1]) : ptx::addc(x[TLC - 1], y[TLC - 1]); + return 0; + } + return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0); + } + + template + static constexpr DEVICE_INLINE uint32_t + add_sub_limbs_device(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs) + { + const uint32_t* x = xs.limbs; + const uint32_t* y = ys.limbs; + uint32_t* r = rs.limbs; + r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]); +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (unsigned i = 1; i < (CARRY_OUT ? 2 * TLC : 2 * TLC - 1); i++) + r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]); + if (!CARRY_OUT) { + r[2 * TLC - 1] = SUBTRACT ? ptx::subc(x[2 * TLC - 1], y[2 * TLC - 1]) : ptx::addc(x[2 * TLC - 1], y[2 * TLC - 1]); + return 0; + } + return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0); + } + + template + static constexpr HOST_INLINE uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs) + { + const uint32_t* x = xs.limbs; + const uint32_t* y = ys.limbs; + uint32_t* r = rs.limbs; + uint32_t carry = 0; + host_math::carry_chain chain; + for (unsigned i = 0; i < TLC; i++) + r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry); + return CARRY_OUT ? carry : 0; + } + + template + static constexpr HOST_INLINE uint32_t + add_sub_limbs_host(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs) + { + const uint32_t* x = xs.limbs; + const uint32_t* y = ys.limbs; + uint32_t* r = rs.limbs; + uint32_t carry = 0; + host_math::carry_chain<2 * TLC, false, CARRY_OUT> chain; + for (unsigned i = 0; i < 2 * TLC; i++) + r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry); + return CARRY_OUT ? 
carry : 0; + } + + static constexpr HOST_INLINE uint32_t + sub_limbs_partial_host(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs) + { + uint32_t carry = 0; + host_math::carry_chain<2 * TLC, false, true> chain; + for (unsigned i = 0; i < num_limbs; i++) + r[i] = chain.sub(x[i], y[i], carry); + return carry; + } + + template + static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T& xs, const T& ys, T& rs) + { +#ifdef __CUDA_ARCH__ + return add_sub_limbs_device(xs, ys, rs); +#else + return add_sub_limbs_host(xs, ys, rs); +#endif + } + + template + static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs(const T& xs, const T& ys, T& rs) + { +#ifdef __CUDA_ARCH__ + return add_sub_limbs_device(xs, ys, rs); +#else + return add_sub_limbs_host(xs, ys, rs); +#endif + } + + static DEVICE_INLINE void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC) + { +#pragma unroll + for (size_t i = 0; i < n; i += 2) { + acc[i] = ptx::mul_lo(a[i], bi); + acc[i + 1] = ptx::mul_hi(a[i], bi); + } + } + + static DEVICE_INLINE void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0) + { +#pragma unroll + for (size_t i = start_i; i < n; i += 2) { + acc[i] = ptx::mul_lo(a[i], bi); + acc[i + 1] = ptx::mul_hi(a[i], bi); + } + } + + static DEVICE_INLINE void cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC) + { + // multiply scalar by vector + // acc = acc + bi*A[::2] + acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]); + acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]); +#pragma unroll + for (size_t i = 2; i < n; i += 2) { + acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]); + acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]); + } + } + + static DEVICE_INLINE void + cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) + { + // multiply scalar by vector + // acc = acc + bi*A[::2] + acc[a_start_idx] = ptx::mad_lo_cc(a[a_start_idx], bi, acc[a_start_idx]); + acc[a_start_idx + 1] = ptx::madc_hi_cc(a[a_start_idx], bi, acc[a_start_idx + 1]); +#pragma unroll + for (size_t i = a_start_idx + 2; i < n; i += 2) { + acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]); + acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]); + } + } + + static DEVICE_INLINE void mad_row(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC) + { + // odd = odd + bi*A + // even = even + bi*A + cmad_n(odd, a + 1, bi, n - 2); + odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); + odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); + cmad_n(even, a, bi, n); + odd[n - 1] = ptx::addc(odd[n - 1], 0); + } + + static DEVICE_INLINE void + mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) + { + // odd = odd + bi*A + // even = even + bi*A + cmad_n_msb(odd, a + 1, bi, n - 2, a_start_idx - 1); + odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); + odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); + cmad_n_msb(even, a, bi, n, a_start_idx); + odd[n - 1] = ptx::addc(odd[n - 1], 0); + } + + static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { + const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + mul_n(even, a, b[0]); + mul_n(odd, a + 1, b[0]); + mad_row(&even[2], &odd[0], a, b[1]); + size_t i; +#pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i]); + mad_row(&even[i + 2], &odd[i], a, b[i + 1]); + } + // merge |even| and 
|odd|
+    even[1] = ptx::add_cc(even[1], odd[0]);
+    for (i = 1; i < 2 * TLC - 2; i++)
+      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
+    even[i + 1] = ptx::addc(even[i + 1], 0);
+  }
+
+  static DEVICE_INLINE void mult_no_carry(uint32_t a, uint32_t b, uint32_t* r)
+  {
+    r[0] = ptx::mul_lo(a, b);
+    r[1] = ptx::mul_hi(a, b);
+  }
+
+  static DEVICE_INLINE void ingo_multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  {
+    const uint32_t* a = as.limbs;
+    const uint32_t* b = bs.limbs;
+    uint32_t* r = rs.limbs;
+    uint32_t i, j;
+    uint32_t* even = rs.limbs;
+    __align__(8) uint32_t odd[2 * TLC];
+    for (uint32_t i = 0; i < 2 * TLC; i++) {
+      even[i] = 0;
+      odd[i] = 0;
+    }
+    // first row is a special case: no carry in, no carry out. Split into two parts, even and odd.
+    for (i = 0; i < TLC - 1; i += 2) {
+      mult_no_carry(b[0], a[i], &even[i]);
+      mult_no_carry(b[0], a[i + 1], &odd[i]);
+    }
+
+    // process two rows per loop iteration
+    for (i = 1; i < TLC - 1; i += 2) {
+      // odd bi's
+      // multiply-accumulate the even part of the new row with the odd part of the previous row (needs a carry)
+      // j = 0, no carry in, only carry out
+      odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
+      odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
+      // for loop: carry in, carry out
+      for (j = 2; j < TLC; j += 2) // 2, 4, 6
+      {
+        odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
+        odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
+      }
+      odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
+
+      // multiply-accumulate the odd part of the new row with the even part of the previous row (doesn't need a carry)
+      // j = 1, no carry in, only carry out
+      even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
+      even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
+      // for loop: carry in, carry out
+      for (j = 3; j < TLC; j += 2) {
+        even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
+        even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
+      }
+
+      // even bi's
+      // multiply-accumulate the even part of the new row with the even part of the previous row (needs a carry)
+      // j = 0, no carry in, only carry out
+      even[i + 1] = ptx::mad_lo_cc(a[0], b[i + 1], even[i + 1]);
+      even[i + 2] = ptx::madc_hi_cc(a[0], b[i + 1], even[i + 2]);
+      // for loop: carry in, carry out
+      for (j = 2; j < TLC; j += 2) {
+        even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
+        even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
+      }
+      even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry
+
+      // multiply-accumulate the odd part of the new row with the odd part of the previous row
+      // j = 1, no carry in, only carry out
+      odd[i + 1] = ptx::mad_lo_cc(a[1], b[i + 1], odd[i + 1]);
+      odd[i + 2] = ptx::madc_hi_cc(a[1], b[i + 1], odd[i + 2]);
+      // for loop: carry in, carry out
+ for (j = 3; j < TLC; j += 2) { + odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]); + odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]); + } + } + + odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); + odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); + // for loop carry in carry out + for (j = 2; j < TLC; j += 2) { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row + // j = 1, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); + // for loop carry in carry out + for (j = 3; j < TLC; j += 2) { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // add even and odd parts + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void + ingo_msb_multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { + const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* r = rs.limbs; + uint32_t i, j; + uint32_t* even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC]; + for (uint32_t i = 0; i < 2 * TLC; i++) { + even[i] = 0; + odd[i] = 0; + } + // only last element from first row. + mult_no_carry(b[0], a[TLC - 1], &odd[TLC - 2]); + +// doing two rows at one loop +#pragma unroll + for (i = 1; i < TLC - 1; i += 2) { + const uint32_t first_active_j = TLC - 1 - i; + const uint32_t first_active_j_odd = first_active_j + (1 - (first_active_j % 2)); + const uint32_t first_active_j_even = first_active_j + first_active_j % 2; + // odd bi's + // multiply accumulate even part of new row with odd part prev row (needs a carry) + // j = 0, no carry in, only carry out + odd[first_active_j_even + i - 1] = ptx::mad_lo_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i - 1]); + odd[first_active_j_even + i] = ptx::madc_hi_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i]); +// for loop carry in carry out +#pragma unroll + for (j = first_active_j_even + 2; j < TLC; j += 2) { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row (doesnt need a carry) + // j = 1, no carry in, only carry out + even[i + first_active_j_odd] = ptx::mad_lo_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd]); + even[i + first_active_j_odd + 1] = ptx::madc_hi_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd + 1]); +// for loop carry in carry out +#pragma unroll + for (j = first_active_j_odd + 2; j < TLC; j += 2) { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // even bi's + uint32_t const first_active_j1 = TLC - 1 - (i + 1); + uint32_t const first_active_j_odd1 = first_active_j1 + (1 - (first_active_j1 % 2)); + uint32_t const first_active_j_even1 = first_active_j1 + first_active_j1 % 2; + // multiply accumulate even part of new row with even part of prev row // needs a carry + // j = 0, no carry in, only carry out + 
even[first_active_j_even1 + i + 1] =
+        ptx::mad_lo_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 1]);
+      even[first_active_j_even1 + i + 2] =
+        ptx::madc_hi_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 2]);
+// for loop, carry in, carry out.
+#pragma unroll
+      for (j = first_active_j_even1 + 2; j < TLC; j += 2) {
+        even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]);
+        even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]);
+      }
+      even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry
+
+      // multiply accumulate odd part of new row with odd part of prev row
+      // j = 1, no carry in, only carry out
+      odd[first_active_j_odd1 + i] = ptx::mad_lo_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i]);
+      odd[first_active_j_odd1 + i + 1] =
+        ptx::madc_hi_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i + 1]);
+// for loop, carry in, carry out.
+#pragma unroll
+      for (j = first_active_j_odd1 + 2; j < TLC; j += 2) {
+        odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]);
+        odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]);
+      }
+    }
+
+    // last round, i = TLC - 1
+    odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]);
+    odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]);
+// for loop carry in carry out
+#pragma unroll
+    for (j = 2; j < TLC; j += 2) {
+      odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]);
+      odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]);
+    }
+    odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry
+
+    // multiply accumulate odd part of new row with even part prev row
+    // j = 1, no carry in, only carry out
+    even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]);
+    even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]);
+// for loop carry in carry out
+#pragma unroll
+    for (j = 3; j < TLC; j += 2) {
+      even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]);
+      even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]);
+    }
+
+    // add even and odd parts
+    even[1] = ptx::add_cc(even[1], odd[0]);
+#pragma unroll
+    for (i = 1; i < 2 * TLC - 2; i++)
+      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
+    even[i + 1] = ptx::addc(even[i + 1], 0);
+  }
+
+  static DEVICE_INLINE void multiply_lsb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  {
+    // r = a * b is correct for the first TLC + 1 digits (it does not compute digits TLC + 1 through 2*TLC - 2).
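+    // Only the low TLC + 1 limbs matter to the caller: reduce() subtracts this LSB product from a
+    // value it matches in the discarded high limbs, so truncating the upper partial products is
+    // safe. That is why the mad_row calls below shrink each row to TLC - i + 2 limbs.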
+ const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + mul_n(even, a, b[0]); + mul_n(odd, a + 1, b[0]); + mad_row(&even[2], &odd[0], a, b[1]); + size_t i; +#pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i], TLC - i + 2); + mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC - i + 2); + } + + // merge |even| and |odd| + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < TLC + 1; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { + const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + for (int i = 0; i < 2 * TLC - 1; i++) { + even[i] = 0; + odd[i] = 0; + } + uint32_t min_indexes_sum = TLC - 1; + // only diagonal + mul_n_msb(even, a, b[0], TLC, min_indexes_sum); + mul_n_msb(odd, a + 1, b[0], TLC, min_indexes_sum - 1); + mad_row_msb(&even[2], &odd[0], a, b[1], TLC, min_indexes_sum - 1); + size_t i; +#pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i]); + mad_row(&even[i + 2], &odd[i], a, b[i + 1]); + } + // merge |even| and |odd| + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { + const uint32_t* a = as.limbs; + const uint32_t* b = bs.limbs; + uint32_t* r = rs.limbs; + for (unsigned i = 0; i < TLC; i++) { + uint32_t carry = 0; + for (unsigned j = 0; j < TLC; j++) + r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry); + r[TLC + i] = carry; + } + } + + static HOST_DEVICE_INLINE void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { +#ifdef __CUDA_ARCH__ + return multiply_raw_device(as, bs, rs); +#else + return multiply_raw_host(as, bs, rs); +#endif + } + + static HOST_DEVICE_INLINE void multiply_raw_lsb(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { +#ifdef __CUDA_ARCH__ + return multiply_lsb_raw_device(as, bs, rs); +#else + return multiply_raw_host(as, bs, rs); +#endif + } + + static HOST_DEVICE_INLINE void multiply_raw_msb(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs) + { +#ifdef __CUDA_ARCH__ + return multiply_raw_device(as, bs, rs); +#else + return multiply_raw_host(as, bs, rs); +#endif + } + +public: + ff_storage limbs_storage; + + HOST_DEVICE_INLINE uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; } + + HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) + { + const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32; + const uint32_t shift_bits = (digit_num * digit_width) % 32; + unsigned rv = limbs_storage.limbs[limb_lsb_idx] >> shift_bits; + if ((shift_bits + digit_width > 32) && (limb_lsb_idx + 1 < TLC)) { + rv += limbs_storage.limbs[limb_lsb_idx + 1] << (32 - shift_bits); + } + rv &= ((1 << digit_width) - 1); + return rv; + } + + static HOST_INLINE Field rand_host() + { + std::random_device rd; + std::mt19937_64 generator(rd()); + std::uniform_int_distribution distribution; + Field value{}; + for (unsigned i = 0; i < TLC; i++) + value.limbs_storage.limbs[i] = distribution(generator); + while (lt(modulus(), 
value))
+      value = value - modulus();
+    return value;
+  }
+
+  template <unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE Field sub_modulus(const Field& xs)
+  {
+    if (REDUCTION_SIZE == 0) return xs;
+    const ff_storage modulus = get_modulus<REDUCTION_SIZE>();
+    Field rs = {};
+    return sub_limbs(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const Field& xs)
+  {
+    std::stringstream hex_string;
+    hex_string << std::hex << std::setfill('0');
+
+    for (int i = 0; i < TLC; i++) {
+      hex_string << std::setw(8) << xs.limbs_storage.limbs[TLC - 1 - i]; // most-significant limb first
+    }
+
+    os << "0x" << hex_string.str();
+    return os;
+  }
+
+  friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys)
+  {
+    Field rs = {};
+    add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
+    return sub_modulus<1>(rs);
+  }
+
+  friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys)
+  {
+    Field rs = {};
+    uint32_t carry = sub_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
+    if (carry == 0) return rs;
+    const ff_storage modulus = get_modulus<1>();
+    add_limbs(rs.limbs_storage, modulus, rs.limbs_storage);
+    return rs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Wide mul_wide(const Field& xs, const Field& ys)
+  {
+    Wide rs = {};
+    multiply_raw(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
+    return rs;
+  }
+
+  static constexpr DEVICE_INLINE uint32_t
+  sub_limbs_partial_device(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs)
+  {
+    r[0] = ptx::sub_cc(x[0], y[0]);
+#pragma unroll
+    for (unsigned i = 1; i < num_limbs; i++)
+      r[i] = ptx::subc_cc(x[i], y[i]);
+    return ptx::subc(0, 0);
+  }
+
+  static constexpr HOST_DEVICE_INLINE uint32_t
+  sub_limbs_partial(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs)
+  {
+#ifdef __CUDA_ARCH__
+    return sub_limbs_partial_device(x, y, r, num_limbs);
+#else
+    return sub_limbs_partial_host(x, y, r, num_limbs);
+#endif
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs)
+  {
+    Field xs_hi = Wide::get_higher_with_slack(xs); // xy << slack_bits
+    Wide l = {};
+    multiply_raw_msb(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult
+    Field l_hi = Wide::get_higher_with_slack(l);
+    Wide lp = {};
+    multiply_raw_lsb(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); // LSB mult
+    Wide r_wide = xs - lp;
+    Wide r_wide_reduced = {};
+    for (unsigned i = 0; i < TLC + 1; i++) {
+      uint32_t carry = sub_limbs_partial(
+        r_wide.limbs_storage.limbs, modulus_wide().limbs, r_wide_reduced.limbs_storage.limbs, TLC + 1);
+      if (carry == 0) // continue to reduce
+        r_wide = r_wide_reduced;
+      else // done
+        break;
+    }
+
+    // the number of wrap-arounds is bounded by TLC + 1
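+    // This is a Barrett-style reduction: with N = NBITS and m = floor(2^(2N) / p) precomputed in
+    // CONFIG::m, the two truncated products above form an estimate l_hi of floor(xy / p) that can
+    // only undershoot by a small amount, so r_wide = xy - l_hi * p is congruent to xy mod p and
+    // only a few conditional subtractions of p are needed to land in [0, p).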
+    Field r = Wide::get_lower(r_wide);
+    return r;
+  }
+
+  friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
+  {
+    Wide xy = mul_wide(xs, ys); // full mult
+    return reduce(xy);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys)
+  {
+#ifdef __CUDA_ARCH__
+    const uint32_t* x = xs.limbs_storage.limbs;
+    const uint32_t* y = ys.limbs_storage.limbs;
+    uint32_t limbs_or = x[0] ^ y[0];
+#pragma unroll
+    for (unsigned i = 1; i < TLC; i++)
+      limbs_or |= x[i] ^ y[i];
+    return limbs_or == 0;
+#else
+    for (unsigned i = 0; i < TLC; i++)
+      if (xs.limbs_storage.limbs[i] != ys.limbs_storage.limbs[i]) return false;
+    return true;
+#endif
+  }
+
+  friend HOST_DEVICE_INLINE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
+
+  template <const Field& multiplier>
+  static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
+  {
+    Field mul = multiplier;
+    static bool is_u32 = true;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (unsigned i = 1; i < TLC; i++)
+      is_u32 &= (mul.limbs_storage.limbs[i] == 0);
+
+    if (is_u32) return mul_unsigned<multiplier.limbs_storage.limbs[0]>(xs);
+    return mul * xs;
+  }
+
+  template <unsigned multiplier, class T>
+  static constexpr HOST_DEVICE_INLINE T mul_unsigned(const T& xs)
+  {
+    T rs = {};
+    T temp = xs;
+    bool is_zero = true;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (unsigned i = 0; i < 32; i++) {
+      if (multiplier & (1 << i)) {
+        rs = is_zero ? temp : (rs + temp);
+        is_zero = false;
+      }
+      if ((multiplier >> i) <= 1) break; // no higher bits of the multiplier remain
+      temp = temp + temp;
+    }
+    return rs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Wide sqr_wide(const Field& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return mul_wide(xs, xs);
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Field sqr(const Field& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return xs * xs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE Field neg(const Field& xs)
+  {
+    const ff_storage modulus = get_modulus<MODULUS_MULTIPLE>();
+    Field rs = {};
+    sub_limbs(modulus, xs.limbs_storage, rs.limbs_storage);
+    return rs;
+  }
+
+  template <unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE Field div2(const Field& xs)
+  {
+    const uint32_t* x = xs.limbs_storage.limbs;
+    Field rs = {};
+    uint32_t* r = rs.limbs_storage.limbs;
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+    for (unsigned i = 0; i < TLC - 1; i++) {
+#ifdef __CUDA_ARCH__
+      r[i] = __funnelshift_rc(x[i], x[i + 1], 1);
+#else
+      r[i] = (x[i] >> 1) | (x[i + 1] << 31);
+#endif
+    }
+    r[TLC - 1] = x[TLC - 1] >> 1;
+    return sub_modulus<REDUCTION_SIZE>(rs);
+  }
+
+  static constexpr HOST_DEVICE_INLINE bool lt(const Field& xs, const Field& ys)
+  {
+    ff_storage dummy = {};
+    uint32_t carry = sub_limbs(xs.limbs_storage, ys.limbs_storage, dummy);
+    return carry;
+  }
+
+  static constexpr HOST_DEVICE_INLINE bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }
+
+  static constexpr HOST_DEVICE_INLINE bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }
+
+  // inverse assumes that xs is nonzero
+  static constexpr HOST_DEVICE_INLINE Field inverse(const Field& xs)
+  {
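+    // Binary extended Euclidean algorithm: the loop keeps the invariants
+    //   b * xs ≡ u (mod p)  and  c * xs ≡ v (mod p).
+    // Halving an even u (or v) requires halving b (or c) mod p, adding p first when it is odd;
+    // once u or v reaches one, the matching b or c holds xs^-1.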
+    constexpr Field one = Field{CONFIG::one};
+    constexpr ff_storage modulus = CONFIG::modulus;
+    Field u = xs;
+    Field v = Field{modulus};
+    Field b = one;
+    Field c = {};
+    while (!(u == one) && !(v == one)) {
+      while (is_even(u)) {
+        u = div2(u);
+        if (is_odd(b)) add_limbs(b.limbs_storage, modulus, b.limbs_storage);
+        b = div2(b);
+      }
+      while (is_even(v)) {
+        v = div2(v);
+        if (is_odd(c)) add_limbs(c.limbs_storage, modulus, c.limbs_storage);
+        c = div2(c);
+      }
+      if (lt(v, u)) {
+        u = u - v;
+        b = b - c;
+      } else {
+        v = v - u;
+        c = c - b;
+      }
+    }
+    return (u == one) ? b : c;
+  }
+};
diff --git a/src/aztec/gpu/headers/msm.h b/src/aztec/gpu/headers/msm.h
new file mode 100644
index 00000000..88c4bf34
--- /dev/null
+++ b/src/aztec/gpu/headers/msm.h
@@ -0,0 +1,94 @@
+
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <cuda_runtime.h> // for cudaStream_t
+#include <stdbool.h>
+#include <stddef.h> // for size_t
+// msm.h
+
+#ifndef _BN254_MSM_H
+#define _BN254_MSM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Incomplete declaration of BN254 projective and affine structs
+typedef struct BN254_projective_t BN254_projective_t;
+typedef struct BN254_g2_projective_t BN254_g2_projective_t;
+typedef struct BN254_affine_t BN254_affine_t;
+typedef struct BN254_g2_affine_t BN254_g2_affine_t;
+typedef struct BN254_scalar_t BN254_scalar_t;
+typedef cudaStream_t CudaStream_t;
+
+int msm_cuda_bn254(
+  BN254_projective_t* out, BN254_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
+
+int msm_batch_cuda_bn254(
+  BN254_projective_t* out,
+  BN254_affine_t* points,
+  BN254_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+
+int commit_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+
+int commit_batch_cuda_bn254(
+  BN254_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id);
+
+int msm_g2_cuda_bn254(
+  BN254_g2_projective_t* out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
+int msm_batch_g2_cuda_bn254(
+  BN254_g2_projective_t* out,
+  BN254_g2_affine_t* points,
+  BN254_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+int commit_g2_cuda_bn254(
+  BN254_g2_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+int commit_batch_g2_cuda_bn254(
+  BN254_g2_projective_t* d_out,
+  BN254_scalar_t* d_scalars,
+  BN254_g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id,
+  cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BN254_MSM_H */
diff --git a/src/aztec/gpu/headers/ntt.h b/src/aztec/gpu/headers/ntt.h
new file mode 100644
index 00000000..4ca148e3
--- /dev/null
+++ b/src/aztec/gpu/headers/ntt.h
@@ -0,0 +1,193 @@
+
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ntt.h + +#ifndef _BN254_NTT_H +#define _BN254_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BN254 projective and affine structs +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_affine_t BN254_affine_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +typedef struct BN254_g2_projective_t BN254_g2_projective_t; +typedef struct BN254_g2_affine_t BN254_g2_affine_t; + +int ntt_cuda_bn254(BN254_scalar_t* arr, uint32_t n, bool inverse, size_t device_id); +int ntt_batch_cuda_bn254(BN254_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_bn254(BN254_projective_t* arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_bn254( + BN254_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +BN254_scalar_t* +build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream); +int interpolate_scalars_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + unsigned device_id, + size_t stream); +int interpolate_scalars_batch_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int interpolate_points_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + size_t device_id, + size_t stream); +int interpolate_points_batch_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int interpolate_scalars_on_coset_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int interpolate_scalars_batch_on_coset_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_evaluations, + BN254_scalar_t* d_domain, + unsigned n, + unsigned batch_size, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int evaluate_scalars_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned device_id, + size_t stream); +int evaluate_scalars_batch_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int evaluate_points_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + size_t device_id, + size_t stream); +int evaluate_points_batch_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + size_t device_id, + size_t stream); +int evaluate_scalars_on_coset_cuda_bn254( + BN254_scalar_t* d_out, + BN254_scalar_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + BN254_scalar_t* coset_powers, + unsigned device_id, + size_t stream); +int evaluate_scalars_on_coset_batch_cuda_bn254( 
+ BN254_scalar_t* d_out, + BN254_scalar_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int evaluate_points_on_coset_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int evaluate_points_on_coset_batch_cuda_bn254( + BN254_projective_t* d_out, + BN254_projective_t* d_coefficients, + BN254_scalar_t* d_domain, + unsigned domain_size, + unsigned n, + unsigned batch_size, + BN254_scalar_t* coset_powers, + size_t device_id, + size_t stream); +int reverse_order_scalars_cuda_bn254(BN254_scalar_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_scalars_batch_cuda_bn254(BN254_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int reverse_order_points_cuda_bn254(BN254_projective_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_points_batch_cuda_bn254( + BN254_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int add_scalars_cuda_bn254( + BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream); +int sub_scalars_cuda_bn254( + BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream); +int to_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream); +int from_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream); + +// points g1 +int to_montgomery_proj_points_cuda_bn254(BN254_projective_t* d_inout, unsigned n, size_t stream); +int from_montgomery_proj_points_cuda_bn254(BN254_projective_t* d_inout, unsigned n, size_t stream); +int to_montgomery_aff_points_cuda_bn254(BN254_affine_t* d_inout, unsigned n, size_t stream); +int from_montgomery_aff_points_cuda_bn254(BN254_affine_t* d_inout, unsigned n, size_t stream); + +// points g2 +int to_montgomery_proj_points_g2_cuda_bn254(BN254_g2_projective_t* d_inout, unsigned n, size_t stream); +int from_montgomery_proj_points_g2_cuda_bn254(BN254_g2_projective_t* d_inout, unsigned n, size_t stream); +int to_montgomery_aff_points_g2_cuda_bn254(BN254_g2_affine_t* d_inout, unsigned n, size_t stream); +int from_montgomery_aff_points_g2_cuda_bn254(BN254_g2_affine_t* d_inout, unsigned n, size_t stream); + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_NTT_H */ diff --git a/src/aztec/gpu/headers/params.cuh b/src/aztec/gpu/headers/params.cuh new file mode 100644 index 00000000..cf7f3e5f --- /dev/null +++ b/src/aztec/gpu/headers/params.cuh @@ -0,0 +1,188 @@ +#pragma once +#include "../utils/storage.cuh" + +namespace PARAMS_BN254 { + struct fp_config { + static constexpr unsigned limbs_count = 8; + static constexpr unsigned omegas_count = 28; + static constexpr unsigned modulus_bit_count = 254; + + static constexpr storage modulus = {0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, + 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; + static constexpr storage modulus_2 = {0xe0000002, 0x87c3eb27, 0xf372e122, 0x5067d090, + 0x0302b0ba, 0x70a08b6d, 0xc2634053, 0x60c89ce5}; + static constexpr storage modulus_4 = {0xc0000004, 0x0f87d64f, 0xe6e5c245, 0xa0cfa121, + 0x06056174, 0xe14116da, 0x84c680a6, 0xc19139cb}; + static constexpr storage<2 * limbs_count> modulus_wide = { + 0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72, + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage<2 * limbs_count> modulus_squared = { + 0xe0000001, 0x08c3eb27, 0xdcb34000, 0xc7f26223, 0x68c9bb7f, 0xffe9a62c, 0xe821ddb0, 0xa6ce1975, + 0x47b62fe7, 0x2c77527b, 0xd379d3df, 0x85f73bb0, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8}; + static constexpr storage<2 * limbs_count> modulus_squared_2 = { + 0xc0000002, 0x1187d64f, 0xb9668000, 0x8fe4c447, 0xd19376ff, 0xffd34c58, 0xd043bb61, 0x4d9c32eb, + 0x8f6c5fcf, 0x58eea4f6, 0xa6f3a7be, 0x0bee7761, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970}; + static constexpr storage<2 * limbs_count> modulus_squared_4 = { + 0x80000004, 0x230fac9f, 0x72cd0000, 0x1fc9888f, 0xa326edff, 0xffa698b1, 0xa08776c3, 0x9b3865d7, + 0x1ed8bf9e, 0xb1dd49ed, 0x4de74f7c, 0x17dceec3, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1}; + + static constexpr storage m = {0xbe1de925, 0x620703a6, 0x09e880ae, 0x71448520, + 0x68073014, 0xab074a58, 0x623a04a7, 0x54a47462}; + static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage montgomery_r = {0x4ffffffb, 0xac96341c, 0x9f60cd29, 0x36fc7695, + 0x7879462e, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; + static constexpr storage montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x90ef5a9, + 0xaeb85d5d, 0xc8260de4, 0x82c5551c, 0x15ebf951}; + + static constexpr storage_array omega = { + {{0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, + {0x8f703636, 0x23120470, 0xfd736bec, 0x5cea24f6, 0x3fd84104, 0x048b6e19, 0xe131a029, 0x30644e72}, + {0xc1bd5e80, 0x948dad4a, 0xf8170a0a, 0x52627366, 0x96afef36, 0xec9b9e2f, 0xc8c14f22, 0x2b337de1}, + {0xe306460b, 0xb11509c6, 0x174efb98, 0x996dfbe1, 0x94dd508c, 0x1c6e4f45, 0x16cbbf4e, 0x21082ca2}, + {0x3bb512d0, 0x3eed4c53, 0x838eeb1d, 0x9c18d51b, 0x47c0b2a9, 0x9678200d, 0x306b93d2, 0x09c532c6}, + {0x118f023a, 0xdb94fb05, 0x26e324be, 0x46a6cb24, 0x49bdadf2, 0xc24cdb76, 0x5b080fca, 0x1418144d}, + {0xba9d1811, 0x9d0e470c, 0xb6f24c79, 0x1dcb5564, 0xe85943e0, 0xdf5ce19c, 0xad310991, 0x16e73dfd}, + {0x74a57a76, 0xc8936191, 0x6750f230, 0x61794254, 0x9f36ffb0, 0xf086204a, 0xa6148404, 0x07b0c561}, + {0x470157ce, 0x893a7fa1, 0xfc782d75, 0xe8302a41, 0xdd9b0675, 0xffc02c0e, 0xf6e72f5b, 0x0f1ded1e}, + {0xbc2e5912, 0x11f995e1, 0xa8d2d7ab, 0x39ba79c0, 0xb08771e3, 0xebbebc2b, 0x7017a420, 0x06fd19c1}, + {0x769a2ee2, 0xd00a58f9, 0x7494f0ca, 0xb8c12c17, 0xa5355d71, 0xb4027fd7, 0x99c5042b, 0x027a3584}, + {0x0042d43a, 0x1c477572, 0x6f039bb9, 0x76f169c7, 0xfd5a90a9, 0x01ddd073, 0xde2fd10f, 0x0931d596}, + {0x9bbdd310, 0x4aa49b8d, 0x8e3a2d76, 0xd31bf3e2, 0x78b2667b, 0x001deac8, 0xb869ae62, 0x006fab49}, + {0x617c6e85, 0xadaa01c2, 0x7420aae6, 0xb4a93ee1, 0x0ddca8a8, 0x1f4e51b8, 0xcdd9e481, 0x2d965651}, + {0x4e26ecfb, 0xa93458fd, 0x4115a009, 0x022a2a2d, 0x69ec2bd0, 0x017171fa, 0x5941dc91, 0x2d1ba66f}, + {0xdaac43b7, 0xd1628ba2, 0xe4347e7d, 0x16c8601d, 0xe081dcff, 0x649abebd, 0x5981ed45, 0x00eeb2cb}, + {0xce8f58e5, 0x276e5858, 0x5655210e, 0x0512eca9, 0xe70e61f3, 0xc3708cc6, 0xa7d74902, 0x1bf82deb}, + {0x7dcdc0e0, 0x84c6bfa5, 0x13f4d1bd, 0xc57088ff, 0xb5b95e4d, 0x5c0176fb, 0x3a8d46c1, 0x19ddbcaf}, + {0x613f6cbd, 0x5c1d597f, 0x8357473a, 0x30525841, 0x968e4915, 0x51829353, 0x844bca52, 0x2260e724}, + {0x53337857, 0x53422da9, 0xdbed349f, 
0xac616632, 0x06d1e303, 0x27508aba, 0x0a0ed063, 0x26125da1}, + {0xfcd0b523, 0xb2c87885, 0xca5a5ce3, 0x58f50577, 0x8598fc8c, 0x4222150e, 0xae2bdd1a, 0x1ded8980}, + {0xa219447e, 0xa76dde56, 0x359eebbb, 0xec1a1f05, 0x8be08215, 0xcda0ceb6, 0xb1f8d9a7, 0x1ad92f46}, + {0xab80c59d, 0xb54d4506, 0x22dd991f, 0x5680c640, 0xbc23a139, 0x6b7bcf70, 0x5ab4c74d, 0x0210fe63}, + {0xe32b045b, 0x1c25f1e3, 0x2e832696, 0x145e0db8, 0x71c6441f, 0x852e2a03, 0x845d50d2, 0x0c9fabc7}, + {0xb878331a, 0xeccd4f3e, 0x8dc6d26e, 0x7b26b748, 0xd9130cd4, 0xa19b0361, 0x326341ef, 0x2a734ebb}, + {0x2f4e9212, 0x1c79bd57, 0x3d68f9ae, 0x605b52b6, 0xb8d89d4a, 0x0113eff9, 0xf1ff73b2, 0x1067569a}, + {0x80928c44, 0x034afc45, 0xf6437da2, 0xb4823532, 0x6dc6e364, 0x5f256a9f, 0xb363ebe8, 0x049ae702}, + {0x725b19f0, 0x9bd61b6e, 0x41112ed4, 0x402d111e, 0x8ef62abc, 0x00e0a7eb, 0xa58a7e85, 0x2a3c09f0}}}; + + static constexpr storage_array omega_inv = { + {{0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, + {0x608fc9cb, 0x20cff123, 0x7c4604a5, 0xcb49c351, 0x41a91758, 0xb3c4d79d, 0x00000000, 0x00000000}, + {0x07b95a9b, 0x8b11d9ab, 0x41671f56, 0x20710ead, 0x30f81dee, 0xfb3acaee, 0x9778465c, 0x130b1711}, + {0x373428de, 0xb85a71e6, 0xaeb0337e, 0x74954d30, 0x303402b7, 0x2bfc85eb, 0x409556c0, 0x02e40daf}, + {0xf210979d, 0x8c99980c, 0x34905b4d, 0xef8f3113, 0xdf25d8e7, 0x0aeaf3e7, 0x03bfbd79, 0x27247136}, + {0x763d698f, 0x78ce6a0b, 0x1d3213ee, 0xd80396ec, 0x67a8a676, 0x035cdc75, 0xb2a13d3a, 0x26177cf2}, + {0xc64427d7, 0xdddf985f, 0xa49e95bd, 0xaa4f964a, 0x5def8b04, 0x427c045f, 0x7969b732, 0x1641c053}, + {0x0329f5d6, 0x692c553d, 0x8712848a, 0xa54cf8c6, 0x38e2b5e6, 0x64751ad9, 0x7422fad3, 0x204bd327}, + {0xaf6b3e4e, 0x52f26c0f, 0xf0bcc0c8, 0x4c277a07, 0xe4fcfcab, 0x546875d5, 0xaa9995b3, 0x09d8f821}, + {0xb2e5cc71, 0xcaa2e1e9, 0x6e43404e, 0xed42b68e, 0x7a2c7f0a, 0x6ed80915, 0xde3c86d6, 0x1c4042c7}, + {0x579d71ae, 0x20a3a65d, 0x0adc4420, 0xfd7efed8, 0xfddabf54, 0x3bb6dcd7, 0xbc73d07b, 0x0fa9bb21}, + {0xc79e0e57, 0xb6f70f8d, 0xa04e05ac, 0x269d3fde, 0x2ba088d9, 0xcf2e371c, 0x11b88d9c, 0x1af864d2}, + {0xabd95dc9, 0x3b0b205a, 0x978188ca, 0xc8df74fa, 0x6a1cb6c8, 0x08e124db, 0xbfac6104, 0x1670ed58}, + {0x641c8410, 0xf8eee934, 0x677771c0, 0xf40976b0, 0x558e6e8c, 0x11680d42, 0x06e7e9e9, 0x281c036f}, + {0xb2dbc0b4, 0xc92a742f, 0x4d384e68, 0xc3f02842, 0x2fa43d0d, 0x22701b6f, 0xe4590b37, 0x05d33766}, + {0x02d842d4, 0x922d5ac8, 0xc830e4c6, 0x91126414, 0x082f37e0, 0xe92338c0, 0x7fe704e8, 0x0b5d56b7}, + {0xd96f0d22, 0x20e75251, 0x6bd4e8c9, 0xc01c7f08, 0xf9dd50c4, 0x37d8b00b, 0xc43ca872, 0x244cf010}, + {0x66c5174c, 0x7a823174, 0x22d5ad70, 0x7dbe118c, 0x111119c5, 0xf8d7c71d, 0x83780e87, 0x036853f0}, + {0xca535321, 0xd98f9924, 0xe66e6c81, 0x22dbc0ef, 0x664ae1b7, 0xa15cf806, 0xa314fb67, 0x06e402c0}, + {0xe26c91f3, 0x0852a8fd, 0x3baca626, 0x521f45cb, 0x2c51bfca, 0xab6473bc, 0x2100895f, 0x100c332d}, + {0xa376d0f0, 0xf5fac783, 0x940797d3, 0x50fd246e, 0x145f5278, 0xab14ecc1, 0x41091b14, 0x19c6dfb8}, + {0x7faa1396, 0x43dc52e2, 0x4beced23, 0xd437be9d, 0x6d3c38c3, 0xecc11e9c, 0x0c74a876, 0x2eb58439}, + {0xd69ca83b, 0x811b03e7, 0xa1a6eadf, 0x126a786b, 0x4e2b8e61, 0x1dd75c9f, 0xbda6792b, 0x2165a1a5}, + {0x110b737b, 0x02e1d4d1, 0xb323a164, 0x7be1488d, 0x9cd06163, 0xa334d317, 0xdb50e9cd, 0x2710c370}, + {0x9550fe47, 0x45d2f3cb, 0xf6a8efc4, 0x5f43327b, 0xe993ee18, 0x5bcd0d50, 0xb21de952, 0x27f035bd}, + {0x232e3983, 0x1d63cbae, 0xaa1b58e2, 0xac815161, 0x6aeb019e, 0x531f42a5, 0x03ca2ef5, 0x2dcd51d9}, + {0x980db869, 0xa8b64ba8, 0xc9718f6c, 
0x4c787f72, 0x15d27ced, 0x7746a25a, 0x435a46e9, 0x110bf78f}, + {0x9d18157e, 0x72394277, 0xfd399d5d, 0xec9d51f8, 0x49d5387f, 0x6117635d, 0x9c229cd5, 0x01b77519}}}; + + static constexpr storage_array inv = { + {{0xf8000001, 0xa1f0fac9, 0x3cdcb848, 0x9419f424, 0x40c0ac2e, 0xdc2822db, 0x7098d014, 0x18322739}, + {0xf4000001, 0xf2e9782e, 0x5b4b146c, 0xde26ee36, 0xe1210245, 0x4a3c3448, 0x28e5381f, 0x244b3ad6}, + {0x72000001, 0x1b65b6e1, 0x6a82427f, 0x832d6b3f, 0xb1512d51, 0x81463cff, 0x850b6c24, 0x2a57c4a4}, + {0xb1000001, 0x2fa3d63a, 0xf21dd988, 0x55b0a9c3, 0x196942d7, 0x1ccb415b, 0xb31e8627, 0x2d5e098b}, + {0x50800001, 0xb9c2e5e7, 0x35eba50c, 0x3ef24906, 0xcd754d9a, 0x6a8dc388, 0x4a281328, 0x2ee12bff}, + {0xa0400001, 0xfed26dbd, 0x57d28ace, 0xb39318a7, 0xa77b52fb, 0x116f049f, 0x15acd9a9, 0x2fa2bd39}, + {0xc8200001, 0x215a31a8, 0xe8c5fdb0, 0x6de38077, 0x147e55ac, 0x64dfa52b, 0xfb6f3ce9, 0x300385d5}, + {0x5c100001, 0xb29e139e, 0x313fb720, 0xcb0bb460, 0xcaffd704, 0x8e97f570, 0x6e506e89, 0x3033ea24}, + {0x26080001, 0xfb400499, 0x557c93d8, 0xf99fce54, 0xa64097b0, 0xa3741d93, 0xa7c10759, 0x304c1c4b}, + {0x8b040001, 0x1f90fd16, 0x679b0235, 0x10e9db4e, 0x13e0f807, 0xade231a5, 0x447953c1, 0x3058355f}, + {0x3d820001, 0x31b97955, 0x70aa3963, 0x1c8ee1cb, 0xcab12832, 0xb3193bad, 0x12d579f5, 0x305e41e9}, + {0x96c10001, 0x3acdb774, 0xf531d4fa, 0xa2616509, 0x26194047, 0xb5b4c0b2, 0xfa038d0f, 0x3061482d}, + {0x43608001, 0xbf57d684, 0x3775a2c5, 0x654aa6a9, 0x53cd4c52, 0xb7028334, 0x6d9a969c, 0x3062cb50}, + {0x19b04001, 0x819ce60c, 0xd89789ab, 0xc6bf4778, 0x6aa75257, 0x37a96475, 0xa7661b63, 0x30638ce1}, + {0x04d82001, 0x62bf6dd0, 0xa9287d1e, 0x777997e0, 0xf614555a, 0x77fcd515, 0x444bddc6, 0x3063edaa}, + {0xfa6c1001, 0xd350b1b1, 0x9170f6d7, 0xcfd6c014, 0x3bcad6db, 0x18268d66, 0x92bebef8, 0x30641e0e}, + {0xf5360801, 0x8b9953a2, 0x859533b4, 0x7c05542e, 0x5ea6179c, 0xe83b698e, 0xb9f82f90, 0x30643640}, + {0x729b0401, 0xe7bda49b, 0x7fa75222, 0xd21c9e3b, 0x7013b7fc, 0x5045d7a2, 0xcd94e7dd, 0x30644259}, + {0xb14d8201, 0x15cfcd17, 0xfcb0615a, 0xfd284341, 0x78ca882c, 0x844b0eac, 0x57634403, 0x30644866}, + {0xd0a6c101, 0xacd8e155, 0x3b34e8f5, 0x12ae15c5, 0x7d25f045, 0x9e4daa31, 0x9c4a7216, 0x30644b6c}, + {0xe0536081, 0x785d6b74, 0xda772cc3, 0x1d70ff06, 0xff53a451, 0x2b4ef7f3, 0xbebe0920, 0x30644cef}, + {0x6829b041, 0x5e1fb084, 0xaa184eaa, 0x22d273a7, 0x406a7e57, 0xf1cf9ed5, 0x4ff7d4a4, 0x30644db1}, + {0x2c14d821, 0xd100d30c, 0x11e8df9d, 0x25832df8, 0xe0f5eb5a, 0x550ff245, 0x1894ba67, 0x30644e12}, + {0x0e0a6c11, 0x8a716450, 0x45d12817, 0xa6db8b20, 0x313ba1db, 0x86b01bfe, 0x7ce32d48, 0x30644e42}, + {0xff053609, 0x6729acf1, 0x5fc54c54, 0x6787b9b4, 0x595e7d1c, 0x1f8030da, 0xaf0a66b9, 0x30644e5a}, + {0xf7829b05, 0xd585d142, 0x6cbf5e72, 0xc7ddd0fe, 0x6d6feabc, 0x6be83b48, 0xc81e0371, 0x30644e66}, + {0x73c14d83, 0x0cb3e36b, 0x733c6782, 0xf808dca3, 0x7778a18c, 0x921c407f, 0xd4a7d1cd, 0x30644e6c}, + {0xb1e0a6c2, 0xa84aec7f, 0xf67aec09, 0x101e6275, 0xfc7cfcf5, 0xa536431a, 0xdaecb8fb, 0x30644e6f}}}; + }; + + struct fq_config { + static constexpr unsigned limbs_count = 8; + static constexpr unsigned modulus_bit_count = 254; + static constexpr storage modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, + 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; + static constexpr storage modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522, + 0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5}; + static constexpr storage modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45, + 0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb}; + 
static constexpr storage<2 * limbs_count> modulus_wide = { + 0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage<2 * limbs_count> modulus_squared = { + 0x275d69b1, 0x3b5458a2, 0x09eac101, 0xa602072d, 0x6d96cadc, 0x4a50189c, 0x7a1242c8, 0x04689e95, + 0x34c6b38d, 0x26edfa5c, 0x16375606, 0xb00b8551, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8}; + static constexpr storage<2 * limbs_count> modulus_squared_2 = { + 0x4ebad362, 0x76a8b144, 0x13d58202, 0x4c040e5a, 0xdb2d95b9, 0x94a03138, 0xf4248590, 0x08d13d2a, + 0x698d671a, 0x4ddbf4b8, 0x2c6eac0c, 0x60170aa2, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970}; + static constexpr storage<2 * limbs_count> modulus_squared_4 = { + 0x9d75a6c4, 0xed516288, 0x27ab0404, 0x98081cb4, 0xb65b2b72, 0x29406271, 0xe8490b21, 0x11a27a55, + 0xd31ace34, 0x9bb7e970, 0x58dd5818, 0xc02e1544, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1}; + static constexpr storage m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17, + 0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462}; + static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0xa78eb28, + 0x7879462c, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; + static constexpr storage montgomery_r_inv = {0x14afa37, 0xed84884a, 0x278edf8, 0xeb202285, + 0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571}; + + // i^2, the square of the imaginary unit for the extension field + static constexpr uint32_t i_squared = 1; + // true if i^2 is negative + static constexpr bool i_squared_is_negative = true; + // G1 and G2 generators + static constexpr storage g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4, + 0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef}; + static constexpr storage g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933, + 0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393}; + static constexpr storage g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769, + 0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5}; + static constexpr storage g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133, + 0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0}; + }; + + static constexpr storage weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage weierstrass_b_g2_re = { + 0x24a138e5, 0x3267e6dc, 0x59dbefa3, 0xb5b4c5e5, 0x1be06ac3, 0x81be1899, 0xceb8aaae, 0x2b149d40}; + static constexpr storage weierstrass_b_g2_im = { + 0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 0xcd2cafad, 0x3af0fed4, 0x009713b0}; +} // namespace PARAMS_BN254 diff --git a/src/aztec/gpu/headers/projective.cuh b/src/aztec/gpu/headers/projective.cuh new file mode 100644 index 00000000..41cfb77a --- /dev/null +++ b/src/aztec/gpu/headers/projective.cuh @@ -0,0 +1,166 @@ +#pragma once + +#include "affine.cuh" + +template +class Projective +{ + 
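// Homogeneous projective coordinates: (X, Y, Z) represents the affine point +
// (X/Z, Y/Z), with (0, 1, 0) as the identity (see zero() below). The addition +
// operators appear to follow the complete, branch-free formulas of Renes, +
// Costello and Batina for a = 0 short Weierstrass curves, so they are valid +
// for all inputs, including doublings and the identity. +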
friend Affine; + +public: + FF x; + FF y; + FF z; + + static HOST_DEVICE_INLINE Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; } + + static HOST_DEVICE_INLINE Affine to_affine(const Projective& point) + { + FF denom = FF::inverse(point.z); + return {point.x * denom, point.y * denom}; + } + + static HOST_DEVICE_INLINE Projective from_affine(const Affine& point) { return {point.x, point.y, FF::one()}; } + + static HOST_DEVICE_INLINE Projective generator() { return {FF::generator_x(), FF::generator_y(), FF::one()}; } + + static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; } + + friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2) + { + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF Z2 = p2.z; // < 2 + const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 + const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 + const FF t02 = Z1 * Z2; // t02 ← Z1 · Z2 < 2 + const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 + const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 + const FF t08 = Y1 + Z1; // t08 ← Y1 + Z1 < 4 + const FF t09 = Y2 + Z2; // t09 ← Y2 + Z2 < 4 + const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 + const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 + const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 + const FF t14 = X2 + Z2; // t14 ← X2 + Z2 < 4 + const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 + const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 + const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 + const FF t20 = FF::template mul_unsigned<3>(FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 + const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 + const FF t23 = FF::template mul_unsigned<3>(FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 ← t12 · t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 ← t07 · t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 ← t25 − t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 ← t23 · t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 ← t22 · t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 ← t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 ← t19 · t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 ← t21 · t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 ← t31 + t30 < 2 + return {X3, Y3, Z3}; + } + + friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); } + + friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine& p2) + { + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 + const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 + const FF t02 = Z1; // t02 ← Z1 < 2 + const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 + const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 + const FF t08 = Y1 
+ Z1; // t08 ← Y1 + Z1 < 4 + const FF t09 = Y2 + FF::one(); // t09 ← Y2 + 1 < 4 + const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 + const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 + const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 + const FF t14 = X2 + FF::one(); // t14 ← X2 + 1 < 4 + const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 + const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 + const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 + const FF t20 = FF::template mul_unsigned<3>(FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 + const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 + const FF t23 = FF::template mul_unsigned<3>(FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 ← t12 · t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 ← t07 · t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 ← t25 − t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 ← t23 · t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 ← t22 · t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 ← t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 ← t19 · t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 ← t21 · t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 ← t31 + t30 < 2 + return {X3, Y3, Z3}; + } + + friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine& p2) + { + return p1 + Affine::neg(p2); + } + + friend HOST_DEVICE_INLINE Projective operator*(SCALAR_FF scalar, const Projective& point) + { + Projective res = zero(); +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for (int i = 0; i < SCALAR_FF::NBITS; i++) { + if (i > 0) { res = res + res; } + if (scalar.get_scalar_digit(SCALAR_FF::NBITS - i - 1, 1)) { res = res + point; } + } + return res; + } + + friend HOST_DEVICE_INLINE bool operator==(const Projective& p1, const Projective& p2) + { + return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z); + } + + friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point) + { + os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }"; + return os; + } + + static HOST_DEVICE_INLINE bool is_zero(const Projective& point) + { + return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero(); + } + + static HOST_DEVICE_INLINE bool is_on_curve(const Projective& point) + { + if (is_zero(point)) return true; + bool eq_holds = + (FF::template mul_const(FF::sqr(point.z) * point.z) + FF::sqr(point.x) * point.x == + point.z * FF::sqr(point.y)); + return point.z != FF::zero() && eq_holds; + } + + static HOST_INLINE Projective rand_host() + { + SCALAR_FF rand_scalar = SCALAR_FF::rand_host(); + return rand_scalar * generator(); + } +}; diff --git a/src/aztec/gpu/headers/projective.h b/src/aztec/gpu/headers/projective.h new file mode 100644 index 00000000..0a5ec74f --- /dev/null +++ b/src/aztec/gpu/headers/projective.h @@ -0,0 +1,50 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// projective.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BN254_projective_t BN254_projective_t; +// typedef struct BN254_g2_projective_t BN254_g2_projective_t; +typedef struct BN254_affine_t BN254_affine_t; +typedef struct BN254_g2_affine_t BN254_g2_affine_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +bool projective_is_on_curve_bn254(BN254_projective_t* point1); + +int random_scalar_bn254(BN254_scalar_t* out); +int random_projective_bn254(BN254_projective_t* out); +BN254_projective_t* projective_zero_bn254(); +int projective_to_affine_bn254(BN254_affine_t* out, BN254_projective_t* point1); +int projective_from_affine_bn254(BN254_projective_t* out, BN254_affine_t* point1); + +int random_g2_projective_bn254(BN254_g2_projective_t* out); +int g2_projective_to_affine_bn254(BN254_g2_affine_t* out, BN254_g2_projective_t* point1); +int g2_projective_from_affine_bn254(BN254_g2_projective_t* out, BN254_g2_affine_t* point1); +bool g2_projective_is_on_curve_bn254(BN254_g2_projective_t* point1); + +bool eq_bn254(BN254_projective_t* point1, BN254_projective_t* point2); +bool eq_g2_bn254(BN254_g2_projective_t* point1, BN254_g2_projective_t* point2); + +#ifdef __cplusplus +} +#endif diff --git a/src/aztec/gpu/headers/ve_mod_mult.cuh b/src/aztec/gpu/headers/ve_mod_mult.cuh new file mode 100644 index 00000000..36859c49 --- /dev/null +++ b/src/aztec/gpu/headers/ve_mod_mult.cuh @@ -0,0 +1,135 @@ +#ifndef VEC_MULT +#define VEC_MULT +#pragma once +#include +#include + +#define MAX_THREADS_PER_BLOCK 256 + +/** + * Multiply the elements of an input array by a scalar in-place. + * @param arr input array. + * @param n size of arr. + * @param n_inv scalar of type S (scalar). 
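+ * @note The kernel parameter is actually named "scalar"; "n_inv" presumably
+ * refers to inverse-NTT normalization by n^{-1}. mont.cuh also routes the
+ * Montgomery conversion constants through this same kernel.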
+ */
+template <typename E, typename S>
+__global__ void template_normalize_kernel(E* arr, uint32_t n, S scalar)
+{
+  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (tid < n) { arr[tid] = scalar * arr[tid]; }
+}
+
+// TODO: headers for prototypes and .c .cpp .cu files for implementations
+template <typename E, typename S>
+__global__ void vectorModMult(S* scalar_vec, E* element_vec, E* result, size_t n_elements)
+{
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < n_elements) { result[tid] = scalar_vec[tid] * element_vec[tid]; }
+}
+
+template <typename E, typename S>
+int vector_mod_mult(S* vec_a, E* vec_b, E* result, size_t n_elements, cudaStream_t stream) // TODO: in place so no need for third result vector
+{
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)n_elements / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;
+
+  // Allocate memory on the device for the input vectors and the output vector
+  S* d_vec_a;
+  E *d_vec_b, *d_result;
+  cudaMallocAsync(&d_vec_a, n_elements * sizeof(S), stream);
+  cudaMallocAsync(&d_vec_b, n_elements * sizeof(E), stream);
+  cudaMallocAsync(&d_result, n_elements * sizeof(E), stream);
+
+  // Copy the input vectors from the host to the device
+  cudaMemcpyAsync(d_vec_a, vec_a, n_elements * sizeof(S), cudaMemcpyHostToDevice, stream);
+  cudaMemcpyAsync(d_vec_b, vec_b, n_elements * sizeof(E), cudaMemcpyHostToDevice, stream);
+
+  // Call the kernel to perform element-wise modular multiplication
+  vectorModMult<<<num_blocks, threads_per_block, 0, stream>>>(d_vec_a, d_vec_b, d_result, n_elements);
+
+  cudaMemcpyAsync(result, d_result, n_elements * sizeof(E), cudaMemcpyDeviceToHost, stream);
+
+  cudaFreeAsync(d_vec_a, stream);
+  cudaFreeAsync(d_vec_b, stream);
+  cudaFreeAsync(d_result, stream);
+
+  cudaStreamSynchronize(stream);
+  return 0;
+}
+
+template <typename E, typename S>
+int vector_mod_mult_device(
+  S* d_vec_a, E* d_vec_b, E* d_result, size_t n_elements) // TODO: in place so no need for third result vector
+{
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)n_elements / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;
+
+  // Call the kernel to perform element-wise modular multiplication
+  vectorModMult<<<num_blocks, threads_per_block>>>(d_vec_a, d_vec_b, d_result, n_elements);
+  return 0;
+}
+
+template <typename E, typename S>
+__global__ void batchVectorMult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size)
+{
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < n_scalars * batch_size) {
+    int scalar_id = tid % n_scalars;
+    element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];
+  }
+}
+
+template <typename E, typename S>
+int batch_vector_mult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size, cudaStream_t stream)
+{
+  // Set the grid and block dimensions
+  int NUM_THREADS = MAX_THREADS_PER_BLOCK;
+  int NUM_BLOCKS = (n_scalars * batch_size + NUM_THREADS - 1) / NUM_THREADS;
+  batchVectorMult<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(scalar_vec, element_vec, n_scalars, batch_size);
+  return 0;
+}
+
+template <typename E>
+__global__ void matrixVectorMult(E* matrix_elements, E* vector_elements, E* result, size_t dim)
+{
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < dim) {
+    result[tid] = E::zero();
+    for (int i = 0; i < dim; i++)
+      result[tid] = result[tid] + matrix_elements[tid * dim + i] * vector_elements[i];
+  }
+}
+
+template <typename E>
+int matrix_mod_mult(E* matrix_elements, E* vector_elements, E* result, size_t dim, cudaStream_t stream)
+{
+  // Set the grid and block dimensions
+  int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK);
+  int threads_per_block = MAX_THREADS_PER_BLOCK;
+
+  // Allocate
memory on the device for the input vectors, the output vector, and the modulus + E *d_matrix, *d_vector, *d_result; + cudaMallocAsync(&d_matrix, (dim * dim) * sizeof(E), stream); + cudaMallocAsync(&d_vector, dim * sizeof(E), stream); + cudaMallocAsync(&d_result, dim * sizeof(E), stream); + + // Copy the input vectors and the modulus from the host to the device + cudaMemcpyAsync(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice, stream); + + // Call the kernel to perform element-wise modular multiplication + matrixVectorMult<<>>(d_matrix, d_vector, d_result, dim); + + cudaMemcpyAsync(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost, stream); + + cudaFreeAsync(d_matrix, stream); + cudaFreeAsync(d_vector, stream); + cudaFreeAsync(d_result, stream); + + cudaStreamSynchronize(stream); + return 0; +} +#endif \ No newline at end of file diff --git a/src/aztec/gpu/headers/ve_mod_mult.h b/src/aztec/gpu/headers/ve_mod_mult.h new file mode 100644 index 00000000..fcd5753c --- /dev/null +++ b/src/aztec/gpu/headers/ve_mod_mult.h @@ -0,0 +1,45 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ve_mod_mult.h + +#ifndef _BN254_VEC_MULT_H +#define _BN254_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +int32_t +vec_mod_mult_point_bn254(BN254_projective_t* inout, BN254_scalar_t* scalar_vec, size_t n_elments, size_t device_id); +int32_t +vec_mod_mult_scalar_bn254(BN254_scalar_t* inout, BN254_scalar_t* scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_device_scalar_bn254( + BN254_scalar_t* inout, BN254_scalar_t* scalar_vec, size_t n_elements, size_t device_id); +int32_t matrix_vec_mod_mult_bn254( + BN254_scalar_t* matrix_flattened, BN254_scalar_t* input, BN254_scalar_t* output, size_t n_elments, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_VEC_MULT_H */ diff --git a/src/aztec/gpu/libbn254.so b/src/aztec/gpu/libbn254.so new file mode 100755 index 00000000..b9d7beb7 Binary files /dev/null and b/src/aztec/gpu/libbn254.so differ diff --git a/src/aztec/gpu/libbn254_old.so b/src/aztec/gpu/libbn254_old.so new file mode 100755 index 00000000..a42d2493 Binary files /dev/null and b/src/aztec/gpu/libbn254_old.so differ diff --git a/src/aztec/gpu/msm/CMakeLists.txt b/src/aztec/gpu/msm/CMakeLists.txt index 8c04f9ca..e7a2f0b4 100644 --- a/src/aztec/gpu/msm/CMakeLists.txt +++ b/src/aztec/gpu/msm/CMakeLists.txt @@ -3,10 +3,13 @@ project(cuda-barretenberg CUDA) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) +link_directories(/home/ubuntu/Cuda-Barretenberg/src/aztec/gpu) + include_directories(../bignum) include_directories(../fields) include_directories(../curves) include_directories(../srs) +include_directories(../headers) # 
Pippenger's bucket method add_executable( @@ -23,6 +26,8 @@ target_link_libraries( reference_string_library stdlib_primitives env + bn254 + dl ) add_custom_target( @@ -42,4 +47,5 @@ target_link_libraries( field_library curve_library reference_string_library -) \ No newline at end of file +) + diff --git a/src/aztec/gpu/msm/common.cuh b/src/aztec/gpu/msm/common.cuh index 3375abee..312830de 100644 --- a/src/aztec/gpu/msm/common.cuh +++ b/src/aztec/gpu/msm/common.cuh @@ -1,5 +1,5 @@ #include "reference_string.cu" -#include "util/thread_pool.hpp" +// #include "util/thread_pool.hpp" #include "error.cuh" #include #include diff --git a/src/aztec/gpu/msm/main.cu b/src/aztec/gpu/msm/main.cu index 747e74bb..3d0ccdca 100644 --- a/src/aztec/gpu/msm/main.cu +++ b/src/aztec/gpu/msm/main.cu @@ -1,48 +1,110 @@ -#include "pippenger.cu" +// #include "pippenger.cu" #include #include #include -#include -#include -#include +// #include +// #include +// #include #include #include +#include "msm.h" +#include "ntt.h" +#include "projective.h" +#include "ve_mod_mult.h" + +#include "affine.cuh" +#include "curve_config.cuh" +#include "fields.cuh" +#include "params.cuh" +#include "../utils/storage.cuh" +#include "../utils/sharedmem.cuh" +#include "../utils/ptx.cuh" +#include "../utils/objects.cuh" +#include "../utils/mont.cuh" +#include "../utils/host_math.cuh" +#include "../utils/cuda_utils.cuh" + +#include + using namespace std; -using namespace pippenger_common; -using namespace waffle; -using namespace barretenberg; +// using namespace pippenger_common; +// using namespace waffle; +// using namespace barretenberg; + +// std::ostream& operator<<(std::ostream& os, const BN254_projective_t& point) +// { +// os << "Point { x: " << point << "; y: " << point << "; z: " << point << " }"; +// return os; +// } int main(int, char**) { - // Initialize dynamic 'msm_t' object - msm_t *msm = new msm_t(); - // Construct elliptic curve points from SRS - auto reference_string = std::make_shared(NUM_POINTS, "../srs_db/ignition"); - g1::affine_element* points = reference_string->get_monomials(); - - // Construct random scalars - std::vector scalars; - scalars.reserve(NUM_POINTS); - for (size_t i = 0; i < NUM_POINTS; ++i) { - scalars.emplace_back(fr::random_element()); + void* lib_handle = dlopen("/home/ubuntu/Cuda-Barretenberg/src/aztec/gpu/libbn254.so", RTLD_LAZY); + + if (!lib_handle) { + std::cerr << "Error loading shared library: " << dlerror() << std::endl; + return 1; } + // Projective t {[1,2,3,4,5,6,7,8], [1,2,3,4,5,6,7,8], [1,2,3,4,5,6,7,8]}; + + typedef int (*AddFunction)(BN254_projective_t *); + AddFunction add_func = reinterpret_cast(dlsym(lib_handle, "projective_is_on_curve_bn254")); + + if (!add_func) { + std::cerr << "Error loading function: " << dlerror() << std::endl; + dlclose(lib_handle); // Close the library + return 1; + } + + BN254_projective_t *t; + std::cout << "Result of add(): " << add_func(t) << std::endl; + + dlclose(lib_handle); + + exit(0); + + // Initialize dynamic 'msm_t' object + // msm_t *msm = new msm_t(); + + // // Construct elliptic curve points from SRS + // auto reference_string = std::make_shared(NUM_POINTS, "../srs_db/ignition"); + // g1::affine_element* points = reference_string->get_monomials(); + + // // Construct random scalars + // std::vector scalars; + // scalars.reserve(NUM_POINTS); + // for (size_t i = 0; i < NUM_POINTS; ++i) { + // scalars.emplace_back(fr::random_element()); + // } + + // g1::affine_element* icicle_result; + // msm_cuda_bn254(icicle_result, points, &scalars[0], 
scalars.size(), 0); + // Number of streams - int num_streams = 1; + // int num_streams = 1; + + // // Initialize dynamic pippenger 'context' object + // Context *context = msm->pippenger_initialize(points, &scalars[0], num_streams, NUM_POINTS); + + // // Execute "Double-And-Add" reference kernel + // g1_gpu::element *result_1 = msm->msm_double_and_add(context, NUM_POINTS, points, &scalars[0]); + + // // Execute "Pippenger's Bucket Method" kernel + // g1_gpu::element **result_2 = msm->msm_bucket_method(context, points, &scalars[0], num_streams); - // Initialize dynamic pippenger 'context' object - Context *context = msm->pippenger_initialize(points, &scalars[0], num_streams, NUM_POINTS); + // // Print results + // context->pipp.print_result(result_1, result_2); - // Execute "Double-And-Add" reference kernel - g1_gpu::element *result_1 = msm->msm_double_and_add(context, NUM_POINTS, points, &scalars[0]); + // // Verify the final results are equal + // context->pipp.verify_result(result_1, result_2); +} - // Execute "Pippenger's Bucket Method" kernel - g1_gpu::element **result_2 = msm->msm_bucket_method(context, points, &scalars[0], num_streams); - // Print results - context->pipp.print_result(result_1, result_2); +// TODO: figure out how to create an instance of object (x, y, z) +// TODO: Figure out if problem is with shared library or something else leading to type resolving problems +// TODO: Once that works, see if allocating memory solves our problems - // Verify the final results are equal - context->pipp.verify_result(result_1, result_2); -} \ No newline at end of file +// TODO: Any changes to polynoial IOP wrapper that calls the MSM code +// TODO: Correctness checks diff --git a/src/aztec/gpu/msm/util/thread_pool.hpp b/src/aztec/gpu/msm/util/thread_pool.hpp deleted file mode 100644 index 52a15881..00000000 --- a/src/aztec/gpu/msm/util/thread_pool.hpp +++ /dev/null @@ -1,171 +0,0 @@ -#ifndef __THREAD_POOL_T_HPP__ -#define __THREAD_POOL_T_HPP__ - -#if __cplusplus < 201103L && !(defined(_MSVC_LANG) && _MSVC_LANG >= 201103L) -# error C++11 or later is required. 
-#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _GNU_SOURCE -# include -#endif - -using namespace std; - -class thread_pool_t { -private: - std::vector threads; - - std::mutex mtx; // Inter-thread synchronization - std::condition_variable cvar; - std::atomic done; - - typedef std::function job_t; - std::deque fifo; - -public: - thread_pool_t(unsigned int num_threads = 0) : done(false) - { - if (num_threads == 0) { - num_threads = std::thread::hardware_concurrency(); -#ifdef _GNU_SOURCE - cpu_set_t set; - if (sched_getaffinity(0, sizeof(set), &set) == 0) { - size_t i, n; - for (n = 0, i = num_threads; i--;) - n += CPU_ISSET(i, &set); - if (n != 0) - num_threads = n; - } -#endif - } - - threads.reserve(num_threads); - - for (unsigned int i = 0; i < num_threads; i++) - threads.push_back(std::thread([this]() { while (execute()); })); - } - - virtual ~thread_pool_t() - { - done = true; - cvar.notify_all(); - for (auto& tid : threads) - tid.join(); - } - - size_t size() { return threads.size(); } - - template void spawn(Workable work) - { - std::unique_lock lock(mtx); - fifo.emplace_back(job_t(work)); - cvar.notify_one(); // wake up a worker thread - } - -private: - bool execute() - { - job_t work; - { - std::unique_lock lock(mtx); - - while (!done && fifo.empty()) - cvar.wait(lock); - - if (done && fifo.empty()) - return false; - - work = fifo.front(); fifo.pop_front(); - } - work(); - - return true; - } - -public: - // call work(size_t idx) with idx=[0..num_items) in parallel, e.g. - // pool.par_map(20, [&](size_t i) { std::cout << i << std::endl; }); - template - void par_map(size_t num_items, Workable work, size_t max_workers = 0) - { - size_t num_workers = std::min(size(), num_items); - if (max_workers > 0) - num_workers = std::min(num_workers, max_workers); - - std::atomic counter(0); - std::atomic done(num_workers); - std::mutex b_mtx; - std::condition_variable barrier; - - while (num_workers--) { - spawn([&, num_items]() { - size_t idx; - while ((idx = counter.fetch_add(1, std::memory_order_relaxed)) - < num_items) - work(idx); - if (--done == 0) { - std::unique_lock lock(b_mtx); - barrier.notify_one(); - } - }); - } - - std::unique_lock lock(b_mtx); - barrier.wait(lock, [&] { return done == 0; }); - } -}; - -template class channel_t { -private: - std::deque fifo; - std::mutex mtx; - std::condition_variable cvar; - -public: - void send(const T& msg) - { - std::unique_lock lock(mtx); - fifo.push_back(msg); - cvar.notify_one(); - } - - T recv() - { - std::unique_lock lock(mtx); - cvar.wait(lock, [&] { return !fifo.empty(); }); - auto msg = fifo.front(); fifo.pop_front(); - return msg; - } -}; - -template class counter_t { - struct inner { - std::atomic val; - std::atomic ref_cnt; - inline inner(T v) { val = v, ref_cnt = 1; }; - }; - inner *ptr; -public: - counter_t(T v=0) { ptr = new inner(v); } - counter_t(const counter_t& r) - { (ptr = r.ptr)->ref_cnt.fetch_add(1, std::memory_order_relaxed); } - ~counter_t() - { - if (ptr->ref_cnt.fetch_sub(1, std::memory_order_seq_cst) == 1) - delete ptr; - } - size_t ref_cnt() const { return ptr->ref_cnt; } - T operator++(int) const { return ptr->val.fetch_add(1, std::memory_order_relaxed); } - T operator++() const { return ptr->val++ + 1; } - T operator--(int) const { return ptr->val.fetch_sub(1, std::memory_order_relaxed); } - T operator--() const { return ptr->val-- - 1; } -}; -#endif // __THREAD_POOL_T_HPP__ \ No newline at end of file diff --git a/src/aztec/gpu/plonk/plonk.cu 
b/src/aztec/gpu/plonk/plonk.cu index 94f83bc8..b494fd9c 100644 --- a/src/aztec/gpu/plonk/plonk.cu +++ b/src/aztec/gpu/plonk/plonk.cu @@ -40,4 +40,6 @@ int main(int, char**) { // Generate and verify proof plonk_proof proof = prover->construct_proof(); verifier.verify_proof(proof); + + cout << "verified proof!" << endl; } \ No newline at end of file diff --git a/src/aztec/gpu/plonk/queue_wrapper.cu b/src/aztec/gpu/plonk/queue_wrapper.cu index a7b1c347..637225be 100644 --- a/src/aztec/gpu/plonk/queue_wrapper.cu +++ b/src/aztec/gpu/plonk/queue_wrapper.cu @@ -19,23 +19,23 @@ void QueueWrapper::process_queue() { auto reference_string = std::make_shared(NUM_POINTS, "../srs_db"); g1::affine_element* pointss = reference_string->get_monomials(); - cout << "points is: " << pointss[0].x.data[0] << endl; - cout << "points is: " << pointss[0].x.data[1] << endl; - cout << "points is: " << pointss[0].x.data[2] << endl; - cout << "points is: " << pointss[0].x.data[3] << endl; - cout << "points is: " << pointss[0].y.data[0] << endl; - cout << "points is: " << pointss[0].y.data[1] << endl; - cout << "points is: " << pointss[0].y.data[2] << endl; - cout << "points is: " << pointss[0].y.data[3] << endl; + // cout << "points is: " << pointss[0].x.data[0] << endl; + // cout << "points is: " << pointss[0].x.data[1] << endl; + // cout << "points is: " << pointss[0].x.data[2] << endl; + // cout << "points is: " << pointss[0].x.data[3] << endl; + // cout << "points is: " << pointss[0].y.data[0] << endl; + // cout << "points is: " << pointss[0].y.data[1] << endl; + // cout << "points is: " << pointss[0].y.data[2] << endl; + // cout << "points is: " << pointss[0].y.data[3] << endl; - cout << "points is: " << pointss[1].x.data[0] << endl; - cout << "points is: " << pointss[1].x.data[1] << endl; - cout << "points is: " << pointss[1].x.data[2] << endl; - cout << "points is: " << pointss[1].x.data[3] << endl; - cout << "points is: " << pointss[1].y.data[0] << endl; - cout << "points is: " << pointss[1].y.data[1] << endl; - cout << "points is: " << pointss[1].y.data[2] << endl; - cout << "points is: " << pointss[1].y.data[3] << endl; + // cout << "points is: " << pointss[1].x.data[0] << endl; + // cout << "points is: " << pointss[1].x.data[1] << endl; + // cout << "points is: " << pointss[1].x.data[2] << endl; + // cout << "points is: " << pointss[1].x.data[3] << endl; + // cout << "points is: " << pointss[1].y.data[0] << endl; + // cout << "points is: " << pointss[1].y.data[1] << endl; + // cout << "points is: " << pointss[1].y.data[2] << endl; + // cout << "points is: " << pointss[1].y.data[3] << endl; // Naive implementation for quick testing // msm_t *msm = new msm_t(); @@ -45,7 +45,7 @@ void QueueWrapper::process_queue() { // Context *context = msm->pippenger_initialize(points); // g1_gpu::element *final_result_2 = msm->msm_bucket_method(context, item.mul_scalars, key->reference_string->get_monomials(), 1 << 10); - exit(0); + // exit(0); auto runtime_state = barretenberg::scalar_multiplication::pippenger_runtime_state(key->small_domain.size + 1); diff --git a/src/aztec/gpu/utils/cuda_utils.cuh b/src/aztec/gpu/utils/cuda_utils.cuh new file mode 100644 index 00000000..071e8d89 --- /dev/null +++ b/src/aztec/gpu/utils/cuda_utils.cuh @@ -0,0 +1,32 @@ +#pragma once +#include + +struct cuda_ctx { + int device_id; + cudaMemPool_t mempool; + cudaStream_t stream; + + cuda_ctx(int gpu_id) + { + gpu_id = gpu_id; + cudaMemPoolProps pool_props; + pool_props.allocType = cudaMemAllocationTypePinned; + pool_props.handleTypes = 
cudaMemHandleTypePosixFileDescriptor; + pool_props.location.type = cudaMemLocationTypeDevice; + pool_props.location.id = device_id; + + cudaMemPoolCreate(&mempool, &pool_props); + cudaStreamCreate(&stream); + } + + void set_device() { cudaSetDevice(device_id); } + + void sync_stream() { cudaStreamSynchronize(stream); } + + void malloc(void* ptr, size_t bytesize) { cudaMallocFromPoolAsync(&ptr, bytesize, mempool, stream); } + + void free(void* ptr) { cudaFreeAsync(ptr, stream); } +}; + +// -- Proposed Function Tops -------------------------------------------------- +// ---------------------------------------------------------------------------- diff --git a/src/aztec/gpu/utils/host_math.cuh b/src/aztec/gpu/utils/host_math.cuh new file mode 100644 index 00000000..73922b92 --- /dev/null +++ b/src/aztec/gpu/utils/host_math.cuh @@ -0,0 +1,96 @@ +#pragma once + +#include +#include + +namespace host_math { + + // return x + y with uint32_t operands + static __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; } + + // return x + y + carry with uint32_t operands + static __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; } + + // return x + y and carry out with uint32_t operands + static __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry) + { + uint32_t result; + result = x + y; + carry = x > result; + return result; + } + + // return x + y + carry and carry out with uint32_t operands + static __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry) + { + const uint32_t result = x + y + carry; + carry = carry && x >= result || !carry && x > result; + return result; + } + + // return x - y with uint32_t operands + static __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; } + + // return x - y - borrow with uint32_t operands + static __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; } + + // return x - y and borrow out with uint32_t operands + static __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow) + { + uint32_t result; + result = x - y; + borrow = x < result; + return result; + } + + // return x - y - borrow and borrow out with uint32_t operands + static __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow) + { + const uint32_t result = x - y - borrow; + borrow = borrow && x <= result || !borrow && x < result; + return result; + } + + // return x * y + z + carry and carry out with uint32_t operands + static __host__ uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry) + { + uint32_t result; + uint64_t r = static_cast(x) * y + z + carry; + carry = r >> 32; + result = r & 0xffffffff; + return result; + } + + template + struct carry_chain { + unsigned index; + + constexpr __host__ __forceinline__ carry_chain() : index(0) {} + + __host__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry) + { + index++; + if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT) + return host_math::add(x, y); + else if (index == 1 && !CARRY_IN) + return host_math::add_cc(x, y, carry); + else if (index < OPS_COUNT || CARRY_OUT) + return host_math::addc_cc(x, y, carry); + else + return host_math::addc(x, y, carry); + } + + __host__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry) + { + index++; + if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT) + return 
host_math::sub(x, y); + else if (index == 1 && !CARRY_IN) + return host_math::sub_cc(x, y, carry); + else if (index < OPS_COUNT || CARRY_OUT) + return host_math::subc_cc(x, y, carry); + else + return host_math::subc(x, y, carry); + } + }; +} // namespace host_math diff --git a/src/aztec/gpu/utils/mont.cuh b/src/aztec/gpu/utils/mont.cuh new file mode 100644 index 00000000..de90b604 --- /dev/null +++ b/src/aztec/gpu/utils/mont.cuh @@ -0,0 +1,27 @@ +#pragma once + +#include "../headers/ve_mod_mult.cuh" + +template +int convert_montgomery(E* d_inout, size_t n_elments, bool is_into, cudaStream_t stream) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n_elments + num_threads - 1) / num_threads; + E mont = is_into ? E::montgomery_r() : E::montgomery_r_inv(); + template_normalize_kernel<<>>(d_inout, n_elments, mont); + + return 0; // TODO: void with propper error handling +} + +template +int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream) +{ + return convert_montgomery(d_inout, n, true, stream); +} + +template +int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream) +{ + return convert_montgomery(d_inout, n, false, stream); +} \ No newline at end of file diff --git a/src/aztec/gpu/utils/objects.cuh b/src/aztec/gpu/utils/objects.cuh new file mode 100644 index 00000000..f3a5a655 --- /dev/null +++ b/src/aztec/gpu/utils/objects.cuh @@ -0,0 +1,63 @@ +#pragma once +template +class Element +{ +public: + int v; + __device__ __host__ Element() { v = 0; } + __device__ __host__ Element(int r) + { + v = r % F::q; + if (r == F::q) v = F::q; + } + __device__ __host__ Element operator+(Element const& obj) + { + Element res; + res.v = (v + obj.v) % F::q; + return res; + } + __device__ __host__ Element operator-(Element const& obj) + { + Element res; + res.v = (v - obj.v) % F::q; + if (res.v < 0) { res.v = F::q + res.v; } + return res; + } +}; + +template +class Scalar +{ +public: + int v; + __device__ __host__ Scalar() { v = 0; } + __device__ __host__ Scalar(int r) { v = r % F::q; } + __device__ __host__ Scalar operator+(Scalar const& obj) + { + Scalar res; + res.v = (v + obj.v) % F::q; + return res; + } + __device__ __host__ Scalar operator*(Scalar const& obj) + { + Scalar res; + res.v = (v * obj.v) % F::q; + return res; + } + __device__ __host__ Element operator*(Element const& obj) + { + Element res; + res.v = (v * obj.v) % F::q; + return res; + } + Scalar operator-(Scalar const& obj) + { + Scalar res; + res.v = (v - obj.v) % F::q; + if (res.v < 0) { res.v = F::q + res.v; } + return res; + } + bool operator<(Scalar const& obj) { return v < obj.v; } + static Scalar one() { return Scalar(1); } + static Scalar zero() { return Scalar(0); } +}; \ No newline at end of file diff --git a/src/aztec/gpu/utils/ptx.cuh b/src/aztec/gpu/utils/ptx.cuh new file mode 100644 index 00000000..7625bd92 --- /dev/null +++ b/src/aztec/gpu/utils/ptx.cuh @@ -0,0 +1,282 @@ +#pragma once +#include +#include + +namespace ptx { + + __device__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm("add.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t add_cc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t addc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("addc.u32 %0, %1, %2;" : "=r"(result) : 
"r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t addc_cc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm("sub.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t sub_cc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("sub.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t subc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("subc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t subc_cc(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm volatile("subc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t mul_lo(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm("mul.lo.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t mul_hi(const uint32_t x, const uint32_t y) + { + uint32_t result; + asm("mul.hi.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y)); + return result; + } + + __device__ __forceinline__ uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm("mad.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm("mad.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("madc.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z) + { + uint32_t result; + asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); + return result; + } + + __device__ __forceinline__ uint64_t mov_b64(uint32_t lo, uint32_t hi) + { + uint64_t result; + asm("mov.b64 %0, {%1,%2};" : "=l"(result) : "r"(lo), "r"(hi)); + return result; + } + + // Gives u64 overloads a dedicated namespace. 
+ // Callers should know exactly what they're calling (no implicit conversions). + namespace u64 { + + __device__ __forceinline__ uint64_t add(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm("add.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t add_cc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t addc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("addc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t addc_cc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t sub(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm("sub.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t sub_cc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t subc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("subc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t subc_cc(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t mul_lo(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm("mul.lo.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t mul_hi(const uint64_t x, const uint64_t y) + { + uint64_t result; + asm("mul.hi.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y)); + return result; + } + + __device__ __forceinline__ uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm("mad.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm("mad.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("mad.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("mad.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("madc.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("madc.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z) + { + 
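// Computes (x * y).lo + z + carry-in; the carry-out remains in the condition code. +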
uint64_t result; + asm volatile("madc.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + __device__ __forceinline__ uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z) + { + uint64_t result; + asm volatile("madc.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z)); + return result; + } + + } // namespace u64 + + __device__ __forceinline__ void bar_arrive(const unsigned name, const unsigned count) + { + asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(count) : "memory"); + } + + __device__ __forceinline__ void bar_sync(const unsigned name, const unsigned count) + { + asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(count) : "memory"); + } + +} // namespace ptx \ No newline at end of file diff --git a/src/aztec/gpu/utils/sharedmem.cuh b/src/aztec/gpu/utils/sharedmem.cuh new file mode 100644 index 00000000..9c2639b2 --- /dev/null +++ b/src/aztec/gpu/utils/sharedmem.cuh @@ -0,0 +1,278 @@ +// based on https://leimao.github.io/blog/CUDA-Shared-Memory-Templated-Kernel/ +// may be outdated, but only worked like that + +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 5636 $ +// $Date: 2009-07-02 13:39:38 +1000 (Thu, 02 Jul 2009) $ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * sharedmem.h + * + * @brief Shared memory declaration struct for templatized types. + * + * Because dynamically sized shared memory arrays are declared "extern" in CUDA, + * we can't templatize their types directly. To get around this, we declare a + * simple wrapper struct that will declare the extern array with a different + * name depending on the type. This avoids linker errors about multiple + * definitions. + * + * To use dynamically allocated shared memory in a templatized __global__ or + * __device__ function, just replace code like this: + * + *
+ * <pre>
+ *  template<class T>
+ *  __global__ void
+ *  foo( T* d_out, T* d_in)
+ *  {
+ *      // Shared mem size is determined by the host app at run time
+ *      extern __shared__  T sdata[];
+ *      ...
+ *      doStuff(sdata);
+ *      ...
+ *  }
+ * </pre>
+ *
+ * With this
+ * <pre>
+ *  template<class T>
+ *  __global__ void
+ *  foo( T* d_out, T* d_in)
+ *  {
+ *      // Shared mem size is determined by the host app at run time
+ *      SharedMemory<T> smem;
+ *      T* sdata = smem.getPointer();
+ *      ...
+ *      doStuff(sdata);
+ *      ...
+ *  }
+ * </pre>
+ */ + +#ifndef _SHAREDMEM_H_ +#define _SHAREDMEM_H_ + +#include "../headers/curve_config.cuh" +// #include "../curves/bls12_381/curve_config.cuh" +// #include "../curves/bn254/curve_config.cuh" + +/** @brief Wrapper class for templatized dynamic shared memory arrays. + * + * This struct uses template specialization on the type \a T to declare + * a differently named dynamic shared memory array for each type + * (\code extern __shared__ T s_type[] \endcode). + * + * Currently there are specializations for the following types: + * \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long, + * \c unsigned long, \c bool, \c float, and \c double. One can also specialize it + * for user defined types. + */ +template +struct SharedMemory { + //! @brief Return a pointer to the runtime-sized shared memory array. + //! @returns Pointer to runtime-sized shared memory array + __device__ T* getPointer() + { + extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types + Error_UnsupportedType(); + return (T*)0; + } + // TODO: Use operator overloading to make this class look like a regular array +}; + +// Following are the specializations for the following types. +// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double +// One could also specialize it for user-defined types. + +template <> +struct SharedMemory { + __device__ int* getPointer() + { + extern __shared__ int s_int[]; + return s_int; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned int* getPointer() + { + extern __shared__ unsigned int s_uint[]; + return s_uint; + } +}; + +template <> +struct SharedMemory { + __device__ char* getPointer() + { + extern __shared__ char s_char[]; + return s_char; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned char* getPointer() + { + extern __shared__ unsigned char s_uchar[]; + return s_uchar; + } +}; + +template <> +struct SharedMemory { + __device__ short* getPointer() + { + extern __shared__ short s_short[]; + return s_short; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned short* getPointer() + { + extern __shared__ unsigned short s_ushort[]; + return s_ushort; + } +}; + +template <> +struct SharedMemory { + __device__ long* getPointer() + { + extern __shared__ long s_long[]; + return s_long; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned long* getPointer() + { + extern __shared__ unsigned long s_ulong[]; + return s_ulong; + } +}; + +template <> +struct SharedMemory { + __device__ long long* getPointer() + { + extern __shared__ long long s_longlong[]; + return s_longlong; + } +}; + +template <> +struct SharedMemory { + __device__ unsigned long long* getPointer() + { + extern __shared__ unsigned long long s_ulonglong[]; + return s_ulonglong; + } +}; + +template <> +struct SharedMemory { + __device__ bool* getPointer() + { + extern __shared__ bool s_bool[]; + return s_bool; + } +}; + +template <> +struct SharedMemory { + __device__ float* getPointer() + { + extern __shared__ float s_float[]; + return s_float; + } +}; + +template <> +struct SharedMemory { + __device__ double* getPointer() + { + extern __shared__ double s_double[]; + return s_double; + } +}; + +template <> +struct SharedMemory { + __device__ uchar4* getPointer() + { + extern __shared__ uchar4 s_uchar4[]; + return s_uchar4; + } +}; + +// template <> +// struct SharedMemory { +// __device__ BLS12_381::scalar_t* getPointer() +// { +// extern __shared__ BLS12_381::scalar_t s_scalar_t_bls12_381[]; 
+// return s_scalar_t_bls12_381;
+// }
+// };
+
+// template <>
+// struct SharedMemory<BLS12_381::projective_t> {
+// __device__ BLS12_381::projective_t* getPointer()
+// {
+// extern __shared__ BLS12_381::projective_t s_projective_t_bls12_381[];
+// return s_projective_t_bls12_381;
+// }
+// };
+
+// template <>
+// struct SharedMemory<BLS12_377::scalar_t> {
+// __device__ BLS12_377::scalar_t* getPointer()
+// {
+// extern __shared__ BLS12_377::scalar_t s_scalar_t_bls12_377[];
+// return s_scalar_t_bls12_377;
+// }
+// };
+
+// template <>
+// struct SharedMemory<BLS12_377::projective_t> {
+// __device__ BLS12_377::projective_t* getPointer()
+// {
+// extern __shared__ BLS12_377::projective_t s_projective_t_bls12_377[];
+// return s_projective_t_bls12_377;
+// }
+// };
+
+template <>
+struct SharedMemory<BN254::scalar_t> {
+  __device__ BN254::scalar_t* getPointer()
+  {
+    extern __shared__ BN254::scalar_t s_scalar_t_bn254[];
+    return s_scalar_t_bn254;
+  }
+};
+
+template <>
+struct SharedMemory<BN254::projective_t> {
+  __device__ BN254::projective_t* getPointer()
+  {
+    extern __shared__ BN254::projective_t s_projective_t_bn254[];
+    return s_projective_t_bn254;
+  }
+};
+#endif //_SHAREDMEM_H_
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
\ No newline at end of file
diff --git a/src/aztec/gpu/utils/storage.cuh b/src/aztec/gpu/utils/storage.cuh
new file mode 100644
index 00000000..c4a56c49
--- /dev/null
+++ b/src/aztec/gpu/utils/storage.cuh
@@ -0,0 +1,17 @@
+#pragma once
+#include <cstdint>
+
+#define LIMBS_ALIGNMENT(x) ((x) % 4 == 0 ? 16 : ((x) % 2 == 0 ? 8 : 4))
+
+template <unsigned LIMBS_COUNT>
+struct __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) storage
+{
+  static constexpr unsigned LC = LIMBS_COUNT;
+  uint32_t limbs[LIMBS_COUNT];
+};
+
+template <unsigned LIMBS_COUNT, unsigned OMEGAS_COUNT>
+struct __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) storage_array
+{
+  storage<LIMBS_COUNT> storages[OMEGAS_COUNT];
+};
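
For reference, the storage structs hold field elements as little-endian arrays of 32-bit limbs: fq_config::one is {0x00000001, 0, ...}, and the most significant word of the BN254 modulus (0x30644e72) sits in the last limb. Below is a minimal host-only sketch of that layout, with storage re-declared without __align__ so it compiles off-device, and a hypothetical print_be_hex helper that is not part of the diff.

#include <cstdint>
#include <cstdio>

template <unsigned LIMBS_COUNT>
struct storage {
    uint32_t limbs[LIMBS_COUNT]; // limbs[0] is the least significant 32-bit word
};

// Hypothetical helper: print a 256-bit storage<8> as big-endian hex.
static void print_be_hex(const storage<8>& s)
{
    for (int i = 7; i >= 0; i--)
        printf("%08x", s.limbs[i]);
    printf("\n");
}

int main()
{
    // BN254 base-field modulus, copied from fq_config, least significant limb first.
    storage<8> p = {{0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
                     0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}};
    print_be_hex(p); // 30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47
    return 0;
}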