From 85f10024e56dedae2ea35ae4fb0e380b1e4a014a Mon Sep 17 00:00:00 2001
From: jamesETsmith
Date: Fri, 1 Nov 2024 15:57:27 -0400
Subject: [PATCH] [feature/SPO_apply] First stab at C++ impl of optimized
 squaring of SummedPauliOp

---
 fast_pauli/cpp/include/__summed_pauli_op.hpp  | 78 +++++++++++++++++++
 fast_pauli/cpp/src/fast_pauli.cpp             |  8 ++
 fast_pauli/cpp/tests/test_summed_pauli_op.cpp | 61 +++++++++++++++
 3 files changed, 147 insertions(+)

diff --git a/fast_pauli/cpp/include/__summed_pauli_op.hpp b/fast_pauli/cpp/include/__summed_pauli_op.hpp
index bb3230c..49a32ea 100644
--- a/fast_pauli/cpp/include/__summed_pauli_op.hpp
+++ b/fast_pauli/cpp/include/__summed_pauli_op.hpp
@@ -20,6 +20,7 @@
 #include
 #include
+#include <unordered_map>
 
 #include "__pauli_string.hpp"
 #include "__type_traits.hpp"
 
@@ -45,6 +46,7 @@ template <std::floating_point T> struct SummedPauliOp
     // TODO dangerous
     size_t _dim;
     size_t _n_operators;
+    size_t _n_qubits;
 
    void __check_ctor_inputs(std::vector<fast_pauli::PauliString> const &pauli_strings, Tensor<2> const coeffs)
    {
@@ -88,6 +90,7 @@
         // TODO add more checks
         size_t const n_pauli_strings = pauli_strings.size();
         _dim = pauli_strings[0].dim();
+        _n_qubits = pauli_strings[0].n_qubits();
         _n_operators = coeffs_raw.size() / n_pauli_strings;
 
         coeffs = Tensor<2>(this->coeffs_raw.data(), n_pauli_strings, _n_operators);
@@ -111,6 +114,7 @@
 
         //
         _dim = pauli_strings[0].dim();
+        _n_qubits = pauli_strings[0].n_qubits();
         _n_operators = coeffs.extent(1);
 
         // Copy over the coeffs so our std::mdspan points at the memory owned by
@@ -145,6 +149,7 @@
 
         //
         _dim = this->pauli_strings.front().dim();
+        _n_qubits = this->pauli_strings.front().n_qubits();
         _n_operators = coeffs.extent(1);
 
         // Copy over the coeffs so our std::mdspan points at the memory owned by
@@ -168,6 +173,16 @@
         return _dim;
     }
 
+    /**
+     * @brief Return the number of qubits in the SummedPauliOp
+     *
+     * @return size_t
+     */
+    size_t n_qubits() const noexcept
+    {
+        return _n_qubits;
+    }
+
     /**
      * @brief Return the number of Pauli operators in the SummedPauliOp
     *
@@ -188,6 +203,69 @@
         return pauli_strings.size();
     }
 
+    fast_pauli::SummedPauliOp<T> square() const
+    {
+        // Get the squared pauli strings
+        size_t weight = std::reduce(
+            pauli_strings.begin(), pauli_strings.end(), size_t(0),
+            [](size_t a, fast_pauli::PauliString const &ps) { return std::max(a, static_cast<size_t>(ps.weight)); });
+        fmt::println("weight: {}", weight);
+        std::vector<fast_pauli::PauliString> pauli_strings_sq =
+            fast_pauli::calculate_pauli_strings_max_weight(_n_qubits, std::min(_n_qubits, 2UL * weight));
+
+        std::unordered_map<fast_pauli::PauliString, size_t> sq_idx_map;
+        for (size_t i = 0; i < pauli_strings_sq.size(); ++i)
+        {
+            sq_idx_map[pauli_strings_sq[i]] = i;
+        }
+
+        // Create the T_aij tensor that maps the coefficients from h_i * h_j to h'_a
+        // these are the same for all k operators
+        std::vector<std::complex<T>> t_aij_raw(pauli_strings_sq.size() * pauli_strings.size() * pauli_strings.size());
+        Tensor<3> t_aij(t_aij_raw.data(), pauli_strings_sq.size(), pauli_strings.size(), pauli_strings.size());
+
+        // #pragma omp parallel for collapse(2)
+        for (size_t i = 0; i < pauli_strings.size(); ++i)
+        {
+            for (size_t j = 0; j < pauli_strings.size(); ++j)
+            {
+                std::complex<double> phase;
+                fast_pauli::PauliString prod;
+                std::tie(phase, prod) = pauli_strings[i] * pauli_strings[j];
+                size_t a = sq_idx_map[prod];
+                t_aij(a, i, j) = phase;
+            }
+        }
+
+        // Contract the T_aij tensor with the coeffs to get the new coeffs
+        std::vector<std::complex<T>> coeffs_sq_raw(pauli_strings_sq.size() * _n_operators);
+        Tensor<2> coeffs_sq(coeffs_sq_raw.data(), pauli_strings_sq.size(), _n_operators);
+
+        // We want to do the following einsum:
+        // coeffs_sq(a, k) = sum_{i,j} t_aij(a, i, j) * coeffs(i, k) * coeffs(j, k)
+        // If we break it up into two steps, the intermediates will be too big, so we're
+        // just going to do it all at once
+
+        // #pragma omp parallel for collapse(2)
+        fmt::println("{} {} {} {}", pauli_strings_sq.size(),
+                     _n_operators, pauli_strings.size(), pauli_strings.size());
+        for (size_t a = 0; a < pauli_strings_sq.size(); ++a)
+        {
+            for (size_t k = 0; k < _n_operators; ++k)
+            {
+                for (size_t i = 0; i < pauli_strings.size(); ++i)
+                {
+                    for (size_t j = 0; j < pauli_strings.size(); ++j)
+                    {
+                        // fmt::println("{} {} {} {}", a, k, i, j);
+                        coeffs_sq(a, k) += t_aij(a, i, j) * coeffs(i, k) * coeffs(j, k);
+                    }
+                }
+            }
+        }
+
+        return SummedPauliOp<T>(pauli_strings_sq, coeffs_sq);
+    }
+
     /**
      * @brief Apply the SummedPauliOp to a set of states, mathematically
      * \f$ \big(\sum_k \sum_i h_{ik} \mathcal{\hat{P}}_i \big) \ket{\psi_t} \f$
diff --git a/fast_pauli/cpp/src/fast_pauli.cpp b/fast_pauli/cpp/src/fast_pauli.cpp
index 7b02abc..f5a82d0 100644
--- a/fast_pauli/cpp/src/fast_pauli.cpp
+++ b/fast_pauli/cpp/src/fast_pauli.cpp
@@ -1119,6 +1119,14 @@ Returns
 -------
 np.ndarray
     3D numpy array of complex numbers with a shape of (n_operators, 2^n_qubits, 2^n_qubits)
+)%")
+        //
+        .def("square", &fp::SummedPauliOp<float_type>::square, R"%(Square the SummedPauliOp.
+
+Returns
+-------
+SummedPauliOp
+    New SummedPauliOp instance
 )%");
     //
     ;
diff --git a/fast_pauli/cpp/tests/test_summed_pauli_op.cpp b/fast_pauli/cpp/tests/test_summed_pauli_op.cpp
index d34f089..c354334 100644
--- a/fast_pauli/cpp/tests/test_summed_pauli_op.cpp
+++ b/fast_pauli/cpp/tests/test_summed_pauli_op.cpp
@@ -407,3 +407,64 @@ TEST_CASE("expectation values multiple operators and states")
     elapsed = end - start;
     fmt::println("Time taken for parallel execution: {} seconds", elapsed.count());
 }
+
+void __check_square(size_t const n_qubits, size_t const n_operators)
+{
+    // Check that the square function works by using op.to_tensor() to create a dense tensor and then squaring the
+    // operators manually
+    size_t const dim = 1UL << n_qubits;
+    std::vector<fast_pauli::PauliString> pauli_strings = fast_pauli::calculate_pauli_strings_max_weight(n_qubits, 2);
+    std::vector<std::complex<double>> coeff_raw;
+    std::mdspan coeff = fast_pauli::rand<std::complex<double>, 2>(coeff_raw, {pauli_strings.size(), n_operators});
+    SummedPauliOp<double> op(pauli_strings, coeff);
+
+    fmt::println("Squaring operator");
+    SummedPauliOp<double> op2 = op.square();
+
+    fmt::println("Converting to dense tensor");
+    std::vector<std::complex<double>> op1_dense_raw(n_operators * dim * dim);
+    std::mdspan op1_dense(op1_dense_raw.data(), n_operators, dim, dim);
+    op.to_tensor(op1_dense);
+
+    std::vector<std::complex<double>> op2_dense_raw(n_operators * dim * dim);
+    std::mdspan op2_dense(op2_dense_raw.data(), n_operators, dim, dim);
+    op2.to_tensor(op2_dense);
+
+    std::vector<std::complex<double>> op1_sq_raw(n_operators * dim * dim);
+    std::mdspan op1_sq(op1_sq_raw.data(), n_operators, dim, dim);
+
+    fmt::println("Squaring dense tensor");
+    // Doing the einsum
+    // op1_sq(k,a,c) = sum_b op1_dense(k,a,b) * op1_dense(k,b,c)
+#pragma omp parallel for collapse(3)
+    for (size_t k = 0; k < n_operators; ++k)
+    {
+        for (size_t i = 0; i < dim; ++i)
+        {
+            for (size_t j = 0; j < dim; ++j)
+            {
+                for (size_t b = 0; b < dim; ++b)
+                {
+                    op1_sq(k, i, j) += op1_dense(k, i, b) * op1_dense(k, b, j);
+                }
+            }
+        }
+    }
+
+    fmt::println("Checking results");
+    // Check the results
+    for (size_t k = 0; k < n_operators; ++k)
+    {
+        for (size_t i = 0; i < dim; ++i)
+        {
+            for (size_t j = 0; j < dim; ++j)
+            {
+                CHECK(abs(op1_sq(k, i, j) - op2_dense(k, i, j)) < 1e-6);
+            }
+        }
+    }
+}
+TEST_CASE("square")
+{
+    __check_square(4, 10);
+}