diff --git a/fast_pauli/cpp/examples/03_summed_pauli_op.cpp b/fast_pauli/cpp/examples/03_summed_pauli_op.cpp
index 2885e65..32a0bb6 100644
--- a/fast_pauli/cpp/examples/03_summed_pauli_op.cpp
+++ b/fast_pauli/cpp/examples/03_summed_pauli_op.cpp
@@ -28,8 +28,7 @@ int main()
     // User settings
     //
     size_t const n_operators = 1000;
-    //   size_t const n_paulis_per_operator = 631;
-    size_t const n_qubits = 14;
+    size_t const n_qubits = 12;
     size_t const weight = 2;
     size_t const n_states = 1000;
     using fp_type = double;
diff --git a/fast_pauli/cpp/include/__summed_pauli_op.hpp b/fast_pauli/cpp/include/__summed_pauli_op.hpp
index 011ef10..bb3230c 100644
--- a/fast_pauli/cpp/include/__summed_pauli_op.hpp
+++ b/fast_pauli/cpp/include/__summed_pauli_op.hpp
@@ -336,8 +336,9 @@ template <std::floating_point T> struct SummedPauliOp
 
 #pragma omp parallel
             {
+
                 // Contract the coeffs with the data since we can reuse this below
-#pragma omp for schedule(static) collapse(2)
+#pragma omp for schedule(dynamic) collapse(2)
                 for (size_t j = 0; j < n_ps; ++j)
                 {
                     for (size_t t = 0; t < n_data; ++t)
@@ -345,6 +346,7 @@ template <std::floating_point T> struct SummedPauliOp
                         for (size_t k = 0; k < n_ops; ++k)
                         {
                             // TODO we should transpose the data here for better memory access
+                            // initial tests didn't show much improvement, but something to try later
                             weighted_coeffs(j, t) += coeffs(j, k) * data(k, t);
                         }
                     }
@@ -354,13 +356,10 @@ template <std::floating_point T> struct SummedPauliOp
                 std::vector<std::complex<T>> new_states_j_raw(n_data * n_dim);
                 Tensor<2> new_states_j(new_states_j_raw.data(), n_dim, n_data);
 
-                // std::vector<std::complex<T>> states_j_T_raw(n_data * n_dim);
-                // Tensor<2> states_j_T(states_j_T_raw.data(), n_data, n_dim);
-
                 std::mdspan new_states_th_local =
                     std::submdspan(new_states_th, omp_get_thread_num(), std::full_extent, std::full_extent);
 
-#pragma omp for schedule(static)
+#pragma omp for schedule(dynamic)
                 for (size_t j = 0; j < n_ps; ++j)
                 {
                     // new psi_prime
@@ -375,9 +374,8 @@ template <std::floating_point T> struct SummedPauliOp
                         }
                     }
                 }
-
-// Reduce
-#pragma omp for schedule(static) collapse(2)
+                // Reduce
+#pragma omp for schedule(dynamic) collapse(2)
                 for (size_t l = 0; l < n_dim; ++l)
                 {
                     for (size_t t = 0; t < n_data; ++t)
@@ -490,22 +488,37 @@ template <std::floating_point T> struct SummedPauliOp
         if constexpr (is_parallel_execution_policy_v<ExecutionPolicy>)
         {
 
-#pragma omp parallel for schedule(static)
-            for (size_t j = 0; j < n_pauli_strings(); ++j)
+#pragma omp parallel
             {
-                std::mdspan expectation_vals_j = std::submdspan(expectation_vals, j, std::full_extent);
-                pauli_strings[j].expectation_value(expectation_vals_j, states);
-            }
+#pragma omp for schedule(static)
+                for (size_t j = 0; j < n_pauli_strings(); ++j)
+                {
+                    std::mdspan expectation_vals_j = std::submdspan(expectation_vals, j, std::full_extent);
+                    pauli_strings[j].expectation_value(expectation_vals_j, states);
+                }
 
-            // einsum("jk,jt->kt", coeffs, expectation_vals)
-#pragma omp parallel for collapse(2)
-            for (size_t t = 0; t < n_data; ++t)
-            {
+                // einsum("jk,jt->kt", coeffs, expectation_vals)
+                std::vector<std::complex<T>> coeffs_T_raw(n_ops * n_pauli_strings());
+                std::mdspan coeffs_T(coeffs_T_raw.data(), n_ops, n_pauli_strings());
+
+#pragma omp for schedule(static) collapse(2)
                 for (size_t k = 0; k < n_ops; ++k)
                 {
                     for (size_t j = 0; j < n_pauli_strings(); ++j)
                     {
-                        expectation_vals_out(k, t) += coeffs(j, k) * expectation_vals(j, t);
+                        coeffs_T(k, j) = coeffs(j, k);
+                    }
+                }
+
+#pragma omp for collapse(2) schedule(static)
+                for (size_t k = 0; k < n_ops; ++k)
+                {
+                    for (size_t t = 0; t < n_data; ++t)
+                    {
+                        for (size_t j = 0; j < n_pauli_strings(); ++j)
+                        {
+                            expectation_vals_out(k, t) += coeffs(j, k) * expectation_vals(j, t);
+                        }
                     }
                 }
             }
diff --git a/fast_pauli/cpp/src/fast_pauli.cpp b/fast_pauli/cpp/src/fast_pauli.cpp
index dd27bbf..7b02abc 100644
--- a/fast_pauli/cpp/src/fast_pauli.cpp
+++ b/fast_pauli/cpp/src/fast_pauli.cpp
@@ -1068,7 +1068,7 @@ np.ndarray
                 {
                     auto states_mdspan = fp::__detail::ndarray_to_mdspan<cfloat_t, 1>(states);
                     auto states_mdspan_2d = std::mdspan(states_mdspan.data_handle(), states_mdspan.size(), 1);
-                    std::array<size_t, 1> out_shape = {1};
+                    std::array<size_t, 1> out_shape = {self.n_operators()};
                     auto expected_vals_out = fp::__detail::owning_ndarray_from_shape<cfloat_t, 1>(out_shape);
                     auto expected_vals_out_mdspan = std::mdspan(expected_vals_out.data(), expected_vals_out.size(), 1);
                     self.expectation_value(std::execution::par, expected_vals_out_mdspan, states_mdspan_2d);
diff --git a/fast_pauli/cpp/tests/test_summed_pauli_op.cpp b/fast_pauli/cpp/tests/test_summed_pauli_op.cpp
index a8c9670..df4a82f 100644
--- a/fast_pauli/cpp/tests/test_summed_pauli_op.cpp
+++ b/fast_pauli/cpp/tests/test_summed_pauli_op.cpp
@@ -287,10 +287,10 @@ TEST_CASE("apply weighted many operators many PauliString")
 {
     fmt::print("\n\napply many operators many PauliString\n");
     // Setup operator
-    std::vector<PauliString> pauli_strings = fast_pauli::calculate_pauli_strings_max_weight(8, 2);
+    std::vector<PauliString> pauli_strings = fast_pauli::calculate_pauli_strings_max_weight(6, 2);
 
     std::vector<std::complex<double>> coeff_raw;
-    std::mdspan coeff = fast_pauli::rand<std::complex<double>, 2>(coeff_raw, {pauli_strings.size(), 100});
+    std::mdspan coeff = fast_pauli::rand<std::complex<double>, 2>(coeff_raw, {pauli_strings.size(), 1000});
 
     auto start_seq = std::chrono::high_resolution_clock::now();
     __check_apply_weighted(std::execution::seq, pauli_strings, coeff, 100);
diff --git a/tests/fast_pauli/test_summed_pauli_op.py b/tests/fast_pauli/test_summed_pauli_op.py
index 8faed20..a1a0981 100644
--- a/tests/fast_pauli/test_summed_pauli_op.py
+++ b/tests/fast_pauli/test_summed_pauli_op.py
@@ -150,28 +150,29 @@ def test_expectation_values(
     n_strings = len(pauli_strings)
 
     coeffs_2d = np.random.rand(n_strings, n_operators).astype(np.complex128)
-    psi = np.random.rand(2**n_qubits, n_states).astype(np.complex128)
-
-    # For a single state, test that a 1D array works
-    if n_states == 1:
-        psi = psi[:, 0]
-
-    op = summed_pauli_op(pauli_strings, coeffs_2d)
-
-    # The expectation values we want to test
-    expectation_vals = op.expectation_value(psi)
-
-    # The "trusted" expectation_values
-    expectation_vals_naive = np.zeros((n_operators, n_states), dtype=np.complex128)
 
-    # For a single state, test that a 1D array works
     if n_states == 1:
-        expectation_vals_naive = expectation_vals_naive[:, 0]
-
-    # Calculate using brute force
-    for k in range(n_operators):
-        A_k = pp.helpers.naive_pauli_operator(coeffs_2d[:, k], pauli_strings_str)
-        expectation_vals_naive[k] = np.einsum("it,ij,jt->t", psi.conj(), A_k, psi)
+        psi = np.random.rand(2**n_qubits).astype(np.complex128)
+        op = summed_pauli_op(pauli_strings, coeffs_2d)
+        # The expectation values we want to test
+        expectation_vals = op.expectation_value(psi)
+        # The "trusted" expectation_values
+        expectation_vals_naive = np.zeros((n_operators), dtype=np.complex128)
+        for k in range(n_operators):
+            A_k = pp.helpers.naive_pauli_operator(coeffs_2d[:, k], pauli_strings_str)
+            expectation_vals_naive[k] = np.einsum("i,ij,j->", psi.conj(), A_k, psi)
+
+    else:
+        psi = np.random.rand(2**n_qubits, n_states).astype(np.complex128)
+        op = summed_pauli_op(pauli_strings, coeffs_2d)
+        # The expectation values we want to test
+        expectation_vals = op.expectation_value(psi)
+        # The "trusted" expectation_values
+        expectation_vals_naive = np.zeros((n_operators, n_states), dtype=np.complex128)
+
+        for k in range(n_operators):
+            A_k = pp.helpers.naive_pauli_operator(coeffs_2d[:, k], pauli_strings_str)
+            expectation_vals_naive[k] = np.einsum("it,ij,jt->t", psi.conj(), A_k, psi)
 
     # Check
     np.testing.assert_allclose(expectation_vals, expectation_vals_naive, atol=1e-13)