From 99500ca88cdf891e57ce046b1c08b51a6372a1e3 Mon Sep 17 00:00:00 2001 From: Giovanni Pederiva Date: Fri, 11 Jun 2021 13:07:07 +0200 Subject: [PATCH 1/3] Modified QDPCloverTerm to support SWF --- .../clover_term_qdp_stabilized_helpers.h | 224 ++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 lib/actions/ferm/linop/clover_term_qdp_stabilized_helpers.h diff --git a/lib/actions/ferm/linop/clover_term_qdp_stabilized_helpers.h b/lib/actions/ferm/linop/clover_term_qdp_stabilized_helpers.h new file mode 100644 index 000000000..256cc18f2 --- /dev/null +++ b/lib/actions/ferm/linop/clover_term_qdp_stabilized_helpers.h @@ -0,0 +1,224 @@ +//! Exponential of triangular form matrix for spinors + +#ifndef __clover_term_qdp_stabilized_helpers_h__ +#define __clover_term_qdp_stabilized_helpers_h__ + +#include "state.h" +#include "qdp_allocator.h" + +#include +#include + +namespace Chroma{ + + template struct PrimitiveClovTriang; + + template + void tri6_mult(PrimitiveClovTriang& A, const PrimitiveClovTriang& B, const PrimitiveClovTriang& C, int i){ + // Diagonal Terms: + A.diag[i][0] = B.diag[i][0].elem() * C.diag[i][0].elem() + + B.offd[i][0].real() * C.offd[i][0].real() + B.offd[i][0].imag() * C.offd[i][0].imag() + + B.offd[i][1].real() * C.offd[i][1].real() + B.offd[i][1].imag() * C.offd[i][1].imag() + + B.offd[i][3].real() * C.offd[i][3].real() + B.offd[i][3].imag() * C.offd[i][3].imag() + + B.offd[i][6].real() * C.offd[i][6].real() + B.offd[i][6].imag() * C.offd[i][6].imag() + + B.offd[i][10].real() * C.offd[i][10].real() + B.offd[i][10].imag() * C.offd[i][10].imag(); + + A.diag[i][1] = B.diag[i][1].elem() * C.diag[i][1].elem() + + B.offd[i][0].real() * C.offd[i][0].real() + B.offd[i][0].imag() * C.offd[i][0].imag() + + B.offd[i][2].real() * C.offd[i][2].real() + B.offd[i][2].imag() * C.offd[i][2].imag() + + B.offd[i][4].real() * C.offd[i][4].real() + B.offd[i][4].imag() * C.offd[i][4].imag() + + B.offd[i][7].real() * C.offd[i][7].real() + B.offd[i][7].imag() * C.offd[i][7].imag() + + B.offd[i][11].real() * C.offd[i][11].real() + B.offd[i][11].imag() * C.offd[i][11].imag(); + + A.diag[i][2] = B.diag[i][2].elem() * C.diag[i][2].elem() + + B.offd[i][1].real() * C.offd[i][1].real() + B.offd[i][1].imag() * C.offd[i][1].imag() + + B.offd[i][2].real() * C.offd[i][2].real() + B.offd[i][2].imag() * C.offd[i][2].imag() + + B.offd[i][5].real() * C.offd[i][5].real() + B.offd[i][5].imag() * C.offd[i][5].imag() + + B.offd[i][8].real() * C.offd[i][8].real() + B.offd[i][8].imag() * C.offd[i][8].imag() + + B.offd[i][12].real() * C.offd[i][12].real() + B.offd[i][12].imag() * C.offd[i][12].imag(); + + A.diag[i][3] = B.diag[i][3].elem() * C.diag[i][3].elem() + + B.offd[i][3].real() * C.offd[i][3].real() + B.offd[i][3].imag() * C.offd[i][3].imag() + + B.offd[i][4].real() * C.offd[i][4].real() + B.offd[i][4].imag() * C.offd[i][4].imag() + + B.offd[i][5].real() * C.offd[i][5].real() + B.offd[i][5].imag() * C.offd[i][5].imag() + + B.offd[i][9].real() * C.offd[i][9].real() + B.offd[i][9].imag() * C.offd[i][9].imag() + + B.offd[i][13].real() * C.offd[i][13].real() + B.offd[i][13].imag() * C.offd[i][13].imag(); + + A.diag[i][4] = B.diag[i][4].elem() * C.diag[i][4].elem() + + B.offd[i][6].real() * C.offd[i][6].real() + B.offd[i][6].imag() * C.offd[i][6].imag() + + B.offd[i][7].real() * C.offd[i][7].real() + B.offd[i][7].imag() * C.offd[i][7].imag() + + B.offd[i][8].real() * C.offd[i][8].real() + B.offd[i][8].imag() * C.offd[i][8].imag() + + B.offd[i][9].real() * C.offd[i][9].real() + B.offd[i][9].imag() * C.offd[i][9].imag() + + B.offd[i][14].real() * C.offd[i][14].real() + B.offd[i][14].imag() * C.offd[i][14].imag(); + + A.diag[i][5] = B.diag[i][5].elem() * C.diag[i][5].elem() + + B.offd[i][10].real() * C.offd[i][10].real() + B.offd[i][10].imag() * C.offd[i][10].imag() + + B.offd[i][11].real() * C.offd[i][11].real() + B.offd[i][11].imag() * C.offd[i][11].imag() + + B.offd[i][12].real() * C.offd[i][12].real() + B.offd[i][12].imag() * C.offd[i][12].imag() + + B.offd[i][13].real() * C.offd[i][13].real() + B.offd[i][13].imag() * C.offd[i][13].imag() + + B.offd[i][14].real() * C.offd[i][14].real() + B.offd[i][14].imag() * C.offd[i][14].imag(); + + // Off-diagonal Terms: + A.offd[i][0] = B.diag[i][0] * C.offd[i][0] + B.offd[i][0] * C.diag[i][1] + + B.offd[i][1] * conj(C.offd[i][2]) + B.offd[i][3] * conj(C.offd[i][4]) + + B.offd[i][6] * conj(C.offd[i][7]) + B.offd[i][10] * conj(C.offd[i][11]); + + A.offd[i][1] = B.diag[i][0] * C.offd[i][1] + B.offd[i][0] * C.offd[i][2] + + B.offd[i][1] * C.diag[i][2] + B.offd[i][3] * conj(C.offd[i][5]) + + B.offd[i][6] * conj(C.offd[i][8]) + B.offd[i][10] * conj(C.offd[i][12]); + + A.offd[i][2] = conj(B.offd[i][0]) * C.offd[i][1] + B.diag[i][1] * C.offd[i][2] + + B.offd[i][2] * C.diag[i][2] + B.offd[i][4] * conj(C.offd[i][5]) + + B.offd[i][7] * conj(C.offd[i][8]) + B.offd[i][11] * conj(C.offd[i][12]); + + A.offd[i][3] = B.diag[i][0] * C.offd[i][3] + B.offd[i][0] * C.offd[i][4] + + B.offd[i][1] * C.offd[i][5] + B.offd[i][3] * C.diag[i][3] + + B.offd[i][6] * conj(C.offd[i][9]) + B.offd[i][10] * conj(C.offd[i][13]); + + A.offd[i][4] = conj(B.offd[i][0]) * C.offd[i][3] + B.diag[i][1] * C.offd[i][4] + + B.offd[i][2] * C.offd[i][5] + B.offd[i][4] * C.diag[i][3] + + B.offd[i][7] * conj(C.offd[i][9]) + B.offd[i][11] * conj(C.offd[i][13]); + + A.offd[i][5] = conj(B.offd[i][1]) * C.offd[i][3] + conj(B.offd[i][2]) * C.offd[i][4] + + B.diag[i][2] * C.offd[i][5] + B.offd[i][5] * C.diag[i][3] + + B.offd[i][8] * conj(C.offd[i][9]) + B.offd[i][12] * conj(C.offd[i][13]); + + A.offd[i][6] = B.diag[i][0] * C.offd[i][6] + B.offd[i][0] * C.offd[i][7] + + B.offd[i][1] * C.offd[i][8] + B.offd[i][3] * C.offd[i][9] + + B.offd[i][6] * C.diag[i][4] + B.offd[i][10] * conj(C.offd[i][14]); + + A.offd[i][7] = conj(B.offd[i][0]) * C.offd[i][6] + B.diag[i][1] * C.offd[i][7] + + B.offd[i][2] * C.offd[i][8] + B.offd[i][4] * C.offd[i][9] + + B.offd[i][7] * C.diag[i][4] + B.offd[i][11] * conj(C.offd[i][14]); + + A.offd[i][8] = conj(B.offd[i][1]) * C.offd[i][6] + conj(B.offd[i][2]) * C.offd[i][7] + + B.diag[i][2] * C.offd[i][8] + B.offd[i][5] * C.offd[i][9] + + B.offd[i][8] * C.diag[i][4] + B.offd[i][12] * conj(C.offd[i][14]); + + A.offd[i][9] = conj(B.offd[i][3]) * C.offd[i][6] + conj(B.offd[i][4]) * C.offd[i][7] + + conj(B.offd[i][5]) * C.offd[i][8] + B.diag[i][3] * C.offd[i][9] + + B.offd[i][9] * C.diag[i][4] + B.offd[i][13] * conj(C.offd[i][14]); + + A.offd[i][10] = B.diag[i][0] * C.offd[i][10] + B.offd[i][0] * C.offd[i][11] + + B.offd[i][1] * C.offd[i][12] + B.offd[i][3] * C.offd[i][13] + + B.offd[i][6] * C.offd[i][14] + B.offd[i][10] * C.diag[i][5]; + + A.offd[i][11] = conj(B.offd[i][0]) * C.offd[i][10] + B.diag[i][1] * C.offd[i][11] + + B.offd[i][2] * C.offd[i][12] + B.offd[i][4] * C.offd[i][13] + + B.offd[i][7] * C.offd[i][14] + B.offd[i][11] * C.diag[i][5]; + + + A.offd[i][12] = conj(B.offd[i][1]) * C.offd[i][10] + conj(B.offd[i][2]) * C.offd[i][11] + + B.diag[i][2] * C.offd[i][12] + B.offd[i][5] * C.offd[i][13] + + B.offd[i][8] * C.offd[i][14] + B.offd[i][12] * C.diag[i][5]; + + A.offd[i][13] = conj(B.offd[i][3]) * C.offd[i][10] + conj(B.offd[i][4]) * C.offd[i][11] + + conj(B.offd[i][5]) * C.offd[i][12] + B.diag[i][3] * C.offd[i][13] + + B.offd[i][9] * C.offd[i][14] + B.offd[i][13] * C.diag[i][5]; + + A.offd[i][14] = conj(B.offd[i][6]) * C.offd[i][10] + conj(B.offd[i][7]) * C.offd[i][11] + + conj(B.offd[i][8]) * C.offd[i][12] + conj(B.offd[i][9]) * C.offd[i][13] + + B.diag[i][4] * C.offd[i][14] + B.offd[i][14] * C.diag[i][5]; + + } + + //! Computes Tr(A) + template + RScalar tri6_trace(const PrimitiveClovTriang& A, int comp){ + RScalar tr = 0.0; + for (size_t i = 0; i < 2*Nc; i++) + tr += A.diag[comp][i]; + return tr; + } + + // Computes Tr(A*B) using hermiticity of the matrices + template + RScalar tri6_trace_mul(const PrimitiveClovTriang& A, const PrimitiveClovTriang& B, int comp){ + RScalar trd = 0.0, tro = 0.0; + for (size_t i = 0; i < 2*Nc; i++) + trd += A.diag[comp][i] * B.diag[comp][i]; + + for (size_t i = 0; i < 2*Nc*Nc-Nc; i++) + tro += RScalar(A.offd[comp][i].real() * B.offd[comp][i].real() + A.offd[comp][i].imag() * B.offd[comp][i].imag()); + return trd + tro + tro; + } + + template + void exponentiate(PrimitiveClovTriang& A, int sign){ + const int Niter = 17; + PrimitiveClovTriang A2, A3, tmp; + for(size_t comp = 0; comp < 2; comp++){ + + tri6_mult(A2, A, A, comp); + tri6_mult(A3, A, A2, comp); + + std::array, 7> tr; + tr[0] = 0; + tr[1] = 0; + tr[2] = tri6_trace(A2, comp); + tr[3] = tri6_trace(A3, comp); + tr[4] = tri6_trace_mul(A2, A2, comp); + tr[5] = tri6_trace_mul(A2, A3, comp); + tr[6] = tri6_trace_mul(A3, A3, comp); + + std::array, 5> p; + p[0] = RScalar(-1.0/6.0)*tr[6] + RScalar(1.0/18.0)*tr[3]*tr[3] + - RScalar(1.0/48.0)*tr[2]*tr[2]*tr[2] + RScalar(1.0/8.0)*tr[4]*tr[2]; + p[1] = RScalar(-1.0/5.0)*tr[5] + RScalar(1.0/6.0) *tr[2]*tr[3]; + p[2] = RScalar(-1.0/4.0)*tr[4] + RScalar(1.0/8.0) *tr[2]*tr[2]; + p[3] = RScalar(-1.0/3.0)*tr[3]; + p[4] = RScalar(-1.0/2.0)*tr[2]; + + std::array, Niter+1> c; + c[0] = 1; + for (size_t i = 1; i < Niter+1; i++) + c[i] = c[i - 1] / RScalar(i); + if (sign == 1) + for (size_t i = 1; i < Niter+1; i += 2) + c[i] = -c[i]; + + std::array, 6> q; + if (Niter > 5){ + int ic = Niter - 6; + for (size_t i = 0; i < 6; i++) + q[i] = c[ic+1+i]; + + while (ic >= 0) { + RScalar q5 = q[5]; + q[5] = q[4]; + q[4] = q[3] - q5 * p[4]; + q[3] = q[2] - q5 * p[3]; + q[2] = q[1] - q5 * p[2]; + q[1] = q[0] - q5 * p[1]; + q[0] = c[ic] - q5 * p[0]; + ic -= 1; + } + } + else { + QDPIO::cerr << "error" << std::endl; + } + + // I*q0 + A*q1 + A^2*q2 + A^3*q3 + for (size_t i = 0; i < 2*Nc; i++) + A.diag[comp][i] = q[0] + A.diag[comp][i]*q[1] + A2.diag[comp][i]*q[2] + A3.diag[comp][i]*q[3]; + for (size_t i = 0; i < 2*Nc*Nc-Nc; i++) + A.offd[comp][i] = A.offd[comp][i]*q[1] + A2.offd[comp][i]*q[2] + A3.offd[comp][i]*q[3]; + + // A^4*q4 + tri6_mult(tmp, A2, A2, comp); + for (size_t i = 0; i < 2*Nc; i++) + A.diag[comp][i] += tmp.diag[comp][i]*q[4]; + for (size_t i = 0; i < 2*Nc*Nc-Nc; i++) + A.offd[comp][i] += tmp.offd[comp][i]*q[4]; + + // A^5*q5 + tri6_mult(tmp, A3, A2, comp); + for (size_t i = 0; i < 2*Nc; i++) + A.diag[comp][i] += tmp.diag[comp][i]*q[5]; + for (size_t i = 0; i < 2*Nc*Nc-Nc; i++) + A.offd[comp][i] += tmp.offd[comp][i]*q[5]; + } + } +} + +#endif \ No newline at end of file From 204642a7f459a0c60268e57e58455d012ea9a889 Mon Sep 17 00:00:00 2001 From: Giovanni Pederiva Date: Fri, 11 Jun 2021 13:10:02 +0200 Subject: [PATCH 2/3] Modified QDPCloverTerm to support SWF --- lib/Makefile.am | 1 + .../ferm/fermacts/clover_fermact_params_w.cc | 14 +++- .../ferm/fermacts/clover_fermact_params_w.h | 2 + lib/actions/ferm/linop/clover_term_qdp_w.h | 64 +++++++++++++++---- 4 files changed, 69 insertions(+), 12 deletions(-) diff --git a/lib/Makefile.am b/lib/Makefile.am index 1f1d2a450..bc21a5a30 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -575,6 +575,7 @@ nobase_include_HEADERS += \ actions/ferm/linop/clover_term_w.h \ actions/ferm/linop/clover_term_base_w.h \ actions/ferm/linop/clover_term_qdp_w.h \ + actions/ferm/linop/clover_term_qdp_stabilized_helpers.h \ actions/ferm/linop/eoprec_clover_linop_w.h \ actions/ferm/linop/seoprec_clover_linop_w.h \ actions/ferm/linop/shifted_linop_w.h \ diff --git a/lib/actions/ferm/fermacts/clover_fermact_params_w.cc b/lib/actions/ferm/fermacts/clover_fermact_params_w.cc index bd057c8a1..f0bffe5dc 100644 --- a/lib/actions/ferm/fermacts/clover_fermact_params_w.cc +++ b/lib/actions/ferm/fermacts/clover_fermact_params_w.cc @@ -20,7 +20,7 @@ namespace Chroma max_norm_usedP=false; twisted_m_usedP = false; twisted_m = Real(0); - + stabilized_wilson = false; } //! Read parameters @@ -96,6 +96,14 @@ namespace Chroma twisted_m_usedP = false; } + if( paramtop.count("Stabilized") != 0 ) { + stabilized_wilson = true; + read(paramtop, "Stabilized", stabilized_wilson); + } + else { + stabilized_wilson = false; + } + } //! Read parameters @@ -133,6 +141,10 @@ namespace Chroma write(xml, "TwistedM", param.twisted_m); } + if (param.stabilized_wilson == true ) { + write(xml, "Stabilized", param.stabilized_wilson); + } + pop(xml); } diff --git a/lib/actions/ferm/fermacts/clover_fermact_params_w.h b/lib/actions/ferm/fermacts/clover_fermact_params_w.h index 0553ffb78..1fd9cf0ff 100644 --- a/lib/actions/ferm/fermacts/clover_fermact_params_w.h +++ b/lib/actions/ferm/fermacts/clover_fermact_params_w.h @@ -32,6 +32,8 @@ namespace Chroma Real twisted_m; bool twisted_m_usedP; + bool stabilized_wilson; + }; diff --git a/lib/actions/ferm/linop/clover_term_qdp_w.h b/lib/actions/ferm/linop/clover_term_qdp_w.h index 55cd33b75..5ce9223de 100644 --- a/lib/actions/ferm/linop/clover_term_qdp_w.h +++ b/lib/actions/ferm/linop/clover_term_qdp_w.h @@ -11,6 +11,7 @@ #include "actions/ferm/fermacts/clover_fermact_params_w.h" #include "actions/ferm/linop/clover_term_base_w.h" #include "meas/glue/mesfield.h" +#include "actions/ferm/linop/clover_term_qdp_stabilized_helpers.h" #include namespace Chroma { @@ -132,7 +133,7 @@ namespace Chroma * \param f field strength tensor F(mu,nu) (Read) * \param cb checkerboard (Read) */ - void makeClov(const multi1d& f, const RealT& diag_mass); + void makeClov(const multi1d& f, const RealT& diag_mass, const bool stabilized); //! Invert the clover term on cb void chlclovms(LatticeREAL& log_diag, int cb); @@ -280,7 +281,7 @@ namespace Chroma /* Calculate F(mu,nu) */ multi1d f; mesField(f, u); - makeClov(f, diag_mass); + makeClov(f, diag_mass, param.stabilized_wilson); choles_done.resize(rb.numSubsets()); @@ -390,6 +391,7 @@ namespace Chroma const U& f3; const U& f4; const U& f5; + const bool stabilized; PrimitiveClovTriang < REALT >* tri; }; @@ -410,19 +412,32 @@ namespace Chroma const U& f3=a->f3; const U& f4=a->f4; const U& f5=a->f5; + const bool stabilized=a->stabilized; PrimitiveClovTriang < REALT >* tri=a->tri; // SITE LOOP STARTS HERE for(int site = lo; site < hi; ++site) { /*# Construct diagonal */ - for(int jj = 0; jj < 2; jj++) { - - for(int ii = 0; ii < 2*Nc; ii++) { - - tri[site].diag[jj][ii] = diag_mass.elem().elem().elem(); - } - } + if(stabilized == true){ + for(int jj = 0; jj < 2; jj++) { + + for(int ii = 0; ii < 2*Nc; ii++) { + + tri[site].diag[jj][ii] = 0; + } + } + } + else{ + for(int jj = 0; jj < 2; jj++) { + + for(int ii = 0; ii < 2*Nc; ii++) { + + tri[site].diag[jj][ii] = diag_mass.elem().elem().elem(); + } + } + } + @@ -516,6 +531,24 @@ namespace Chroma tri[site].offd[1][elem_ij] = E_minus + B_minus; } } + + + // Apply exponential + if(stabilized == true){ + exponentiate(tri[site], 0); + // fix constants here + for(int jj = 0; jj < 2; jj++) { + for(int ii = 0; ii < 6; ii++) { + tri[site].diag[jj][ii] *= diag_mass.elem().elem().elem(); + } + } + for(int jj = 0; jj < 2; jj++) { + for(int ii = 0; ii < 15; ii++) { + tri[site].offd[jj][ii] *= diag_mass.elem().elem().elem(); + } + } + } + } /* End Site loop */ #endif } /* Function */ @@ -523,7 +556,7 @@ namespace Chroma /* This now just sets up and dispatches... */ template - void QDPCloverTermT::makeClov(const multi1d& f, const RealT& diag_mass) + void QDPCloverTermT::makeClov(const multi1d& f, const RealT& diag_mass, const bool stabilized) { START_CODE(); @@ -544,8 +577,17 @@ namespace Chroma U f4 = f[4] * getCloverCoeff(1,3); U f5 = f[5] * getCloverCoeff(2,3); + if (stabilized == true){ + f0 /= diag_mass; + f1 /= diag_mass; + f2 /= diag_mass; + f3 /= diag_mass; + f4 /= diag_mass; + f5 /= diag_mass; + } + const int nodeSites = QDP::Layout::sitesOnNode(); - QDPCloverEnv::QDPCloverMakeClovArg arg = {diag_mass, f0,f1,f2,f3,f4,f5,tri }; + QDPCloverEnv::QDPCloverMakeClovArg arg = {diag_mass, f0,f1,f2,f3,f4,f5,stabilized,tri }; dispatch_to_threads(nodeSites, arg, QDPCloverEnv::makeClovSiteLoop); From 66e0e3db4b792dafa5e1156d4fa41b368373db82 Mon Sep 17 00:00:00 2001 From: Giovanni Pederiva Date: Tue, 13 Feb 2024 12:33:36 +0100 Subject: [PATCH 3/3] updated jit expo clover to latest qdpjit --- lib/CMakeLists.txt | 5 +- .../clover_term_jit_stabilized_helpers.h | 411 ++++++++++++++++++ lib/actions/ferm/linop/clover_term_jit_w.h | 56 ++- 3 files changed, 461 insertions(+), 11 deletions(-) create mode 100644 lib/actions/ferm/linop/clover_term_jit_stabilized_helpers.h diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index b6bb2f080..e894cd14c 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1920,7 +1920,10 @@ endif() # JIT CLOVER/STOUTING ####################################################### if( Chroma_ENABLE_JIT_CLOVER ) - list(APPEND ChromaLIB_HEADERS actions/ferm/linop/clover_term_jit_w.h) + list(APPEND ChromaLIB_HEADERS + actions/ferm/linop/clover_term_jit_w.h + actions/ferm/linop/clover_term_jit_stabilized_helpers.h + ) target_sources(chromalib PRIVATE util/gauge/stout_utils_jit.cc) endif() diff --git a/lib/actions/ferm/linop/clover_term_jit_stabilized_helpers.h b/lib/actions/ferm/linop/clover_term_jit_stabilized_helpers.h new file mode 100644 index 000000000..01182fdbc --- /dev/null +++ b/lib/actions/ferm/linop/clover_term_jit_stabilized_helpers.h @@ -0,0 +1,411 @@ +//! Exponential of triangular form matrix for spinors + +#ifndef __clover_term_jit_stabilized_helpers_h__ +#define __clover_term_jit_stabilized_helpers_h__ + +#include "state.h" + +namespace QDP{ + template struct PCompJIT; + template struct PTriDiaJIT; + template struct PTriOffJIT; + template struct PCompREG; + template struct PTriDiaREG; + template struct PTriOffREG; +} + + +namespace Chroma{ + + template + void tri6_mult(QDP::PCompREG>>>& ADia, + QDP::PCompREG>>>& AOff, + QDP::PCompREG>>>& BDia, + QDP::PCompREG>>>& BOff, + QDP::PCompREG>>>& CDia, + QDP::PCompREG>>>& COff, + int i){ + // Diagonal Terms: + + ADia.elem(i).elem(0) = BDia.elem(i).elem(0) * CDia.elem(i).elem(0) + + real(BOff.elem(i).elem(0) ) * real(COff.elem(i).elem(0) ) + imag(BOff.elem(i).elem(0) ) * imag(COff.elem(i).elem(0) ) + + real(BOff.elem(i).elem(1) ) * real(COff.elem(i).elem(1) ) + imag(BOff.elem(i).elem(1) ) * imag(COff.elem(i).elem(1) ) + + real(BOff.elem(i).elem(3) ) * real(COff.elem(i).elem(3) ) + imag(BOff.elem(i).elem(3) ) * imag(COff.elem(i).elem(3) ) + + real(BOff.elem(i).elem(6) ) * real(COff.elem(i).elem(6) ) + imag(BOff.elem(i).elem(6) ) * imag(COff.elem(i).elem(6) ) + + real(BOff.elem(i).elem(10)) * real(COff.elem(i).elem(10)) + imag(BOff.elem(i).elem(10)) * imag(COff.elem(i).elem(10)); + + ADia.elem(i).elem(1) = BDia.elem(i).elem(1) * CDia.elem(i).elem(1) + + real(BOff.elem(i).elem(0) ) * real(COff.elem(i).elem(0) ) + imag(BOff.elem(i).elem(0) ) * imag(COff.elem(i).elem(0) ) + + real(BOff.elem(i).elem(2) ) * real(COff.elem(i).elem(2) ) + imag(BOff.elem(i).elem(2) ) * imag(COff.elem(i).elem(2) ) + + real(BOff.elem(i).elem(4) ) * real(COff.elem(i).elem(4) ) + imag(BOff.elem(i).elem(4) ) * imag(COff.elem(i).elem(4) ) + + real(BOff.elem(i).elem(7) ) * real(COff.elem(i).elem(7) ) + imag(BOff.elem(i).elem(7) ) * imag(COff.elem(i).elem(7) ) + + real(BOff.elem(i).elem(11)) * real(COff.elem(i).elem(11)) + imag(BOff.elem(i).elem(11)) * imag(COff.elem(i).elem(11)); + + ADia.elem(i).elem(2) = BDia.elem(i).elem(2) * CDia.elem(i).elem(2) + + real(BOff.elem(i).elem(1) ) * real(COff.elem(i).elem(1) ) + imag(BOff.elem(i).elem(1) ) * imag(COff.elem(i).elem(1) ) + + real(BOff.elem(i).elem(2) ) * real(COff.elem(i).elem(2) ) + imag(BOff.elem(i).elem(2) ) * imag(COff.elem(i).elem(2) ) + + real(BOff.elem(i).elem(5) ) * real(COff.elem(i).elem(5) ) + imag(BOff.elem(i).elem(5) ) * imag(COff.elem(i).elem(5) ) + + real(BOff.elem(i).elem(8) ) * real(COff.elem(i).elem(8) ) + imag(BOff.elem(i).elem(8) ) * imag(COff.elem(i).elem(8) ) + + real(BOff.elem(i).elem(12)) * real(COff.elem(i).elem(12)) + imag(BOff.elem(i).elem(12)) * imag(COff.elem(i).elem(12)); + + ADia.elem(i).elem(3) = BDia.elem(i).elem(3) * CDia.elem(i).elem(3) + + real(BOff.elem(i).elem(3) ) * real(COff.elem(i).elem(3) ) + imag(BOff.elem(i).elem(3) ) * imag(COff.elem(i).elem(3) ) + + real(BOff.elem(i).elem(4) ) * real(COff.elem(i).elem(4) ) + imag(BOff.elem(i).elem(4) ) * imag(COff.elem(i).elem(4) ) + + real(BOff.elem(i).elem(5) ) * real(COff.elem(i).elem(5) ) + imag(BOff.elem(i).elem(5) ) * imag(COff.elem(i).elem(5) ) + + real(BOff.elem(i).elem(9) ) * real(COff.elem(i).elem(9) ) + imag(BOff.elem(i).elem(9) ) * imag(COff.elem(i).elem(9) ) + + real(BOff.elem(i).elem(13)) * real(COff.elem(i).elem(13)) + imag(BOff.elem(i).elem(13)) * imag(COff.elem(i).elem(13)); + + ADia.elem(i).elem(4) = BDia.elem(i).elem(4) * CDia.elem(i).elem(4) + + real(BOff.elem(i).elem(6) ) * real(COff.elem(i).elem(6) ) + imag(BOff.elem(i).elem(6) ) * imag(COff.elem(i).elem(6) ) + + real(BOff.elem(i).elem(7) ) * real(COff.elem(i).elem(7) ) + imag(BOff.elem(i).elem(7) ) * imag(COff.elem(i).elem(7) ) + + real(BOff.elem(i).elem(8) ) * real(COff.elem(i).elem(8) ) + imag(BOff.elem(i).elem(8) ) * imag(COff.elem(i).elem(8) ) + + real(BOff.elem(i).elem(9) ) * real(COff.elem(i).elem(9) ) + imag(BOff.elem(i).elem(9) ) * imag(COff.elem(i).elem(9) ) + + real(BOff.elem(i).elem(14)) * real(COff.elem(i).elem(14)) + imag(BOff.elem(i).elem(14)) * imag(COff.elem(i).elem(14)); + + ADia.elem(i).elem(5) = BDia.elem(i).elem(5) * CDia.elem(i).elem(5) + + real(BOff.elem(i).elem(10)) * real(COff.elem(i).elem(10)) + imag(BOff.elem(i).elem(10)) * imag(COff.elem(i).elem(10)) + + real(BOff.elem(i).elem(11)) * real(COff.elem(i).elem(11)) + imag(BOff.elem(i).elem(11)) * imag(COff.elem(i).elem(11)) + + real(BOff.elem(i).elem(12)) * real(COff.elem(i).elem(12)) + imag(BOff.elem(i).elem(12)) * imag(COff.elem(i).elem(12)) + + real(BOff.elem(i).elem(13)) * real(COff.elem(i).elem(13)) + imag(BOff.elem(i).elem(13)) * imag(COff.elem(i).elem(13)) + + real(BOff.elem(i).elem(14)) * real(COff.elem(i).elem(14)) + imag(BOff.elem(i).elem(14)) * imag(COff.elem(i).elem(14)); + + // Off-diagonal Terms: + AOff.elem(i).elem(0) = BDia.elem(i).elem(0) * COff.elem(i).elem(0) + + BOff.elem(i).elem(0) * CDia.elem(i).elem(1) + + BOff.elem(i).elem(1) * conj(COff.elem(i).elem(2)) + + BOff.elem(i).elem(3) * conj(COff.elem(i).elem(4)) + + BOff.elem(i).elem(6) * conj(COff.elem(i).elem(7)) + + BOff.elem(i).elem(10)* conj(COff.elem(i).elem(11)); + // A.offd[i][0] = B.diag[i][0] * C.offd[i][0] + // + B.offd[i][0] * C.diag[i][1] + // + B.offd[i][1] * conj(C.offd[i][2]) + // + B.offd[i][3] * conj(C.offd[i][4]) + // + B.offd[i][6] * conj(C.offd[i][7]) + // + B.offd[i][10] * conj(C.offd[i][11]); + + AOff.elem(i).elem(1) = BDia.elem(i).elem(0) * COff.elem(i).elem(1) + + BOff.elem(i).elem(0) * COff.elem(i).elem(2) + + BOff.elem(i).elem(1) * CDia.elem(i).elem(2) + + BOff.elem(i).elem(3) * conj(COff.elem(i).elem(5)) + + BOff.elem(i).elem(6) * conj(COff.elem(i).elem(8)) + + BOff.elem(i).elem(10)* conj(COff.elem(i).elem(12)); + // A.offd[i][1] = B.diag[i][0] * C.offd[i][1] + // + B.offd[i][0] * C.offd[i][2] + // + B.offd[i][1] * C.diag[i][2] + // + B.offd[i][3] * conj(C.offd[i][5]) + // + B.offd[i][6] * conj(C.offd[i][8]) + // + B.offd[i][10] * conj(C.offd[i][12]); + + AOff.elem(i).elem(2) = conj(BOff.elem(i).elem(0)) * COff.elem(i).elem(1) + + BDia.elem(i).elem(1) * COff.elem(i).elem(2) + + BOff.elem(i).elem(2) * CDia.elem(i).elem(2) + + BOff.elem(i).elem(4) * conj(COff.elem(i).elem(5)) + + BOff.elem(i).elem(7) * conj(COff.elem(i).elem(8)) + + BOff.elem(i).elem(11) * conj(COff.elem(i).elem(12)); + // A.offd[i][2] = conj(B.offd[i][0]) * C.offd[i][1] + // + B.diag[i][1] * C.offd[i][2] + // + B.offd[i][2] * C.diag[i][2] + // + B.offd[i][4] * conj(C.offd[i][5]) + // + B.offd[i][7] * conj(C.offd[i][8]) + // + B.offd[i][11] * conj(C.offd[i][12]); + + AOff.elem(i).elem(3) = BDia.elem(i).elem(0) * COff.elem(i).elem(3) + + BOff.elem(i).elem(0) * COff.elem(i).elem(4) + + BOff.elem(i).elem(1) * COff.elem(i).elem(5) + + BOff.elem(i).elem(3) * CDia.elem(i).elem(3) + + BOff.elem(i).elem(6) * conj(COff.elem(i).elem(9)) + + BOff.elem(i).elem(10)* conj(COff.elem(i).elem(13)); + // A.offd[i][3] = B.diag[i][0] * C.offd[i][3] + // + B.offd[i][0] * C.offd[i][4] + // + B.offd[i][1] * C.offd[i][5] + // + B.offd[i][3] * C.diag[i][3] + // + B.offd[i][6] * conj(C.offd[i][9]) + // + B.offd[i][10] * conj(C.offd[i][13]); + + AOff.elem(i).elem(4) = conj(BOff.elem(i).elem(0)) * COff.elem(i).elem(3) + + BDia.elem(i).elem(1) * COff.elem(i).elem(4) + + BOff.elem(i).elem(2) * COff.elem(i).elem(5) + + BOff.elem(i).elem(4) * CDia.elem(i).elem(3) + + BOff.elem(i).elem(7) * conj(COff.elem(i).elem(9)) + + BOff.elem(i).elem(11) * conj(COff.elem(i).elem(13)); + // A.offd[i][4] = conj(B.offd[i][0]) * C.offd[i][3] + // + B.diag[i][1] * C.offd[i][4] + // + B.offd[i][2] * C.offd[i][5] + // + B.offd[i][4] * C.diag[i][3] + // + B.offd[i][7] * conj(C.offd[i][9]) + // + B.offd[i][11] * conj(C.offd[i][13]); + + AOff.elem(i).elem(5) = conj(BOff.elem(i).elem(1)) * COff.elem(i).elem(3) + + conj(BOff.elem(i).elem(2)) * COff.elem(i).elem(4) + + BDia.elem(i).elem(2) * COff.elem(i).elem(5) + + BOff.elem(i).elem(5) * CDia.elem(i).elem(3) + + BOff.elem(i).elem(8) * conj(COff.elem(i).elem(9)) + + BOff.elem(i).elem(12) * conj(COff.elem(i).elem(13)); + // A.offd[i][5] = conj(B.offd[i][1]) * C.offd[i][3] + // + conj(B.offd[i][2]) * C.offd[i][4] + // + B.diag[i][2] * C.offd[i][5] + // + B.offd[i][5] * C.diag[i][3] + // + B.offd[i][8] * conj(C.offd[i][9]) + // + B.offd[i][12] * conj(C.offd[i][13]); + + AOff.elem(i).elem(6) = BDia.elem(i).elem(0) * COff.elem(i).elem(6) + + BOff.elem(i).elem(0) * COff.elem(i).elem(7) + + BOff.elem(i).elem(1) * COff.elem(i).elem(8) + + BOff.elem(i).elem(3) * COff.elem(i).elem(9) + + BOff.elem(i).elem(6) * CDia.elem(i).elem(4) + + BOff.elem(i).elem(10) * conj(COff.elem(i).elem(14)); + // A.offd[i][6] = B.diag[i][0] * C.offd[i][6] + // + B.offd[i][0] * C.offd[i][7] + // + B.offd[i][1] * C.offd[i][8] + // + B.offd[i][3] * C.offd[i][9] + // + B.offd[i][6] * C.diag[i][4] + // + B.offd[i][10] * conj(C.offd[i][14]); + + AOff.elem(i).elem(7) = conj(BOff.elem(i).elem(0)) * COff.elem(i).elem(6) + + BDia.elem(i).elem(1) * COff.elem(i).elem(7) + + BOff.elem(i).elem(2) * COff.elem(i).elem(8) + + BOff.elem(i).elem(4) * COff.elem(i).elem(9) + + BOff.elem(i).elem(7) * CDia.elem(i).elem(4) + + BOff.elem(i).elem(11) * conj(COff.elem(i).elem(14)); + // A.offd[i][7] = conj(B.offd[i][0]) * C.offd[i][6] + // + B.diag[i][1] * C.offd[i][7] + // + B.offd[i][2] * C.offd[i][8] + // + B.offd[i][4] * C.offd[i][9] + // + B.offd[i][7] * C.diag[i][4] + // + B.offd[i][11] * conj(C.offd[i][14]); + + AOff.elem(i).elem(8) = conj(BOff.elem(i).elem(1)) * COff.elem(i).elem(6) + + conj(BOff.elem(i).elem(2)) * COff.elem(i).elem(7) + + BDia.elem(i).elem(2) * COff.elem(i).elem(8) + + BOff.elem(i).elem(5) * COff.elem(i).elem(9) + + BOff.elem(i).elem(8) * CDia.elem(i).elem(4) + + BOff.elem(i).elem(12) * conj(COff.elem(i).elem(14)); + // A.offd[i][8] = conj(B.offd[i][1]) * C.offd[i][6] + // + conj(B.offd[i][2]) * C.offd[i][7] + // + B.diag[i][2] * C.offd[i][8] + // + B.offd[i][5] * C.offd[i][9] + // + B.offd[i][8] * C.diag[i][4] + // + B.offd[i][12] * conj(C.offd[i][14]); + + AOff.elem(i).elem(9) = conj(BOff.elem(i).elem(3)) * COff.elem(i).elem(6) + + conj(BOff.elem(i).elem(4)) * COff.elem(i).elem(7) + + conj(BOff.elem(i).elem(5)) * COff.elem(i).elem(8) + + BDia.elem(i).elem(3) * COff.elem(i).elem(9) + + BOff.elem(i).elem(9) * CDia.elem(i).elem(4) + + BOff.elem(i).elem(13) * conj(COff.elem(i).elem(14)); + // A.offd[i][9] = conj(B.offd[i][3]) * C.offd[i][6] + // + conj(B.offd[i][4]) * C.offd[i][7] + // + conj(B.offd[i][5]) * C.offd[i][8] + // + B.diag[i][3] * C.offd[i][9] + // + B.offd[i][9] * C.diag[i][4] + // + B.offd[i][13] * conj(C.offd[i][14]); + + AOff.elem(i).elem(10) = BDia.elem(i).elem(0) * COff.elem(i).elem(10) + + BOff.elem(i).elem(0) * COff.elem(i).elem(11) + + BOff.elem(i).elem(1) * COff.elem(i).elem(12) + + BOff.elem(i).elem(3) * COff.elem(i).elem(13) + + BOff.elem(i).elem(6) * COff.elem(i).elem(14) + + BOff.elem(i).elem(10) * CDia.elem(i).elem(5); + // A.offd[i][10] = B.diag[i][0] * C.offd[i][10] + // + B.offd[i][0] * C.offd[i][11] + // + B.offd[i][1] * C.offd[i][12] + // + B.offd[i][3] * C.offd[i][13] + // + B.offd[i][6] * C.offd[i][14] + // + B.offd[i][10] * C.diag[i][5]; + + AOff.elem(i).elem(11) = conj(BOff.elem(i).elem(0)) * COff.elem(i).elem(10) + + BDia.elem(i).elem(1) * COff.elem(i).elem(11) + + BOff.elem(i).elem(2) * COff.elem(i).elem(12) + + BOff.elem(i).elem(4) * COff.elem(i).elem(13) + + BOff.elem(i).elem(7) * COff.elem(i).elem(14) + + BOff.elem(i).elem(11) * CDia.elem(i).elem(5); + // A.offd[i][11] = conj(B.offd[i][0]) * C.offd[i][10] + // + B.diag[i][1] * C.offd[i][11] + // + B.offd[i][2] * C.offd[i][12] + // + B.offd[i][4] * C.offd[i][13] + // + B.offd[i][7] * C.offd[i][14] + // + B.offd[i][11] * C.diag[i][5]; + + AOff.elem(i).elem(12) = conj(BOff.elem(i).elem(1)) * COff.elem(i).elem(10) + + conj(BOff.elem(i).elem(2)) * COff.elem(i).elem(11) + + BDia.elem(i).elem(2) * COff.elem(i).elem(12) + + BOff.elem(i).elem(5) * COff.elem(i).elem(13) + + BOff.elem(i).elem(8) * COff.elem(i).elem(14) + + BOff.elem(i).elem(12) * CDia.elem(i).elem(5); + // A.offd[i][12] = conj(B.offd[i][1]) * C.offd[i][10] + // + conj(B.offd[i][2]) * C.offd[i][11] + // + B.diag[i][2] * C.offd[i][12] + // + B.offd[i][5] * C.offd[i][13] + // + B.offd[i][8] * C.offd[i][14] + // + B.offd[i][12] * C.diag[i][5]; + + AOff.elem(i).elem(13) = conj(BOff.elem(i).elem(3)) * COff.elem(i).elem(10) + + conj(BOff.elem(i).elem(4)) * COff.elem(i).elem(11) + + conj(BOff.elem(i).elem(5)) * COff.elem(i).elem(12) + + BDia.elem(i).elem(3) * COff.elem(i).elem(13) + + BOff.elem(i).elem(9) * COff.elem(i).elem(14) + + BOff.elem(i).elem(13) * CDia.elem(i).elem(5); + // A.offd[i][13] = conj(B.offd[i][3]) * C.offd[i][10] + // + conj(B.offd[i][4]) * C.offd[i][11] + // + conj(B.offd[i][5]) * C.offd[i][12] + // + B.diag[i][3] * C.offd[i][13] + // + B.offd[i][9] * C.offd[i][14] + // + B.offd[i][13] * C.diag[i][5]; + + AOff.elem(i).elem(14) = conj(BOff.elem(i).elem(6)) * COff.elem(i).elem(10) + + conj(BOff.elem(i).elem(7)) * COff.elem(i).elem(11) + + conj(BOff.elem(i).elem(8)) * COff.elem(i).elem(12) + + conj(BOff.elem(i).elem(9)) * COff.elem(i).elem(13) + + BDia.elem(i).elem(4) * COff.elem(i).elem(14) + + BOff.elem(i).elem(14) * CDia.elem(i).elem(5); + // A.offd[i][14] = conj(B.offd[i][6]) * C.offd[i][10] + // + conj(B.offd[i][7]) * C.offd[i][11] + // + conj(B.offd[i][8]) * C.offd[i][12] + // + conj(B.offd[i][9]) * C.offd[i][13] + // + B.diag[i][4] * C.offd[i][14] + // + B.offd[i][14] * C.diag[i][5]; + + } + + //! Computes Tr(A) + template + RScalarREG > tri6_trace( QDP::PCompREG>>>& ADia, + QDP::PCompREG>>>& AOff, + int comp){ + RScalarREG > tr = 0.0; + for (size_t i = 0; i < 2*Nc; i++) + tr += ADia.elem(comp).elem(i); + return tr; + } + + // Computes Tr(A*B) using hermiticity of the matrices + template + RScalarREG > tri6_trace_mul( QDP::PCompREG>>>& ADia, + QDP::PCompREG>>>& AOff, + QDP::PCompREG>>>& BDia, + QDP::PCompREG>>>& BOff, + int comp){ + RScalarREG > trd = 0.0, tro = 0.0; + for (size_t i = 0; i < 2*Nc; i++) + trd += ADia.elem(comp).elem(i) * BDia.elem(comp).elem(i); + + for (size_t i = 0; i < 2*Nc*Nc-Nc; i++) + tro += real(AOff.elem(comp).elem(i)) * real(BOff.elem(comp).elem(i)) + + imag(AOff.elem(comp).elem(i)) * imag(BOff.elem(comp).elem(i)); + return trd + tro + tro; + } + + template + void exponentiate(QDP::PCompJIT>>>& ADia_Jit, + QDP::PCompJIT>>>& AOff_Jit, + int sign){ + const int Niter = 17; + QDP::PCompREG>>> ADia, A2Dia, A3Dia, tmpDia; + QDP::PCompREG>>> AOff, A2Off, A3Off, tmpOff; + + for(size_t comp = 0; comp < 2; comp++){ + for(size_t i = 0; i < 2*Nc; i++){ + ADia.elem(comp).elem(i) = ADia_Jit.elem(comp).elem(i); + } + for(size_t i = 0; i < 2*Nc*Nc-Nc; i++){ + AOff.elem(comp).elem(i) = AOff_Jit.elem(comp).elem(i); + } + } + + for(size_t comp = 0; comp < 2; comp++){ + + tri6_mult(A2Dia, A2Off, ADia, AOff, ADia, AOff, comp); + tri6_mult(A3Dia, A3Off, ADia, AOff, A2Dia, A2Off, comp); + + RScalarREG> tr[7]; + tr[0] = 0; + tr[1] = 0; + tr[2] = tri6_trace(A2Dia, A2Off, comp); + tr[3] = tri6_trace(A3Dia, A3Off, comp); + tr[4] = tri6_trace_mul(A2Dia, A2Off, A2Dia, A2Off, comp); + tr[5] = tri6_trace_mul(A2Dia, A2Off, A3Dia, A3Off, comp); + tr[6] = tri6_trace_mul(A3Dia, A3Off, A3Dia, A3Off, comp); + + RScalarREG> p[5]; + p[0] = RScalarREG >(-1.0/6.0)*tr[6] + RScalarREG >(1.0/18.0)*tr[3]*tr[3] + - RScalarREG >(1.0/48.0)*tr[2]*tr[2]*tr[2] + RScalarREG >(1.0/8.0)*tr[4]*tr[2]; + p[1] = RScalarREG >(-1.0/5.0)*tr[5] + RScalarREG >(1.0/6.0) *tr[2]*tr[3]; + p[2] = RScalarREG >(-1.0/4.0)*tr[4] + RScalarREG >(1.0/8.0) *tr[2]*tr[2]; + p[3] = RScalarREG >(-1.0/3.0)*tr[3]; + p[4] = RScalarREG >(-1.0/2.0)*tr[2]; + + RScalarREG> c[Niter+1]; + c[0] = 1; + for (size_t i = 1; i < Niter+1; i++) + c[i] = c[i - 1] / RScalarREG >(i); + if (sign == 1) + for (size_t i = 1; i < Niter+1; i += 2) + c[i] = -c[i]; + + RScalarREG> q[6]; + if (Niter > 5){ + int ic = Niter - 6; + for (size_t i = 0; i < 6; i++) + q[i] = c[ic+1+i]; + + while (ic >= 0) { + RScalarREG > q5 = q[5]; + q[5] = q[4]; + q[4] = q[3] - q5 * p[4]; + q[3] = q[2] - q5 * p[3]; + q[2] = q[1] - q5 * p[2]; + q[1] = q[0] - q5 * p[1]; + q[0] = c[ic] - q5 * p[0]; + ic -= 1; + } + } + else { + QDPIO::cerr << "error" << std::endl; + } + + // I*q0 + A*q1 + A^2*q2 + A^3*q3 + for (size_t i = 0; i < 2*Nc; i++) + ADia.elem(comp).elem(i) = q[0] + + ADia.elem(comp).elem(i)*q[1] + + A2Dia.elem(comp).elem(i)*q[2] + + A3Dia.elem(comp).elem(i)*q[3]; + // A.diag[comp][i] = q[0] + A.diag[comp][i]*q[1] + A2.diag[comp][i]*q[2] + A3.diag[comp][i]*q[3]; + for (size_t i = 0; i < 2*Nc*Nc-Nc; i++) + AOff.elem(comp).elem(i) = AOff.elem(comp).elem(i)*q[1] + + A2Off.elem(comp).elem(i)*q[2] + + A3Off.elem(comp).elem(i)*q[3]; + // A.offd[comp][i] = A.offd[comp][i]*q[1] + A2.offd[comp][i]*q[2] + A3.offd[comp][i]*q[3]; + + // A^4*q4 + tri6_mult(tmpDia, tmpOff, A2Dia, A2Off, A2Dia, A2Off, comp); + for (size_t i = 0; i < 2*Nc; i++) + ADia.elem(comp).elem(i) += tmpDia.elem(comp).elem(i)*q[4]; + // A.diag[comp][i] += tmp.diag[comp][i]*q[4]; + for (size_t i = 0; i < 2*Nc*Nc-Nc; i++) + AOff.elem(comp).elem(i) += tmpOff.elem(comp).elem(i)*q[4]; + // A.offd[comp][i] += tmp.offd[comp][i]*q[4]; + + // A^5*q5 + tri6_mult(tmpDia, tmpOff, A3Dia, A3Off, A2Dia, A2Off, comp); + for (size_t i = 0; i < 2*Nc; i++) + ADia.elem(comp).elem(i) += tmpDia.elem(comp).elem(i)*q[5]; + // A.diag[comp][i] += tmp.diag[comp][i]*q[5]; + for (size_t i = 0; i < 2*Nc*Nc-Nc; i++) + AOff.elem(comp).elem(i) += tmpOff.elem(comp).elem(i)*q[5]; + // A.offd[comp][i] += tmp.offd[comp][i]*q[5]; + + + } + + for(size_t comp = 0; comp < 2; comp++){ + for(size_t i = 0; i < 2*Nc; i++){ + ADia_Jit.elem(comp).elem(i) = ADia.elem(comp).elem(i); + } + for(size_t i = 0; i < 2*Nc*Nc-Nc; i++){ + AOff_Jit.elem(comp).elem(i) = AOff.elem(comp).elem(i); + } + } + } +} + +#endif diff --git a/lib/actions/ferm/linop/clover_term_jit_w.h b/lib/actions/ferm/linop/clover_term_jit_w.h index d62ebb3ee..fbc0a4bf9 100644 --- a/lib/actions/ferm/linop/clover_term_jit_w.h +++ b/lib/actions/ferm/linop/clover_term_jit_w.h @@ -6,11 +6,12 @@ #ifndef __clover_term_jit_w_h__ #define __clover_term_jit_w_h__ -//#warning "Using QDP-JIT clover term" +#warning "Using QDP-JIT clover term" #include "state.h" #include "actions/ferm/fermacts/clover_fermact_params_w.h" #include "actions/ferm/linop/clover_term_base_w.h" +#include "actions/ferm/linop/clover_term_jit_stabilized_helpers.h" #include "meas/glue/mesfield.h" @@ -421,7 +422,7 @@ namespace Chroma * \param f field strength tensor F(mu,nu) (Read) * \param cb checkerboard (Read) */ - void makeClov(const multi1d& f, const RealT& diag_mass); + void makeClov(const multi1d& f, const RealT& diag_mass, const bool stabilized); //! Invert the clover term on cb //void chlclovms(LatticeREAL& log_diag, int cb); @@ -556,7 +557,7 @@ namespace Chroma /* Calculate F(mu,nu) */ multi1d f; mesField(f, u); - makeClov(f, diag_mass); + makeClov(f, diag_mass, param.stabilized_wilson); choles_done.resize(rb.numSubsets()); for(int i=0; i < rb.numSubsets(); i++) { @@ -658,6 +659,7 @@ namespace Chroma const U& f3, const U& f4, const U& f5, + const bool stabilized, X& tri_dia, Y& tri_off) { @@ -703,6 +705,7 @@ namespace Chroma const U& f3, const U& f4, const U& f5, + const bool stabilized, const X& tri_dia, const Y& tri_off) { @@ -754,12 +757,20 @@ namespace Chroma diag_mass_reg.setup_value( diag_mass_jit.elem() ); - for(int jj = 0; jj < 2; jj++) { - for(int ii = 0; ii < 2*Nc; ii++) { - tri_dia_j.elem(jj).elem(ii) = diag_mass_reg.elem().elem(); - //tri[site].diag[jj][ii] = diag_mass.elem().elem().elem(); + if(stabilized == true){ + for(int jj = 0; jj < 2; jj++) { + for(int ii = 0; ii < 2*Nc; ii++) { + zero_rep(tri_dia_j.elem(jj).elem(ii)); + } } } + else{ + for(int jj = 0; jj < 2; jj++) { + for(int ii = 0; ii < 2*Nc; ii++) { + tri_dia_j.elem(jj).elem(ii) = diag_mass_reg.elem().elem(); + } + } + } RComplexREG > E_minus; @@ -831,6 +842,22 @@ namespace Chroma } } + if(stabilized == true){ + exponentiate(tri_dia_j, tri_off_j, 0); + + // fix constants here + for(int jj = 0; jj < 2; jj++) { + for(int ii = 0; ii < 6; ii++) { + tri_dia_j.elem(jj).elem(ii) *= diag_mass_reg.elem().elem(); + } + } + for(int jj = 0; jj < 2; jj++) { + for(int ii = 0; ii < 15; ii++) { + tri_off_j.elem(jj).elem(ii) *= diag_mass_reg.elem().elem(); + } + } + } + // std::cout << __PRETTY_FUNCTION__ << ": leaving\n"; jit_get_function(function); @@ -841,7 +868,7 @@ namespace Chroma /* This now just sets up and dispatches... */ template - void JITCloverTermT::makeClov(const multi1d& f, const RealT& diag_mass) + void JITCloverTermT::makeClov(const multi1d& f, const RealT& diag_mass, const bool stabilized) { START_CODE(); @@ -862,16 +889,25 @@ namespace Chroma U f4 = f[4] * getCloverCoeff(1,3); U f5 = f[5] * getCloverCoeff(2,3); + if (stabilized == true){ + f0 /= diag_mass; + f1 /= diag_mass; + f2 /= diag_mass; + f3 /= diag_mass; + f4 /= diag_mass; + f5 /= diag_mass; + QDPIO::cout << "\n\nUsing expo clover" << std::endl; + } //QDPIO::cout << "PTX Clover make " << (void*)this << "\n"; //std::cout << "PTX Clover make " << (void*)this << "\n"; static JitFunction function; if (function.empty()) - function_make_clov_build(function, diag_mass, f0,f1,f2,f3,f4,f5, tri_dia , tri_off ); + function_make_clov_build(function, diag_mass, f0,f1,f2,f3,f4,f5, stabilized, tri_dia , tri_off ); // Execute the function - function_make_clov_exec(function, diag_mass, f0,f1,f2,f3,f4,f5,tri_dia, tri_off); + function_make_clov_exec(function, diag_mass, f0,f1,f2,f3,f4,f5,stabilized,tri_dia, tri_off); END_CODE(); }