From 4407225c60ed89a546a8bf15d1c597092b75e546 Mon Sep 17 00:00:00 2001 From: mitsuaki1987 Date: Fri, 12 Apr 2019 10:53:32 +0900 Subject: [PATCH] Implement CMA --- src/CMakeLists.txt | 2 +- src/StdFace/ChainLattice.c | 8 +- src/mltplyMPIBoost.c | 429 +++++++++++++++++++++++++++++++++++++ src/mltplyMPIBoost.h | 31 +++ src/mltplySpin.c | 52 +++++ 5 files changed, 517 insertions(+), 5 deletions(-) create mode 100644 src/mltplyMPIBoost.c create mode 100644 src/mltplyMPIBoost.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c74b7ac6c..61745c769 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,7 +12,7 @@ add_definitions(-DDSFMT_MEXP=19937) set(SOURCES FileIO.c HPhiMain.c HPhiTrans.c bitcalc.c check.c CheckMPI.c dSFMT.c diagonalcalc.c global.c log.c input.c output.c output_list.c readdef.c sz.c xsetmem.c ErrorMessage.c LogMessage.c ProgressMessage.c wrapperMPI.c splash.c time.c eigenIO.c) -set(SOURCES_MLTPLY Multiply.c mltply.c mltplySpin.c mltplyHubbard.c mltplyMPIHubbard.c mltplyMPISpin.c mltplyHubbardCore.c mltplySpinCore.c mltplyMPIHubbardCore.c mltplyMPISpinCore.c) +set(SOURCES_MLTPLY Multiply.c mltply.c mltplySpin.c mltplyHubbard.c mltplyMPIHubbard.c mltplyMPISpin.c mltplyHubbardCore.c mltplySpinCore.c mltplyMPIHubbardCore.c mltplyMPISpinCore.c mltplyMPIBoost.c) set(SOURCES_CG CalcByLOBPCG.c ) set(SOURCES_TPQ CalcByTPQ.c FirstMultiply.c) set(SOURCES_DIAG CalcByFullDiag.c lapack_diag.c matrixlapack.c matrixlapack_magma.c matrixscalapack.c ) diff --git a/src/StdFace/ChainLattice.c b/src/StdFace/ChainLattice.c index 0d2730258..63a3e13e8 100644 --- a/src/StdFace/ChainLattice.c +++ b/src/StdFace/ChainLattice.c @@ -92,7 +92,7 @@ void StdFace_Chain( StdFace_PrintVal_d("D", &StdI->D[2][2], 0.0); StdFace_InputSpinNN(StdI->J, StdI->JAll, StdI->J0, StdI->J0All, "J0"); StdFace_InputSpinNN(StdI->Jp, StdI->JpAll, StdI->J0p, StdI->J0pAll, "J0'"); - StdFace_InputSpinNN(StdI->Jpp, StdI->JppAll, StdI->J0pp, StdI->J0ppAll, "J0'"); + StdFace_InputSpinNN(StdI->Jpp, StdI->JppAll, StdI->J0pp, StdI->J0ppAll, "J0''"); /**/ StdFace_NotUsed_d("mu", StdI->mu); StdFace_NotUsed_d("U", StdI->U); @@ -261,11 +261,11 @@ void StdFace_Chain_Boost(struct StdIntList *StdI) 0.25 * StdI->J0[2][0], 0.25 * StdI->J0[2][1], 0.25 * StdI->J0[2][2]); fprintf(fp, "# J 2\n"); fprintf(fp, "%25.15e %25.15e %25.15e\n", - 0.25 * StdI->Jp[0][0], 0.25 * StdI->Jp[0][1], 0.25 * StdI->Jp[0][2]); + 0.25 * StdI->J0p[0][0], 0.25 * StdI->J0p[0][1], 0.25 * StdI->J0p[0][2]); fprintf(fp, "%25.15e %25.15e %25.15e\n", - 0.25 * StdI->Jp[1][0], 0.25 * StdI->Jp[1][1], 0.25 * StdI->Jp[1][2]); + 0.25 * StdI->J0p[1][0], 0.25 * StdI->J0p[1][1], 0.25 * StdI->J0p[1][2]); fprintf(fp, "%25.15e %25.15e %25.15e\n", - 0.25 * StdI->Jp[2][0], 0.25 * StdI->Jp[2][1], 0.25 * StdI->Jp[2][2]); + 0.25 * StdI->J0p[2][0], 0.25 * StdI->J0p[2][1], 0.25 * StdI->J0p[2][2]); /* Topology */ diff --git a/src/mltplyMPIBoost.c b/src/mltplyMPIBoost.c new file mode 100644 index 000000000..51c63ce67 --- /dev/null +++ b/src/mltplyMPIBoost.c @@ -0,0 +1,429 @@ +/* HPhi - Quantum Lattice Model Simulator */ +/* Copyright (C) 2015 The University of Tokyo */ + +/* This program is free software: you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation, either version 3 of the License, or */ +/* (at your option) any later version. */ + +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ + +/* You should have received a copy of the GNU General Public License */ +/* along with this program. If not, see . */ + +//Define Mode for mltply +// complex version + +#ifdef MPI +#include "mpi.h" +#endif +#include "Common.h" +#include "common/setmemory.h" +#include "wrapperMPI.h" + +void zgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double complex *ALPHA, double complex *matJL, int *LDA, double complex *arrayz, int *LDB, double complex *BETA, double complex *arrayx, int *LDC); + +/** + * + * Exchange term in Spin model + * + * @author Mitsuaki Kawamura (The University of Tokyo) + * @author Youhei Yamaji (The University of Tokyo) + */ +void child_general_int_spin_MPIBoost( + struct BindStruct *X /**< [inout]*/, + double complex *tmp_v0 /**< [out] Result v0 = H v1*/, + double complex *tmp_v1 /**< [in] v0 = H v1*/, + double complex *tmp_v2 /**< [inout] bufffer*/, + double complex *tmp_v3 /**< [inout] bufffer*/ + ) +{ +#ifdef MPI + + //double complex dam_pr = 0; + // MPI_Status statusMPI; + + // int ierr; + // int INFO; + char TRANSA, TRANSB; + int M, N, K, LDA, LDB, LDC; + double complex ALPHA, BETA; + long unsigned int i_max; + long unsigned int j, k, ell, iloop; + long unsigned int i1, i2; + long unsigned int iomp; + long unsigned int ell4, ell5, ell6, m0, Ipart1; + long unsigned int mi, mj, mri, mrj, mrk, mrl; + int indj; + long unsigned int ellrl, ellrk, ellrj, ellri, elli1, elli2, ellj1, ellj2; + long unsigned int iSS1, iSS2, iSSL1, iSSL2; + double complex **vecJ; + double complex **matJ, **matJ2; + double complex *matJL; + double complex *matI; + double complex **matB; + double complex *arrayz; + double complex *arrayx; + double complex *arrayw; + long unsigned int ishift1, ishift2, ishift3, ishift4, ishift5, pivot_flag, num_J_star; + long unsigned int pow4, pow5, pow41, pow51; + //long unsigned int pow1, pow2, pow3, pow4, pow5, pow11, pow21, pow31, pow41, pow51; + + i_max = X->Check.idim_max; + +/* +//zero clear + #pragma omp parallel for default(none) private(j) \ + shared(i_max,tmp_v0) + for(j=0;jBoost.W0, X->Boost.R0, X->Boost.num_pivot, X->Boost.ishift_nspin, X->Boost.list_6spin_star, X->Boost.list_6spin_pair, 1, X->Boost.arrayJ, X->Boost.vecB); + + for(iloop=0; iloop < X->Boost.R0; iloop++){ + + + for(j=iloop*X->Boost.num_pivot; j < (iloop+1)*X->Boost.num_pivot; j++){ + + num_J_star = (long unsigned int)X->Boost.list_6spin_star[j][0]; //(0,j) + ishift1 = (long unsigned int)X->Boost.list_6spin_star[j][1]; //(1,j) + ishift2 = (long unsigned int)X->Boost.list_6spin_star[j][2]; //(2,j) + ishift3 = (long unsigned int)X->Boost.list_6spin_star[j][3]; //(3,j) + ishift4 = (long unsigned int)X->Boost.list_6spin_star[j][4]; //(4,j) + ishift5 = (long unsigned int)X->Boost.list_6spin_star[j][5]; //(5,j) + pivot_flag = (long unsigned int)X->Boost.list_6spin_star[j][6]; //(6,j) + //pow1 = (int)pow(2.0,ishift1); + //pow2 = (int)pow(2.0,ishift1+ishift2); + //pow3 = (int)pow(2.0,ishift1+ishift2+ishift3); + pow4 = (int)pow(2.0,ishift1+ishift2+ishift3+ishift4); + pow5 = (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5); + //pow11= (int)pow(2.0,ishift1+1); + //pow21= (int)pow(2.0,ishift1+ishift2+1); + //pow31= (int)pow(2.0,ishift1+ishift2+ishift3+1); + pow41= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+1); + pow51= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+1); + + for(k=0; k < (64*64); k++){ + matJL[k] = 0.0 + 0.0*I; + matI[k] = 0.0 + 0.0*I; + } + for(k=0; k < 64; k++){ + matI[k+64*k] = 1.0; + } + + for(ell=0; ell < num_J_star; ell++){ + mi = (long unsigned int)X->Boost.list_6spin_pair[j][0][ell]; //(1,ell,j) + mj = (long unsigned int)X->Boost.list_6spin_pair[j][1][ell]; //(2,ell,j) + mri = (long unsigned int)X->Boost.list_6spin_pair[j][2][ell]; //(3,ell,j) + mrj = (long unsigned int)X->Boost.list_6spin_pair[j][3][ell]; //(4,ell,j) + mrk = (long unsigned int)X->Boost.list_6spin_pair[j][4][ell]; //(5,ell,j) + mrl = (long unsigned int)X->Boost.list_6spin_pair[j][5][ell]; //(6,ell,j) + indj = X->Boost.list_6spin_pair[j][6][ell]; //(7,ell,j) + for(i1 = 0; i1 < 3; i1++){ + for(i2 = 0; i2 < 3; i2++){ + vecJ[i1][i2] = X->Boost.arrayJ[(indj-1)][i1][i2]; + } + } + //matJSS(1,1) = vecJ(3,3) + matJ[0][0] = vecJ[2][2]; + //matJSS(1,2)= vecJ(1,1)-vecJ(2,2)-dcmplx(0.0d0,1.0d0)*vecJ(1,2)-dcmplx(0.0d0,1.0d0)*vecJ(2,1) + matJ[0][1] = vecJ[0][0]-vecJ[1][1]-I*vecJ[0][1]-I*vecJ[1][0]; + //matJSS(1,3)= vecJ(3,1)-dcmplx(0.0d0,1.0d0)*vecJ(3,2) + matJ[0][2] = vecJ[2][0]-I*vecJ[2][1]; + //matJSS(1,4)= vecJ(1,3)-dcmplx(0.0d0,1.0d0)*vecJ(2,3) + matJ[0][3] = vecJ[0][2]-I*vecJ[1][2]; + //matJSS(2,1)= vecJ(1,1)-vecJ(2,2)+dcmplx(0.0d0,1.0d0)*vecJ(1,2)+dcmplx(0.0d0,1.0d0)*vecJ(2,1) + matJ[1][0] = vecJ[0][0]-vecJ[1][1]+I*vecJ[0][1]+I*vecJ[1][0]; + //matJSS(2,2)= vecJ(3,3) + matJ[1][1] = vecJ[2][2]; + //matJSS(2,3)=dcmplx(-1.0d0,0.0d0)*vecJ(1,3)-dcmplx(0.0d0,1.0d0)*vecJ(2,3) + matJ[1][2] =(-1.0)*vecJ[0][2]-I*vecJ[1][2]; + //matJSS(2,4)=dcmplx(-1.0d0,0.0d0)*vecJ(3,1)-dcmplx(0.0d0,1.0d0)*vecJ(3,2) + matJ[1][3] =(-1.0)*vecJ[2][0]-I*vecJ[2][1]; + //matJSS(3,1)= vecJ(3,1)+dcmplx(0.0d0,1.0d0)*vecJ(3,2) + matJ[2][0] = vecJ[2][0]+I*vecJ[2][1]; + //matJSS(3,2)=dcmplx(-1.0d0,0.0d0)*vecJ(1,3)+dcmplx(0.0d0,1.0d0)*vecJ(2,3) + matJ[2][1] =(-1.0)*vecJ[0][2]+I*vecJ[1][2]; + //matJSS(3,3)=dcmplx(-1.0d0,0.0d0)*vecJ(3,3) + matJ[2][2] =(-1.0)*vecJ[2][2]; + //matJSS(3,4)= vecJ(1,1)+vecJ(2,2)+dcmplx(0.0d0,1.0d0)*vecJ(1,2)-dcmplx(0.0d0,1.0d0)*vecJ(2,1) + matJ[2][3] = vecJ[0][0]+vecJ[1][1]+I*vecJ[0][1]-I*vecJ[1][0]; + //matJSS(4,1)= vecJ(1,3)+dcmplx(0.0d0,1.0d0)*vecJ(2,3) + matJ[3][0] = vecJ[0][2]+I*vecJ[1][2]; + //matJSS(4,2)=dcmplx(-1.0d0,0.0d0)*vecJ(3,1)+dcmplx(0.0d0,1.0d0)*vecJ(3,2) + matJ[3][1] =(-1.0)*vecJ[2][0]+I*vecJ[2][1]; + //matJSS(4,3)= vecJ(1,1)+vecJ(2,2)-dcmplx(0.0d0,1.0d0)*vecJ(1,2)+dcmplx(0.0d0,1.0d0)*vecJ(2,1) + matJ[3][2] = vecJ[0][0]+vecJ[1][1]-I*vecJ[0][1]+I*vecJ[1][0]; + //matJSS(4,4)=dcmplx(-1.0d0,0.0d0)*vecJ(3,3) + matJ[3][3] =(-1.0)*vecJ[2][2]; + + matJ2[3][3] = matJ[0][0]; + matJ2[3][0] = matJ[0][1]; + matJ2[3][1] = matJ[0][2]; + matJ2[3][2] = matJ[0][3]; + matJ2[0][3] = matJ[1][0]; + matJ2[0][0] = matJ[1][1]; + matJ2[0][1] = matJ[1][2]; + matJ2[0][2] = matJ[1][3]; + matJ2[1][3] = matJ[2][0]; + matJ2[1][0] = matJ[2][1]; + matJ2[1][1] = matJ[2][2]; + matJ2[1][2] = matJ[2][3]; + matJ2[2][3] = matJ[3][0]; + matJ2[2][0] = matJ[3][1]; + matJ2[2][1] = matJ[3][2]; + matJ2[2][2] = matJ[3][3]; + + for(ellri=0; ellri<2; ellri++){ + for(ellrj=0; ellrj<2; ellrj++){ + for(ellrk=0; ellrk<2; ellrk++){ + for(ellrl=0; ellrl<2; ellrl++){ + for(elli1=0; elli1<2; elli1++){ + for(ellj1=0; ellj1<2; ellj1++){ + for(elli2=0; elli2<2; elli2++){ + for(ellj2=0; ellj2<2; ellj2++){ + + iSSL1 = elli1*(int)pow(2,mi) + ellj1*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl); + iSSL2 = elli2*(int)pow(2,mi) + ellj2*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl); + iSS1 = elli1 + 2*ellj1; + iSS2 = elli2 + 2*ellj2; + matJL[iSSL1+64*iSSL2] += matJ2[iSS1][iSS2]; + } + } + } + } + } + } + } + } + + + }/* loop for ell */ + + /* external magnetic field B */ + if(pivot_flag==1){ + matB[0][0] = + X->Boost.vecB[2]; // -BM + matB[1][1] = - X->Boost.vecB[2]; // -BM + //matB[0][1] = - X->Boost.vecB[0] + I*X->Boost.vecB[1]; // -BM + //matB[1][0] = - X->Boost.vecB[0] - I*X->Boost.vecB[1]; // -BM + matB[0][1] = - X->Boost.vecB[0] - I*X->Boost.vecB[1]; // -BM + matB[1][0] = - X->Boost.vecB[0] + I*X->Boost.vecB[1]; // -BM + for(ellri=0; ellri<2; ellri++){ + for(ellrj=0; ellrj<2; ellrj++){ + for(ellrk=0; ellrk<2; ellrk++){ + for(ellrl=0; ellrl<2; ellrl++){ + for(ellj1=0; ellj1<2; ellj1++){ + for(elli1=0; elli1<2; elli1++){ + for(elli2=0; elli2<2; elli2++){ + for(ellj2=0; ellj2Boost.ishift_nspin; ellj2++){ + iSSL1 = elli1*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6)); + iSSL2 = elli2*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6)); + matJL[iSSL1+64*iSSL2] += matB[elli1][elli2]; + } + } + } + } + } + } + } + } + } + /* external magnetic field B */ + + iomp=i_max/(int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+2); + + #pragma omp parallel default(none) private(arrayx,arrayz,arrayw,ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \ + shared(matJL,matI,iomp,i_max,myrank,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,tmp_v0,tmp_v1,tmp_v3) + { + + arrayx = cd_1d_allocate(64*((int)pow(2.0,ishift4+ishift5-1))); + arrayz = cd_1d_allocate(64*((int)pow(2.0,ishift4+ishift5-1))); + arrayw = cd_1d_allocate(64*((int)pow(2.0,ishift4+ishift5-1))); + +#pragma omp for + for(ell6 = 0; ell6 < iomp; ell6++){ + Ipart1=pow51*2*ell6; + for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){ + for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){ + for(m0 = 0; m0 < 16; m0++){ + arrayz[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)]; + arrayz[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)]; + arrayz[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)]; + arrayz[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)]; + tmp_v3[(1 + m0+16*ell4 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)]; + tmp_v3[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)]; + tmp_v3[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)]; + tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)]; + arrayx[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4 +pow41*ell5+Ipart1)]; + arrayx[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)]; + arrayx[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)]; + arrayx[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)]; + } + } + } + + + for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){ + for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){ + for(m0 = 0; m0 < 16; m0++){ + arrayz[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)]; + arrayz[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)]; + arrayz[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)]; + arrayz[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)]; + tmp_v3[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)]; + tmp_v3[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)]; + tmp_v3[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)]; + tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)]; + arrayx[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)]; + arrayx[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)]; + arrayx[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)]; + arrayx[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)]; + } + + } + } + + TRANSA = 'N'; + TRANSB = 'N'; + M = 64; + N = (int)pow(2.0, ishift4+ishift5-1); + K = 64; + ALPHA = 1.0; + LDA = 64; + LDB = 64; + BETA = 1.0; + LDC = 64; + + zgemm_(&TRANSA,&TRANSB,&M,&N,&K,&ALPHA,matJL,&LDA,arrayz,&LDB,&BETA,arrayx,&LDC); + //zgemm_(&TRANSA,&TRANSB,&M,&N,&K,&ALPHA,matI,&LDA,arrayz,&LDB,&BETA,arrayx,&LDC); +/* + for(ell5=0;ell5<(64*N);ell5++){ + arrayw[ell5]=0.0; + } + for(ell5=0;ell5<64;ell5++){ + for(ell4=0;ell4<64;ell4++){ + for(m0=0;m0Boost.ishift_nspin); + #pragma omp parallel for default(none) private(ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \ + firstprivate(iomp) shared(i_max,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,X,tmp_v0,tmp_v1) + for(ell5 = 0; ell5 < iomp; ell5++ ){ + for(ell4 = 0; ell4 < (int)pow(2.0,X->Boost.ishift_nspin); ell4++){ + tmp_v0[(1 + ell5+(i_max/(int)pow(2.0,X->Boost.ishift_nspin))*ell4)] = tmp_v1[(1 + ell4+((int)pow(2.0,X->Boost.ishift_nspin))*ell5)]; + } + } + iomp=i_max/(int)pow(2.0,X->Boost.ishift_nspin); + #pragma omp parallel for default(none) private(ell4,ell5) \ + firstprivate(iomp) shared(i_max,X,tmp_v1,tmp_v3) + for(ell5 = 0; ell5 < iomp; ell5++ ){ + for(ell4 = 0; ell4 < (int)pow(2.0,X->Boost.ishift_nspin); ell4++){ + tmp_v1[(1 + ell5+(i_max/(int)pow(2.0,X->Boost.ishift_nspin))*ell4)] = tmp_v3[(1 + ell4+((int)pow(2.0,X->Boost.ishift_nspin))*ell5)]; + } + } + } + else{ + #pragma omp parallel for default(none) private(ell4) \ + shared(i_max,tmp_v0,tmp_v1,tmp_v3) + for(ell4 = 0; ell4 < i_max; ell4++ ){ + tmp_v0[1 + ell4] = tmp_v1[1 + ell4]; + tmp_v1[1 + ell4] = tmp_v3[1 + ell4]; + } + }/* if pivot_flag */ + + }/* loop for j */ + + /* + ierr = MPI_Alltoall(&tmp_v1[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,&tmp_v3[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD); + ierr = MPI_Alltoall(&tmp_v0[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,&tmp_v2[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD); + */ + MPI_Alltoall(&tmp_v1[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,&tmp_v3[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD); + MPI_Alltoall(&tmp_v0[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,&tmp_v2[1],(int)(i_max/nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD); + + + iomp=(int)pow(2.0,X->Boost.W0)/nproc; + #pragma omp parallel for default(none) private(ell4,ell5,ell6) \ + firstprivate(iomp) shared(i_max,X,nproc,tmp_v0,tmp_v1,tmp_v2,tmp_v3) + //for(ell4 = 0; ell4 < (int)pow(2.0,X->Boost.W0)/nproc; ell4++ ){ + for(ell4 = 0; ell4 < iomp; ell4++ ){ + for(ell5 = 0; ell5 < nproc; ell5++ ){ + for(ell6 = 0; ell6 < (int)(i_max/(int)pow(2.0,X->Boost.W0)); ell6++ ){ + tmp_v1[(1 + ell6+ell5*i_max/(int)pow(2.0,X->Boost.W0)+ell4*i_max/((int)pow(2.0,X->Boost.W0)/nproc))] = tmp_v3[(1 + ell6+ell4*i_max/(int)pow(2.0,X->Boost.W0)+ell5*i_max/nproc)]; + tmp_v0[(1 + ell6+ell5*i_max/(int)pow(2.0,X->Boost.W0)+ell4*i_max/((int)pow(2.0,X->Boost.W0)/nproc))] = tmp_v2[(1 + ell6+ell4*i_max/(int)pow(2.0,X->Boost.W0)+ell5*i_max/nproc)]; + } + } + } + + + }/* loop for iloop */ + +/* + dam_pr= X_child_general_int_spin_MPIBoost + ( + matJ, X, tmp_v0, tmp_v1); + + X->Large.prdct += dam_pr; +*/ +// c_free1(arrayz, (int)pow(2.0, 16)); +// c_free1(arrayx, (int)pow(2.0, 16)); +// c_free1(arrayw, (int)pow(2.0, 16)); + + free_cd_2d_allocate(vecJ); + free_cd_2d_allocate(matJ); + free_cd_2d_allocate(matJ2); + free_cd_2d_allocate(matB); + free_cd_1d_allocate(matJL); + free_cd_1d_allocate(matI); +#endif + +}/*void child_general_int_spin_MPIBoost*/ + diff --git a/src/mltplyMPIBoost.h b/src/mltplyMPIBoost.h new file mode 100644 index 000000000..82403308c --- /dev/null +++ b/src/mltplyMPIBoost.h @@ -0,0 +1,31 @@ +/* HPhi - Quantum Lattice Model Simulator */ +/* Copyright (C) 2015 The University of Tokyo */ + +/* This program is free software: you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation, either version 3 of the License, or */ +/* (at your option) any later version. */ + +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ + +/* You should have received a copy of the GNU General Public License */ +/* along with this program. If not, see . */ + +//Define Mode for mltply +// complex version + +#pragma once +#include +#include "struct.h" + +void child_general_int_spin_MPIBoost +( + struct BindStruct *X, + double complex *tmp_v0, + double complex *tmp_v1, + double complex *tmp_v2, + double complex *tmp_v3 +); diff --git a/src/mltplySpin.c b/src/mltplySpin.c index 1a28b84ba..10d520111 100644 --- a/src/mltplySpin.c +++ b/src/mltplySpin.c @@ -165,6 +165,7 @@ General on-site term #include "mltplyHubbardCore.h" #include "mltplyMPISpin.h" #include "mltplyMPISpinCore.h" +#include "mltplyMPIBoost.h" /** @brief Driver function for Spin hamiltonian @return error code @@ -374,6 +375,54 @@ shared(tmp_v0,tmp_v1,list_1,list_2_1,list_2_2,one,nstate) return 0; }/*int mltplyGeneralSpin*/ /** +@brief Driver function for Spin hamiltonian (Boost) +@return error code +@author Kazuyoshi Yoshimi (The University of Tokyo) +*/ +int mltplySpinGCBoost( + struct BindStruct *X,//!<[inout] + int nstate, + double complex **tmp_v0,//!<[inout] Result vector + double complex **tmp_v1//!<[in] Input producted vector +) +{ + int istate; + /* SpinGCBoost */ + double complex* tmp_v2, *tmp_tmp_v0, *tmp_tmp_v1; + double complex* tmp_v3; + /* SpinGCBoost */ + + long unsigned int i_max, idim; + i_max = X->Check.idim_max; + + StartTimer(500); + tmp_tmp_v0 = cd_1d_allocate(i_max + 1); + tmp_tmp_v1 = cd_1d_allocate(i_max + 1); + tmp_v2 = cd_1d_allocate(i_max + 1); + tmp_v3 = cd_1d_allocate(i_max + 1); + + for (istate = 0; istate < nstate; istate++) { + for (idim = 1; idim <= i_max; idim++) { + tmp_tmp_v0[idim] = tmp_v0[idim][istate]; + tmp_tmp_v1[idim] = tmp_v1[idim][istate]; + } + child_general_int_spin_MPIBoost(X, tmp_tmp_v0, tmp_tmp_v1, tmp_v2, tmp_v3); + for (idim = 1; idim <= i_max; idim++) { + tmp_v0[idim][istate] = tmp_tmp_v0[idim]; + tmp_v1[idim][istate] = tmp_tmp_v1[idim]; + } + } + + /* SpinGCBoost */ + free_cd_1d_allocate(tmp_tmp_v0); + free_cd_1d_allocate(tmp_tmp_v1); + free_cd_1d_allocate(tmp_v2); + free_cd_1d_allocate(tmp_v3); + /* SpinGCBoost */ + StopTimer(500); + return 0; +}/*int mltplySpinGCBoost*/ +/** @brief Driver function for Spin hamiltonian @return error code @author Kazuyoshi Yoshimi (The University of Tokyo) @@ -391,6 +440,9 @@ int mltplySpinGC( if(iret != 0) return iret; + if (X->Boost.flgBoost == 1) + iret = mltplySpinGCBoost(X, nstate, tmp_v0, tmp_v1); + return iret; }/*int mltplySpinGC*/ /**