diff --git a/Make_template_combos b/Make_template_combos index ed9aa3181..aefa3b1fd 100644 --- a/Make_template_combos +++ b/Make_template_combos @@ -172,7 +172,7 @@ endif FLINKS_HISQ_MILC_GPU = fermion_links_hisq_milc.o \ fermion_links_fn_load_gpu.o \ - fermion_links_hisq_load_milc.o fermion_links_hisq_load_gpu.o \ + fermion_links_hisq_load_milc.o \ ${FLINKS} ks_action_paths_hisq.o su3_mat_op.o stout_smear.o # Standard QOP combinations @@ -206,7 +206,12 @@ endif # Standard MILC # Choices here are dslash_fn.o dslash_fn2.o dslash_fn_dblstore.o +ifeq ($(strip ${WANTQUDA}),true) +# When using QUDA, the back links are not used and just add unnecessary overhead + DSLASH_FN_MILC = dslash_fn.o +else DSLASH_FN_MILC = dslash_fn_dblstore.o +endif # No other choice DSLASH_EO = dslash_eo.o diff --git a/Makefile b/Makefile index c642023e9..8fcf49704 100644 --- a/Makefile +++ b/Makefile @@ -683,7 +683,15 @@ CGEOM +=# -DFIX_IONODE_GEOM # For now, works only with dslash_fn_dblstore.o # FEWSUMS Fewer CG reduction calls -KSCGSTORE = -DDBLSTORE_FN -DFEWSUMS -DD_FN_GATHER13 +# If we are using QUDA, the backward links are unused, so we should +# avoid unnecessary overhead and use the standard dslash. 
Note that +# dslash_fn also has hooks in place to offload any dslash_fn_field +# calls to QUDA +ifeq ($(strip ${WANTQUDA}),true) + KSCGSTORE = -DFEWSUMS +else + KSCGSTORE = -DDBLSTORE_FN -DFEWSUMS -DD_FN_GATHER13 +endif #------------------------------ # Staggered fermion force routines diff --git a/generic/gauge_force_imp_gpu.c b/generic/gauge_force_imp_gpu.c index 3128fa0cd..5b1ca0c62 100644 --- a/generic/gauge_force_imp_gpu.c +++ b/generic/gauge_force_imp_gpu.c @@ -9,16 +9,17 @@ /**#define GFTIME**/ /* For timing gauge force calculation */ #include "generic_includes.h" /* definitions files and prototypes */ - -#include -#include -#include "../include/openmp_defs.h" - #include "../include/generic_quda.h" // gpu code void imp_gauge_force_gpu(Real eps, field_offset mom_off) { + +#ifdef GFTIME + int nflop = 153004; /* For Symanzik1 action */ + double dtime = -dclock(); +#endif + Real **loop_coeff = get_loop_coeff(); //int max_length = get_max_length(); //int nreps = get_nreps(); @@ -26,42 +27,29 @@ void imp_gauge_force_gpu(Real eps, field_offset mom_off) const int num_loop_types = get_nloop(); double *quda_loop_coeff = (double*)malloc(num_loop_types * sizeof(double)); int i; -#ifdef GFTIME - int nflop = 153004; /* For Symanzik1 action */ - double dtime = -dclock(); -#endif - + site *st; const Real eb3 = eps*beta/3.0; initialize_quda(); - su3_matrix *links = qudaAllocatePinned(sites_on_node*4*sizeof(su3_matrix)); - anti_hermitmat* momentum = qudaAllocatePinned(sites_on_node*4*sizeof(anti_hermitmat)); - - int dir,j; - site *st; + su3_matrix *links = create_G_from_site_quda(); + anti_hermitmat* momentum = create_M_quda(); for(i=0; ilink[dir]; - } // dir - } END_LOOP_OMP - qudaGaugeForce(PRECISION,num_loop_types,quda_loop_coeff,eb3,links,momentum); - FORALLSITES_OMP(i,st,private(dir,j)){ - for(dir=XUP; dir<=TUP; ++dir){ - for(j=0; j<10; ++j){ + FORALLSITES_OMP(i,st,){ + for(int dir=XUP; dir<=TUP; ++dir){ + for(int j=0; j<10; ++j){ ((Real*)&(st->mom[dir]))[j] += 
((Real*)(momentum + 4*i+dir))[j]; } } } END_LOOP_OMP - + free(quda_loop_coeff); - qudaFreePinned(links); - qudaFreePinned(momentum); + destroy_G_quda(links); + destroy_M_quda(momentum); #ifdef GFTIME dtime+=dclock(); diff --git a/generic/reunitarize2.c b/generic/reunitarize2.c index 0f4b2ea6a..d6fb68f53 100644 --- a/generic/reunitarize2.c +++ b/generic/reunitarize2.c @@ -13,6 +13,10 @@ #include "generic_includes.h" #include "../include/openmp_defs.h" +#ifdef USE_GF_GPU +#include "../include/generic_quda.h" +#endif + #define TOLERANCE (0.0001) #define MAXERRCOUNT 100 /**#define UNIDEBUG**/ @@ -201,7 +205,35 @@ int reunit_su3(su3_matrix *c) } /* reunit_su3 */ -void reunitarize() { +#ifdef USE_GF_GPU + +void reunitarize_gpu() { + + initialize_quda(); + +#ifdef GFTIME + double dtime, dclock(); + dtime = -dclock(); +#endif + + su3_matrix *links = create_G_from_site_quda(); + + qudaUnitarizeSU3(PRECISION, links, TOLERANCE); + + copy_to_site_from_G_quda(links); // insert back into site + + destroy_G_quda(links); + +#ifdef GFTIME + dtime += dclock(); + node0_printf("REUNITARIZE: time = %e\n", dtime); +#endif + +} /* reunitarize_gpu */ + +#endif + +void reunitarize_cpu() { register su3_matrix *mat; register int i,dir; register site *s; @@ -210,7 +242,7 @@ void reunitarize() { max_deviation = 0.; av_deviation = 0.; - + FORALLSITES_OMP(i,s,private(dir,mat,errors) reduction(+:errcount) ){ #ifdef SCHROED_FUN for(dir=XUP; dir<=TUP; dir++ ) if(dir==TUP || s->t>0 ){ @@ -248,3 +280,21 @@ void reunitarize() { } /* reunitarize2 */ +void reunitarize() { + +#ifdef USE_GF_GPU + + /* Use QUDA if gauge-force is enabled for GPU, but fallback to CPU + if Schroedinger functional boundary conditions are enabled */ +#ifdef SCHROED_FUN + node0_printf("%s not supported on GPU, using CPU fallback\n", __func__); + reunitarize_cpu(); +#else + reunitarize_gpu(); +#endif + +#else + reunitarize_cpu(); +#endif + +} diff --git a/generic_ks/Make_template b/generic_ks/Make_template index 4806f1f56..7cba5a2c4 
100644 --- a/generic_ks/Make_template +++ b/generic_ks/Make_template @@ -74,7 +74,6 @@ G_KS_ALL = \ fermion_links_fn_load_milc.o \ fermion_links_fn_utilities_gpu.o \ fermion_links_hisq_milc.o \ - fermion_links_hisq_load_gpu.o \ fermion_links_hisq_load_milc.o \ fermion_links_hisq_qop.o \ fermion_links_hyp.o \ @@ -324,8 +323,6 @@ fermion_links_fn_load_milc.o: ../generic_ks/fermion_links_fn_load_milc.c ${CC} -c ${CFLAGS} $< fermion_links_hisq_milc.o: ../generic_ks/fermion_links_hisq_milc.c ${CC} -c ${CFLAGS} $< -fermion_links_hisq_load_gpu.o: ../generic_ks/fermion_links_hisq_load_gpu.c - ${CC} -c ${CFLAGS} $< fermion_links_hisq_load_milc.o: ../generic_ks/fermion_links_hisq_load_milc.c ${CC} -c ${CFLAGS} $< fermion_links_hisq_qdp.o: ../generic_ks/fermion_links_hisq_qdp.c diff --git a/generic_ks/d_congrad5_fn_gpu.c b/generic_ks/d_congrad5_fn_gpu.c index 5154147a8..4c9666dbb 100644 --- a/generic_ks/d_congrad5_fn_gpu.c +++ b/generic_ks/d_congrad5_fn_gpu.c @@ -112,12 +112,11 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest, int num_iters; // for newer versions of QUDA we need to invalidate the gauge field if the links are new - static imp_ferm_links_t *fn_last = NULL; - if ( fn != fn_last || fresh_fn_links(fn) ){ + if ( fn != get_fn_last() || fresh_fn_links(fn) ){ cancel_quda_notification(fn); - fn_last = fn; + set_fn_last(fn); num_iters = -1; - node0_printf("%s: fn, notify: Signal QUDA to refresh links", myname); + node0_printf("%s: fn, notify: Signal QUDA to refresh links\n", myname); } qudaInvert(PRECISION, diff --git a/generic_ks/dslash_fn.c b/generic_ks/dslash_fn.c index 180ef0cfd..f231d5bad 100644 --- a/generic_ks/dslash_fn.c +++ b/generic_ks/dslash_fn.c @@ -299,14 +299,57 @@ void dslash_fn_site_special( field_offset src, field_offset dest, } +#ifdef USE_CG_GPU +#include "../include/generic_quda.h" + +// if using QUDA then we offload the dslash to the GPU void dslash_fn_field( su3_vector *src, su3_vector *dest, int parity, fn_links_t *fn) { + + 
su3_matrix* fatlink = get_fatlinks(fn); + su3_matrix* longlink = get_lnglinks(fn); + + // for newer versions of QUDA we need to invalidate the gauge field if the links are new + int num_iters = 0; // must hold a defined value: &num_iters is passed to qudaDslash even when no refresh is signalled + if (fn != get_fn_last() || fresh_fn_links(fn)){ + cancel_quda_notification(fn); + set_fn_last(fn); + num_iters = -1; + node0_printf("%s: fn, notify: Signal QUDA to refresh links\n", __func__); + } + + QudaInvertArgs_t inv_args; + if (parity != EVENANDODD) { + switch(parity) { + case EVEN: inv_args.evenodd = QUDA_EVEN_PARITY; break; + case ODD: inv_args.evenodd = QUDA_ODD_PARITY; break; + default: printf("%s: Unrecognised parity\n",__func__); terminate(2); + } + + qudaDslash(PRECISION, PRECISION, inv_args, fatlink, longlink, u0, src, dest, &num_iters); + } else { // do both parities as separate calls + inv_args.evenodd = QUDA_EVEN_PARITY; + qudaDslash(PRECISION, PRECISION, inv_args, fatlink, longlink, u0, src, dest, &num_iters); + inv_args.evenodd = QUDA_ODD_PARITY; + qudaDslash(PRECISION, PRECISION, inv_args, fatlink, longlink, u0, src, dest, &num_iters); + } + +} + +#else + +void dslash_fn_field( su3_vector *src, su3_vector *dest, int parity, + fn_links_t *fn) { + msg_tag *tag[16]; - - dslash_fn_field_special(src, dest, parity, tag, 1, fn); - cleanup_one_gather_set(tag); + + dslash_fn_field_special(src, dest, parity, tag, 1, fn); + cleanup_one_gather_set(tag); + } +#endif + /* Special dslash for use by congrad. Uses restart_gather_field() when possible. Next to last argument is an array of message tags, to be set if this is the first use, otherwise reused. 
If start=1,use @@ -532,7 +575,7 @@ dslash_fn_dir(su3_vector *src, su3_vector *dest, int parity, { register int i ; site *s; - msg_tag *tag[2]; + msg_tag *tag[2] = {NULL, NULL}; su3_matrix *fat = get_fatlinks(fn); su3_matrix *lng = get_lnglinks(fn); su3_vector tmp; diff --git a/generic_ks/fermion_force_asqtad_gpu.c b/generic_ks/fermion_force_asqtad_gpu.c index 8fbb92d7e..ce7a0c3ff 100644 --- a/generic_ks/fermion_force_asqtad_gpu.c +++ b/generic_ks/fermion_force_asqtad_gpu.c @@ -31,11 +31,8 @@ fermion_force_oprod_site(Real eps, Real weight1, Real weight2, msg_tag* mtag[2]; { // copy the quark-field information to su3_vector fields - v[0] = (su3_vector*)malloc(sites_on_node*sizeof(su3_vector)); - v[1] = (su3_vector*)malloc(sites_on_node*sizeof(su3_vector)); - - if(v[0] == NULL) printf("fermion_force_oprod_site: v[0] not allocated\n"); - if(v[1] == NULL) printf("fermion_force_oprod_site: v[1] not allocated\n"); + v[0] = (su3_vector*)qudaAllocatePinned(sites_on_node*sizeof(su3_vector)); + v[1] = (su3_vector*)qudaAllocatePinned(sites_on_node*sizeof(su3_vector)); FORALLSITES(i,s){ v[0][i] = *(su3_vector*)F_PT(s,x1_off); @@ -67,8 +64,8 @@ fermion_force_oprod_site(Real eps, Real weight1, Real weight2, free(combined_coeff); // Cleanup - free(v[0]); - free(v[1]); + qudaFreePinned(v[0]); + qudaFreePinned(v[1]); } void diff --git a/generic_ks/fermion_links_fn_load_gpu.c b/generic_ks/fermion_links_fn_load_gpu.c index b45f549a6..b98665c43 100644 --- a/generic_ks/fermion_links_fn_load_gpu.c +++ b/generic_ks/fermion_links_fn_load_gpu.c @@ -7,13 +7,12 @@ /* Entry points load_fatlinks_gpu - + load_fatlonglinks_gpu + load_hisq_aux_links_gpu */ #include "generic_ks_includes.h" #include "../include/info.h" - -#include #include "../include/generic_quda.h" void @@ -31,16 +30,24 @@ load_fatlinks_gpu(info_t *info, su3_matrix *fat, ks_component_paths *p, su3_matr QudaFatLinkArgs_t fatlink_args; fatlink_args.su3_source = 0; // Cannot guarantee that the incoming field is an SU(3) gauge-field 
// Need a workaround for this - fatlink_args.use_pinned_memory = 0; initialize_quda(); qudaLoadKSLink(PRECISION, fatlink_args, path_coeff, links, fat, NULL); - return; + + /* Fatlinks */ + info->final_flop = 61632.*volume/numnodes(); + if( p->act_path_coeff.three_staple == 0.0 && + p->act_path_coeff.lepage == 0.0 && + p->act_path_coeff.five_staple == 0.0) + info->final_flop = 72.*volume/numnodes(); + /* Longlinks */ + info->final_flop += 1728.*volume/numnodes(); } void -load_fatlonglinks_gpu(info_t *info, su3_matrix *fatlinks, su3_matrix *longlinks, ks_component_paths *p, su3_matrix *links) +load_fatlonglinks_gpu(info_t *info, su3_matrix *fatlinks, su3_matrix *longlinks, + ks_component_paths *p, su3_matrix *links) { double path_coeff[6]; path_coeff[0] = p->act_path_coeff.one_link; @@ -53,13 +60,66 @@ load_fatlonglinks_gpu(info_t *info, su3_matrix *fatlinks, su3_matrix *longlinks, QudaFatLinkArgs_t fatlink_args; fatlink_args.su3_source = 0; // Cannot guarantee that the incoming field is an SU(3) gauge-field // Need a workaround for this - fatlink_args.use_pinned_memory = 0; initialize_quda(); // qudaLoadUnitarizedLink(PRECISION, fatlink_args, path_coeff, links, fatlinks, longlinks, NULL); qudaLoadKSLink(PRECISION, fatlink_args, path_coeff, links, fatlinks, longlinks); + /* Fatlinks */ + info->final_flop = 61632.*volume/numnodes(); + if( p->act_path_coeff.three_staple == 0.0 && + p->act_path_coeff.lepage == 0.0 && + p->act_path_coeff.five_staple == 0.0) + info->final_flop = 72.*volume/numnodes(); + /* Longlinks */ + info->final_flop += 1728.*volume/numnodes(); +} + +void +load_hisq_aux_links_gpu(info_t *info, ks_action_paths_hisq *ap, + hisq_auxiliary_t *aux, su3_matrix *links) +{ + char myname[] = "load_hisq_aux_links_gpu"; + + if(ap == NULL){ + printf("%s(%d): KS action paths not initialized\n", myname, this_node); + } + + // load U links (is this really necessary since we have extracted "links" already?) 
+ memcpy(aux->U_link, links, 4*sizeof(su3_matrix)*sites_on_node); + + double path_coeff[6]; + path_coeff[0] = ap->p1.act_path_coeff.one_link; + path_coeff[1] = ap->p1.act_path_coeff.naik; + path_coeff[2] = ap->p1.act_path_coeff.three_staple; + path_coeff[3] = ap->p1.act_path_coeff.five_staple; + path_coeff[4] = ap->p1.act_path_coeff.seven_staple; + path_coeff[5] = ap->p1.act_path_coeff.lepage; + + QudaFatLinkArgs_t fatlink_args; + fatlink_args.su3_source = 1; // Is the incoming field an SU(3) gauge field? + // If so, run SU(3) optimized QUDA code. + + initialize_quda(); + + // Right now, if aux->V_link == NULL + // the level1 fat link is not copied from the GPU back to the CPU. + qudaLoadUnitarizedLink(PRECISION, fatlink_args, path_coeff, aux->U_link, aux->V_link, aux->W_unitlink); + + /* + The above equates to + - load_V_from_U: 61632 flops + - load_Y_from_V: (as CPU code: presently not counted) + - load_W_from_Y: (as CPU code: presently not counted) + */ + info->final_flop = 61632.*volume/numnodes(); + if( ap->p1.act_path_coeff.three_staple == 0.0 && + ap->p1.act_path_coeff.lepage == 0.0 && + ap->p1.act_path_coeff.five_staple == 0.0) + info->final_flop = 72.*volume/numnodes(); + + return; } /* fermion_links_fn_load_gpu.c */ diff --git a/generic_ks/fermion_links_fn_load_milc.c b/generic_ks/fermion_links_fn_load_milc.c index 2603153ce..c1c5b5c27 100644 --- a/generic_ks/fermion_links_fn_load_milc.c +++ b/generic_ks/fermion_links_fn_load_milc.c @@ -312,16 +312,6 @@ void load_fn_links_gpu(info_t *info, fn_links_t *fn, ks_action_paths *ap, else destroy_fn_backlinks(fn); - /* Use MILC algorithm flop count until QUDA can give us one */ - /* Fatlinks */ - info->final_flop = 61632.*volume/numnodes(); - if( p->act_path_coeff.three_staple == 0.0 && - p->act_path_coeff.lepage == 0.0 && - p->act_path_coeff.five_staple == 0.0) - info->final_flop = 72.*volume/numnodes(); - /* Longlinks */ - info->final_flop += 1728.*volume/numnodes(); /* (formerly 1804) */ - dtime += dclock(); 
info->final_sec = dtime; } diff --git a/generic_ks/fermion_links_from_site.c b/generic_ks/fermion_links_from_site.c index 10bedaba4..f2762673f 100644 --- a/generic_ks/fermion_links_from_site.c +++ b/generic_ks/fermion_links_from_site.c @@ -1,17 +1,24 @@ /******************** fermion_links_from_site.c ****************************/ /* MIMD version 7 */ -/* Teporary routines until we have removed the gauge file from the +/* Temporary routines until we have removed the gauge file from the site structure */ #include "generic_ks_includes.h" #include "../include/fermion_links.h" +#ifdef USE_FL_GPU +#include "../include/generic_quda.h" +#endif + fermion_links_t *create_fermion_links_from_site(int prec, int n_naiks, double *eps_naik){ - su3_matrix *links; fermion_links_t *fl; - links = create_G_from_site(); +#ifdef USE_FL_GPU + su3_matrix *links = create_G_from_site_quda(); +#else + su3_matrix *links = create_G_from_site(); +#endif #if FERM_ACTION == HISQ fl = create_fermion_links_hisq(prec, n_naiks, eps_naik, phases_in, links); @@ -21,16 +28,23 @@ fermion_links_t *create_fermion_links_from_site(int prec, int n_naiks, double *e fl = create_fermion_links(prec, phases_in, links); #endif +#ifdef USE_FL_GPU + destroy_G_quda(links); +#else free(links); +#endif return fl; } void restore_fermion_links_from_site(fermion_links_t *fl, int prec){ - su3_matrix *links; if(valid_fermion_links(fl, prec))return; - links = create_G_from_site(); +#ifdef USE_FL_GPU + su3_matrix *links = create_G_from_site_quda(); +#else + su3_matrix *links = create_G_from_site(); +#endif #if FERM_ACTION == HISQ restore_fermion_links_hisq(fl, prec, phases_in, links); @@ -40,5 +54,9 @@ void restore_fermion_links_from_site(fermion_links_t *fl, int prec){ restore_fermion_links(fl, prec, phases_in, links); #endif +#ifdef USE_FL_GPU + destroy_G_quda(links); +#else free(links); +#endif } diff --git a/generic_ks/fermion_links_hisq_load_gpu.c b/generic_ks/fermion_links_hisq_load_gpu.c deleted file mode 100644 index 
5c918ac03..000000000 --- a/generic_ks/fermion_links_hisq_load_gpu.c +++ /dev/null @@ -1,55 +0,0 @@ -/**************** fermion_links_hisq_load_gpu.c **********************/ -/* MILC Version 7 */ - -/* Foley 2012 */ - -/* Entry points - - load_hisq_aux_links_gpu - -*/ - -#include "generic_ks_includes.h" -#include "../include/info.h" -#include - -#include -#include "../include/generic_quda.h" - -void -load_hisq_aux_links_gpu(info_t *info, ks_action_paths_hisq *ap, - hisq_auxiliary_t *aux, su3_matrix *links) -{ - char myname[] = "load_hisq_aux_links_gpu"; - - if(ap == NULL){ - printf("%s(%d): KS action paths not initialized\n", myname, this_node); - } - - // load U links - memcpy(aux->U_link, links, 4*sizeof(su3_matrix)*sites_on_node); - - - double path_coeff[6]; - path_coeff[0] = ap->p1.act_path_coeff.one_link; - path_coeff[1] = ap->p1.act_path_coeff.naik; - path_coeff[2] = ap->p1.act_path_coeff.three_staple; - path_coeff[3] = ap->p1.act_path_coeff.five_staple; - path_coeff[4] = ap->p1.act_path_coeff.seven_staple; - path_coeff[5] = ap->p1.act_path_coeff.lepage; - - QudaFatLinkArgs_t fatlink_args; - fatlink_args.su3_source = 1; // Is the incoming field an SU(3) gauge field? - // If so, run SU(3) optimized QUDA code. - fatlink_args.use_pinned_memory = 0; // Use page-locked memory in QUDA? - - initialize_quda(); - - // Right now, if aux->V_link == NULL - // the level1 fat link is not copied from the GPU back to the CPU. 
- qudaLoadUnitarizedLink(PRECISION, fatlink_args, path_coeff, aux->U_link, aux->V_link, aux->W_unitlink); - - return; -} - -/* fermion_links_hisq_load_gpu.c */ diff --git a/generic_ks/fermion_links_hisq_load_milc.c b/generic_ks/fermion_links_hisq_load_milc.c index 809ab709d..449ff8a26 100644 --- a/generic_ks/fermion_links_hisq_load_milc.c +++ b/generic_ks/fermion_links_hisq_load_milc.c @@ -21,6 +21,10 @@ #ifdef QCDOC #define special_alloc qcdoc_alloc #define special_free qfree +#elif defined(USE_FL_GPU) +#include "../include/generic_quda.h" +#define special_alloc qudaAllocatePinned +#define special_free qudaFreePinned #else #define special_alloc malloc #define special_free free @@ -512,6 +516,7 @@ load_X_from_W(info_t *info, fn_links_t *fn, hisq_auxiliary_t *aux, double dtime = -dclock(); #ifdef USE_FL_GPU load_fatlonglinks_gpu(info, fat, lng, ap, aux->W_unitlink); + final_flop += info->final_flop; #else load_fatlinks(info, fat, ap, aux->W_unitlink ); final_flop += info->final_flop; @@ -784,6 +789,7 @@ create_hisq_links_milc(info_t *info, fn_links_t **fn, fn_links_t **fn_deps, dtime += dclock(); info->final_sec = dtime; + info->final_flop = final_flop; } void diff --git a/generic_ks/fn_links_milc.c b/generic_ks/fn_links_milc.c index 0aaa4fcfc..212719098 100644 --- a/generic_ks/fn_links_milc.c +++ b/generic_ks/fn_links_milc.c @@ -10,6 +10,10 @@ #ifdef QCDOC #define special_alloc qcdoc_alloc #define special_free qfree +#elif defined(USE_FL_GPU) +#include "../include/generic_quda.h" +#define special_alloc qudaAllocatePinned +#define special_free qudaFreePinned #else #define special_alloc malloc #define special_free free diff --git a/generic_ks/ks_multicg_offset_gpu.c b/generic_ks/ks_multicg_offset_gpu.c index 63f583a72..d2f3d030b 100644 --- a/generic_ks/ks_multicg_offset_gpu.c +++ b/generic_ks/ks_multicg_offset_gpu.c @@ -22,6 +22,19 @@ static const char *prec_label[2] = {"F", "D"}; #endif +// this is used to store the most recent fermion link field passed to QUDA 
+static imp_ferm_links_t *fn_last = NULL; + +// return the most recent fermion link field passed to QUDA +imp_ferm_links_t* get_fn_last() { + return fn_last; +} + +// update the fermion link field passed to QUDA +void set_fn_last(imp_ferm_links_t *fn_last_new) { + fn_last = fn_last_new; +} + int ks_multicg_offset_field_gpu( su3_vector *src, su3_vector **psim, @@ -150,18 +163,17 @@ int ks_multicg_offset_field_gpu( // for newer versions of QUDA we need to invalidate the gauge field if the naik term changes to prevent caching - static imp_ferm_links_t *fn_last = NULL; - if ( fn != fn_last || fresh_fn_links(fn) ){ + if ( fn != get_fn_last() || fresh_fn_links(fn) ){ cancel_quda_notification(fn); - fn_last = fn; + set_fn_last(fn); num_iters = -1; - node0_printf("%s: fn, notify: Signal QUDA to refresh links", myname); + node0_printf("%s: fn, notify: Signal QUDA to refresh links\n", myname); } static int naik_term_epsilon_index = -1; if ( naik_term_epsilon_index != ksp[0].naik_term_epsilon_index) { num_iters = -1; // temporary back door hack to invalidate gauge fields since naik index has changed naik_term_epsilon_index = ksp[0].naik_term_epsilon_index; - node0_printf("%s: naik_epsilon: Signal QUDA to refresh links", myname); + node0_printf("%s: naik_epsilon: Signal QUDA to refresh links\n", myname); } qudaMultishiftInvert( diff --git a/include/generic_quda.h b/include/generic_quda.h index 1095dc5cc..3d26c2e19 100644 --- a/include/generic_quda.h +++ b/include/generic_quda.h @@ -5,9 +5,133 @@ */ #include +#include "../include/openmp_defs.h" #ifdef HAVE_QUDA int initialize_quda(void); #endif +#ifdef USE_FAST_X86_COPY +#include + +static inline void *__movsb(void *d, const void *s, size_t n) { + __asm volatile ("rep movsb" + : "=D" (d), + "=S" (s), + "=c" (n) + : "0" (d), + "1" (s), + "2" (n) + : "memory"); + return d; +} + +static inline void fast_copy(void *dest, const void *src, size_t n) { + __movsb(dest, src, n); +} + +#else + +#include + +static inline void fast_copy(void 
*dest, const void *src, size_t n) { + memcpy(dest, src, n); +} + +#endif + +/* + Allocate a pinned gauge-field array suitable for DMA transfer to the GPU + */ +static su3_matrix* create_G_quda(void) { + return (su3_matrix*)qudaAllocatePinned(sites_on_node*4*sizeof(su3_matrix)); +} + +/* + Extract the gauge field elements into a pinned array suitable for DMA transfer to the GPU + */ +static su3_matrix* create_G_from_site_quda(void) { + su3_matrix *links = create_G_quda(); + int i; + site *s; + + FORALLSITES_OMP(i,s,){ + fast_copy(links+4*i, s->link, 4*sizeof(su3_matrix)); + } END_LOOP_OMP + + return links; +} + +/* + Copy the gauge field elements from the pinned array back into the site struct array + */ +static void copy_to_site_from_G_quda(su3_matrix *links) { + int i; + site *s; + + FORALLSITES_OMP(i,s,){ + fast_copy(s->link, links+4*i, 4*sizeof(su3_matrix)); + } END_LOOP_OMP +} + +/* + Free the pinned gauge-field array + */ +static void destroy_G_quda(su3_matrix *links) { + qudaFreePinned(links); +} + +/* + Allocate a pinned momentum-field array suitable for DMA transfer to the GPU + */ +static anti_hermitmat* create_M_quda(void) { + return (anti_hermitmat*)qudaAllocatePinned(sites_on_node*4*sizeof(anti_hermitmat)); +} + +/* + Extract the momentum field elements into a pinned array suitable for DMA transfer to the GPU + */ +static anti_hermitmat* create_M_from_site_quda(void) { + anti_hermitmat* momentum = create_M_quda(); + int i; + site *s; + + FORALLSITES_OMP(i,s,){ + fast_copy(momentum+4*i, s->mom, 4*sizeof(anti_hermitmat)); + } END_LOOP_OMP + + return momentum; +} + +/* + Copy the momentum field elements into the site struct array + */ +static void copy_to_site_from_M_quda(anti_hermitmat *momentum) { + int i; + site *s; + + FORALLSITES_OMP(i,s,){ + fast_copy(s->mom, momentum+4*i, 4*sizeof(anti_hermitmat)); + } END_LOOP_OMP +} + +/* + Free the pinned momentum-field array + */ +static void destroy_M_quda(anti_hermitmat *momentum) { + qudaFreePinned(momentum); +} + +/* + Return the most 
recent fermion link field passed to QUDA + (defined in generic_ks/ks_multicg_offset_gpu.c) +*/ +imp_ferm_links_t* get_fn_last(); + +/* + Update the fermion link field passed to QUDA + (defined in generic_ks/ks_multicg_offset_gpu.c) +*/ +void set_fn_last(imp_ferm_links_t *fn_last_new); + #endif /* GENERIC_QUDA_H */ diff --git a/ks_imp_rhmc/update_u.c b/ks_imp_rhmc/update_u.c index 02e91d41f..a117eb673 100644 --- a/ks_imp_rhmc/update_u.c +++ b/ks_imp_rhmc/update_u.c @@ -22,10 +22,6 @@ void update_u(Real eps){ - int i,dir; - site *s; - int j; - #ifdef FN invalidate_fermion_links(fn_links); #endif @@ -37,32 +33,15 @@ void update_u(Real eps){ dtime = -dclock(); #endif - anti_hermitmat *momentum = qudaAllocatePinned(sites_on_node*4*sizeof(anti_hermitmat)); - su3_matrix *gauge = qudaAllocatePinned(sites_on_node*4*sizeof(su3_matrix)); - - // Populate gauge and momentum fields - FORALLSITES_OMP(i,s,private(dir)){ - for(dir=XUP; dir<=TUP; ++dir) { - gauge[4*i + dir] = s->link[dir]; - } // dir - for(dir=XUP; dir<=TUP; ++dir) { - momentum[4*i + dir] = s->mom[dir]; - } // dir - } END_LOOP_OMP + su3_matrix *links = create_G_from_site_quda(); + anti_hermitmat* momentum = create_M_from_site_quda(); - qudaUpdateU(PRECISION, eps, momentum, gauge); + qudaUpdateU(PRECISION, eps, momentum, links); - // Copy updated gauge field back to site structure - FORALLSITES_OMP(i,s,private(dir)){ - for(dir=XUP; dir<=TUP; ++dir){ - for(j=0; j<18; ++j){ - s->link[dir] = gauge[4*i + dir]; - } - } - } END_LOOP_OMP + copy_to_site_from_G_quda(links); // insert back into site - qudaFreePinned(momentum); - qudaFreePinned(gauge); + destroy_G_quda(links); + destroy_M_quda(momentum); #ifdef GFTIME dtime += dclock();