Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/batched quda deflation #76

Open
wants to merge 6 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions generic/milc_to_quda_utilities.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ int initialize_quda(void){
void finalize_quda(void){
#ifdef USE_CG_GPU

qudaCleanUpDeflationSpace();
#ifdef MULTIGRID
mat_invert_mg_cleanup();
#endif
Expand Down
100 changes: 100 additions & 0 deletions generic_ks/d_congrad5_fn_quda.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,56 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest,
inv_args.tadpole = u0;
#endif

// Setup for deflation (and eigensolve) on GPU
int parity = qic->parity;
int blockSize = param.eigen_param.blockSize;
static Real previous_mass = -1.0;
static bool first_solve=true;

QudaEigParam qep = newQudaEigParam();
qep.block_size = blockSize;
qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */
qep.spectrum = QUDA_SPECTRUM_SR_EIG; /* Smallest Real. Other options: LM, SM, LR, SR, LI, SI */
qep.n_conv = (param.eigen_param.Nvecs_in > param.eigen_param.Nvecs) ? param.eigen_param.Nvecs_in : param.eigen_param.Nvecs;
qep.n_ev_deflate = param.eigen_param.Nvecs;
qep.n_ev = qep.n_conv;
qep.n_kr = (param.eigen_param.Nkr < qep.n_ev ) ? 2*qep.n_ev : param.eigen_param.Nkr;
qep.tol = param.eigen_param.tol;
qep.qr_tol = qep.tol;
qep.max_restarts = param.eigen_param.MaxIter;
qep.require_convergence = QUDA_BOOLEAN_TRUE;
qep.check_interval = 10;
qep.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
qep.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
qep.use_dagger = QUDA_BOOLEAN_FALSE;
qep.compute_gamma5 = QUDA_BOOLEAN_FALSE;
qep.compute_svd = QUDA_BOOLEAN_FALSE;
qep.use_eigen_qr = QUDA_BOOLEAN_TRUE;
qep.use_poly_acc = QUDA_BOOLEAN_TRUE;
qep.poly_deg = param.eigen_param.poly.norder;
qep.a_min = param.eigen_param.poly.minE;
qep.a_max = param.eigen_param.poly.maxE;
qep.arpack_check = QUDA_BOOLEAN_FALSE;
strcpy( qep.vec_infile, param.ks_eigen_startfile );
strcpy( qep.vec_outfile, param.ks_eigen_savefile );
qep.io_parity_inflate = QUDA_BOOLEAN_FALSE;
qep.compute_evals_batch_size = 16;
qep.preserve_deflation = QUDA_BOOLEAN_TRUE;
qep.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
qep.batched_rotate = param.eigen_param.batchedRotate;
qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters?
qep.partfile = param.eigen_param.partfile ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;

inv_args.eig_param = qep;
if(param.eigen_param.eigPrec == 2)inv_args.prec_eigensolver = QUDA_DOUBLE_PRECISION;
else if(param.eigen_param.eigPrec == 1)inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION;
else inv_args.prec_eigensolver = QUDA_HALF_PRECISION;

inv_args.tol_restart = param.eigen_param.tol_restart;

previous_mass = mass;
first_solve = false;

qudaInvert(MILC_PRECISION,
quda_precision,
mass,
Expand Down Expand Up @@ -280,6 +330,56 @@ int ks_congrad_block_parity_gpu(int nsrc, su3_vector **t_src, su3_vector **t_des
inv_args.tadpole = u0;
#endif

// Setup for deflation (and eigensolve) on GPU
int parity = qic->parity;
int blockSize = param.eigen_param.blockSize;
static Real previous_mass = -1.0;
static bool first_solve=true;

QudaEigParam qep = newQudaEigParam();
qep.block_size = blockSize;
qep.eig_type = ( blockSize > 1 ) ? QUDA_EIG_BLK_TR_LANCZOS : QUDA_EIG_TR_LANCZOS; /* or QUDA_EIG_IR_ARNOLDI, QUDA_EIG_BLK_IR_ARNOLDI */
qep.spectrum = QUDA_SPECTRUM_SR_EIG; /* Smallest Real. Other options: LM, SM, LR, SR, LI, SI */
qep.n_conv = (param.eigen_param.Nvecs_in > param.eigen_param.Nvecs) ? param.eigen_param.Nvecs_in : param.eigen_param.Nvecs;
qep.n_ev_deflate = param.eigen_param.Nvecs;
qep.n_ev = qep.n_conv;
qep.n_kr = (param.eigen_param.Nkr < qep.n_ev ) ? 2*qep.n_ev : param.eigen_param.Nkr;
qep.tol = param.eigen_param.tol;
qep.qr_tol = qep.tol;
qep.max_restarts = param.eigen_param.MaxIter;
qep.require_convergence = QUDA_BOOLEAN_TRUE;
qep.check_interval = 10;
qep.use_norm_op = ( parity == EVENANDODD ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
qep.use_pc = ( parity != EVENANDODD) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
qep.use_dagger = QUDA_BOOLEAN_FALSE;
qep.compute_gamma5 = QUDA_BOOLEAN_FALSE;
qep.compute_svd = QUDA_BOOLEAN_FALSE;
qep.use_eigen_qr = QUDA_BOOLEAN_TRUE;
qep.use_poly_acc = QUDA_BOOLEAN_TRUE;
qep.poly_deg = param.eigen_param.poly.norder;
qep.a_min = param.eigen_param.poly.minE;
qep.a_max = param.eigen_param.poly.maxE;
qep.arpack_check = QUDA_BOOLEAN_FALSE;
strcpy( qep.vec_infile, param.ks_eigen_startfile );
strcpy( qep.vec_outfile, param.ks_eigen_savefile );
qep.io_parity_inflate = QUDA_BOOLEAN_FALSE;
qep.compute_evals_batch_size = 16;
qep.preserve_deflation = QUDA_BOOLEAN_TRUE;
qep.preserve_evals = ( first_solve || fabs(mass - previous_mass) < 1e-6 ) ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
qep.batched_rotate = param.eigen_param.batchedRotate;
qep.save_prec = QUDA_SINGLE_PRECISION; // add to input parameters?
qep.partfile = param.eigen_param.partfile ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;

inv_args.eig_param = qep;
if(param.eigen_param.eigPrec == 2)inv_args.prec_eigensolver = QUDA_DOUBLE_PRECISION;
else if(param.eigen_param.eigPrec == 1)inv_args.prec_eigensolver = QUDA_SINGLE_PRECISION;
else inv_args.prec_eigensolver = QUDA_HALF_PRECISION;

inv_args.tol_restart = param.eigen_param.tol_restart;

previous_mass = mass;
first_solve = false;

qudaInvertMsrc(MILC_PRECISION,
quda_precision,
mass,
Expand Down
42 changes: 37 additions & 5 deletions generic_ks/mat_invert.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ int mat_invert_cg_field(su3_vector *src, su3_vector *dst,
ks_dirac_adj_op( src, tmp, mass, EVENANDODD, fn);

/* Do deflation if we have eigenvectors and the deflate parameter is true */

/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){

dtime = - dclock();
Expand All @@ -222,12 +223,15 @@ int mat_invert_cg_field(su3_vector *src, su3_vector *dst,
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
#endif
}
#endif

/* dst_e <- (M_adj M)^-1 tmp_e (even sites only) */
qic->parity = EVEN;
int cgn = ks_congrad_field( tmp, dst, qic, mass, fn );
int even_iters = qic->final_iters;

/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){

dtime = - dclock();
Expand All @@ -242,6 +246,7 @@ int mat_invert_cg_field(su3_vector *src, su3_vector *dst,
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
#endif
}
#endif

/* dst_o <- (M_adj M)^-1 tmp_o (odd sites only) */
qic->parity = ODD;
Expand Down Expand Up @@ -277,6 +282,8 @@ int mat_invert_cgz_field(su3_vector *src, su3_vector *dst,

/* Put "exact" low-mode even-site solution in tmp if deflate parameter is true */

/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){

dtime = - dclock();
Expand All @@ -292,6 +299,7 @@ int mat_invert_cgz_field(su3_vector *src, su3_vector *dst,
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
#endif
}
#endif

/* Solve for all modes using tmp as an initial guess */
/* tmp_e <- (M_adj M)^-1 src_e (even sites only) */
Expand All @@ -301,6 +309,8 @@ int mat_invert_cgz_field(su3_vector *src, su3_vector *dst,

/* Put "exact" low-mode odd-site solution in tmp if deflate parameter is true */

/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){

dtime = - dclock();
Expand All @@ -316,6 +326,7 @@ int mat_invert_cgz_field(su3_vector *src, su3_vector *dst,
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
#endif
}
#endif

/* Solve for all modes using tmp as an initial guess */
/* tmp_o <- (M_adj M)^-1 src_o (odd sites only) */
Expand Down Expand Up @@ -427,6 +438,8 @@ int mat_invert_uml_field(su3_vector *src, su3_vector *dst,
ks_dirac_adj_op( src, tmp, mass, EVENANDODD, fn );

#if EIGMODE != EIGCG
/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){

dtime = - dclock();
Expand All @@ -439,6 +452,7 @@ int mat_invert_uml_field(su3_vector *src, su3_vector *dst,
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
#endif
}
#endif
#endif

/* dst_e <- (M_adj M)^-1 tmp_e (even sites only) */
Expand All @@ -460,6 +474,8 @@ int mat_invert_uml_field(su3_vector *src, su3_vector *dst,
} END_LOOP_OMP;

#if EIGMODE != EIGCG
/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){

dtime = - dclock();
Expand All @@ -470,6 +486,7 @@ int mat_invert_uml_field(su3_vector *src, su3_vector *dst,
dtime += dclock();
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
}
#endif
#endif

/* Polish off odd sites to correct for possible roundoff error */
Expand Down Expand Up @@ -711,7 +728,8 @@ int mat_invert_block_cg(su3_vector **src, su3_vector **dst,
}

/* Put "exact" low-mode even-site solution in tmp if deflate parameter is true */

/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){

dtime = -dclock();
Expand All @@ -728,13 +746,16 @@ int mat_invert_block_cg(su3_vector **src, su3_vector **dst,
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
#endif
}
#endif

/* dst_e <- (M_adj M)^-1 tmp_e (even sites only) */
qic->parity = EVEN;
int cgn = ks_congrad_block_field( nsrc, tmp, dst, qic, mass, fn );
int even_iters = qic->final_iters;

/* Deflation on odd sites */
/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){

dtime = -dclock();
Expand All @@ -750,6 +771,7 @@ int mat_invert_block_cg(su3_vector **src, su3_vector **dst,
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
#endif
}
#endif

/* dst_o <- (M_adj M)^-1 tmp_o (odd sites only) */
qic->parity = ODD;
Expand Down Expand Up @@ -780,7 +802,8 @@ int mat_invert_block_cgz(su3_vector **src, su3_vector **dst,
tmp[is] = create_v_field();

/* Put "exact" low-mode even-site solution in tmp if deflate parameter is true */

/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){
for(int is = 0; is < nsrc; is++){

Expand All @@ -799,6 +822,7 @@ int mat_invert_block_cgz(su3_vector **src, su3_vector **dst,
#endif
}
}
#endif

/* Solve for all modes on even sites using tmp as an initial guess */
/* tmp_e <- (M_adj M)^-1 src_e (even sites only) */
Expand All @@ -807,7 +831,8 @@ int mat_invert_block_cgz(su3_vector **src, su3_vector **dst,
int even_iters = qic->final_iters;

/* Put "exact" low-mode odd-site solution in tmp if deflate parameter is true */

/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){
for(int is = 0; is < nsrc; is++){

Expand All @@ -825,6 +850,7 @@ int mat_invert_block_cgz(su3_vector **src, su3_vector **dst,
#endif
}
}
#endif

/* Solve for all modes on odd sites using tmp as an initial guess */
/* dst_o <- (M_adj M)^-1 tmp_o (odd sites only) */
Expand Down Expand Up @@ -870,7 +896,9 @@ int mat_invert_block_uml(su3_vector **src, su3_vector **dst,

for(int is = 0; is < nsrc; is++){
ks_dirac_adj_op( src[is], tmp[is], mass, EVENANDODD, fn );


/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){
dtime = - dclock();
#ifdef CGTIME
Expand All @@ -884,6 +912,7 @@ int mat_invert_block_uml(su3_vector **src, su3_vector **dst,
dtime += dclock();
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
}
#endif
}

/* dst_e <- (M_adj M)^-1 tmp_e (even sites only) */
Expand All @@ -900,6 +929,8 @@ int mat_invert_block_uml(su3_vector **src, su3_vector **dst,
scalar_mult_su3_vector( dst[is]+i, 1.0/(2.0*mass), dst[is]+i );
} END_LOOP_OMP;

/* Skip MILC CPU deflation if using QUDA deflation */
#if !( defined(USE_CG_GPU) && defined(HAVE_QUDA) )
if(param.eigen_param.Nvecs > 0 && qic->deflate){
dtime = - dclock();
node0_printf("deflating on odd sites for mass %g with %d eigenvec\n",
Expand All @@ -910,6 +941,7 @@ int mat_invert_block_uml(su3_vector **src, su3_vector **dst,
dtime += dclock();
node0_printf("Time to deflate %d modes %g\n", param.eigen_param.Nvecs, dtime);
}
#endif
}

/* Polish off odd sites to correct for possible roundoff error */
Expand Down
9 changes: 8 additions & 1 deletion generic_ks/read_eigen_param.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,20 @@ int read_ks_eigen_param(ks_eigen_param *eigen_param, int status, int prompt){
IF_OK status += get_f(stdin, prompt, "Chebyshev_beta", &eigen_param->poly.maxE );
IF_OK status += get_i(stdin, prompt, "Chebyshev_order", &eigen_param->poly.norder );
IF_OK status += get_s(stdin, prompt, "diag_algorithm", param.eigen_param.diagAlg );


#elif defined(HAVE_QUDA) && defined(USE_CG_GPU) && !defined(USE_EIG_GPU)
node0_printf("ERROR: When using QUDA for CG and wanting FRESH eigenvectors, only the QUDA eigensolver is allowed!\n");
node0_printf("ERROR: Recompile with WANT_QUDA=true, WANT_CG_GPU=true, and WANT_EIG_GPU=true\n");
terminate(1);

#elif defined(HAVE_QUDA) && defined(USE_EIG_GPU)

IF_OK status += get_i(stdin, prompt, "Max_Lanczos_restart_iters", &eigen_param->MaxIter );
IF_OK status += get_f(stdin, prompt, "eigenval_tolerance", &eigen_param->tol );
IF_OK status += get_i(stdin, prompt, "Lanczos_max", &eigen_param->Nkr );
IF_OK status += get_i(stdin, prompt, "Lanczos_restart", &eigen_param->Nrestart );
IF_OK status += get_i(stdin, prompt, "eigensolver_prec", &eigen_param->eigPrec );
IF_OK status += get_i(stdin, prompt, "batched_rotate", &eigen_param->batchedRotate );
IF_OK status += get_f(stdin, prompt, "Chebyshev_alpha", &eigen_param->poly.minE );
IF_OK status += get_f(stdin, prompt, "Chebyshev_beta", &eigen_param->poly.maxE );
IF_OK status += get_i(stdin, prompt, "Chebyshev_order", &eigen_param->poly.norder );
Expand Down
7 changes: 6 additions & 1 deletion include/imp_ferm_links.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,11 @@ typedef struct {
int Nkr; /* size of the Krylov subspace */
ks_eigen_poly poly; /* Preconditioning polynomial */
int blockSize; /* block size for block variant eigensolvers */
int parity;
int partfile; /* Whether to save in partfile or not */
int eigPrec; /* Run the eigensolver in this precision */
int batchedRotate; /* Size of the rotation space to use for solver */
int parity;
double tol_restart;
} ks_eigen_param;
#elif defined(HAVE_QDP)
#define ks_eigensolve Kalkreuter
Expand All @@ -371,6 +375,7 @@ typedef struct {
int Restart ; /* Restart Rayleigh every so many iterations */
int Kiters ; /* Kalkreuter iterations */
int parity;
double tol_restart;
} ks_eigen_param;
#endif

Expand Down
Loading