diff --git a/Make_template_combos b/Make_template_combos
index ed9aa3181..aefa3b1fd 100644
--- a/Make_template_combos
+++ b/Make_template_combos
@@ -172,7 +172,7 @@ endif
   
   FLINKS_HISQ_MILC_GPU = fermion_links_hisq_milc.o \
 	fermion_links_fn_load_gpu.o \
-	fermion_links_hisq_load_milc.o fermion_links_hisq_load_gpu.o \
+	fermion_links_hisq_load_milc.o \
 	${FLINKS} ks_action_paths_hisq.o su3_mat_op.o stout_smear.o
 
 # Standard QOP combinations
@@ -206,7 +206,12 @@ endif
 # Standard MILC
 
 # Choices here are dslash_fn.o dslash_fn2.o dslash_fn_dblstore.o
+ifeq ($(strip ${WANTQUDA}),true)
+# When using QUDA, the back links are not used and just add unnecessary overhead
+  DSLASH_FN_MILC = dslash_fn.o
+else
   DSLASH_FN_MILC = dslash_fn_dblstore.o
+endif
 
 # No other choice
   DSLASH_EO = dslash_eo.o
diff --git a/Makefile b/Makefile
index c642023e9..8fcf49704 100644
--- a/Makefile
+++ b/Makefile
@@ -683,7 +683,15 @@ CGEOM +=# -DFIX_IONODE_GEOM
 #                For now, works only with dslash_fn_dblstore.o
 # FEWSUMS        Fewer CG reduction calls
 
-KSCGSTORE = -DDBLSTORE_FN -DFEWSUMS -DD_FN_GATHER13 
+# If we are using QUDA, the backward links are unused, so we should
+# avoid unnecessary overhead and use the standard dslash.  Note that
+# dslash_fn also has hooks in place to offload any dslash_fn_field
+# calls to QUDA
+ifeq ($(strip ${WANTQUDA}),true)
+  KSCGSTORE = -DFEWSUMS
+else
+  KSCGSTORE = -DDBLSTORE_FN -DFEWSUMS -DD_FN_GATHER13
+endif
 
 #------------------------------
 # Staggered fermion force routines
diff --git a/generic/gauge_force_imp_gpu.c b/generic/gauge_force_imp_gpu.c
index 3128fa0cd..5b1ca0c62 100644
--- a/generic/gauge_force_imp_gpu.c
+++ b/generic/gauge_force_imp_gpu.c
@@ -9,16 +9,17 @@
 
 /**#define GFTIME**/ /* For timing gauge force calculation */
 #include "generic_includes.h"	/* definitions files and prototypes */
-
-#include <quda.h>
-#include <quda_milc_interface.h>
-#include "../include/openmp_defs.h"
-
 #include "../include/generic_quda.h"
 
 // gpu code 
 void imp_gauge_force_gpu(Real eps, field_offset mom_off)
 {
+
+#ifdef GFTIME
+  int nflop = 153004;  /* For Symanzik1 action */
+  double dtime = -dclock();
+#endif
+
   Real **loop_coeff = get_loop_coeff();
   //int max_length = get_max_length();
   //int nreps = get_nreps();
@@ -26,42 +27,29 @@ void imp_gauge_force_gpu(Real eps, field_offset mom_off)
   const int num_loop_types = get_nloop();
   double *quda_loop_coeff = (double*)malloc(num_loop_types * sizeof(double));
   int i;
-#ifdef GFTIME
-  int nflop = 153004;  /* For Symanzik1 action */
-  double dtime = -dclock();
-#endif
-
+  site *st;
   const Real eb3 = eps*beta/3.0;
   
   initialize_quda();
 
-  su3_matrix *links = qudaAllocatePinned(sites_on_node*4*sizeof(su3_matrix));
-  anti_hermitmat* momentum = qudaAllocatePinned(sites_on_node*4*sizeof(anti_hermitmat));
-
-  int dir,j;
-  site *st;
+  su3_matrix *links = create_G_from_site_quda();
+  anti_hermitmat* momentum = create_M_quda();
 
   for(i=0; i<num_loop_types; ++i) quda_loop_coeff[i] = loop_coeff[i][0];
 
-  FORALLSITES_OMP(i,st,private(dir)){
-    for(dir=XUP; dir<=TUP; ++dir){
-      links[4*i + dir] = st->link[dir];
-    } // dir
-  } END_LOOP_OMP
-
   qudaGaugeForce(PRECISION,num_loop_types,quda_loop_coeff,eb3,links,momentum);
 
-  FORALLSITES_OMP(i,st,private(dir,j)){
-    for(dir=XUP; dir<=TUP; ++dir){
-      for(j=0; j<10; ++j){
+  FORALLSITES_OMP(i,st,){
+    for(int dir=XUP; dir<=TUP; ++dir){
+      for(int j=0; j<10; ++j){
 	((Real*)&(st->mom[dir]))[j] += ((Real*)(momentum + 4*i+dir))[j];
       }
     }
   } END_LOOP_OMP
-
+  
   free(quda_loop_coeff);
-  qudaFreePinned(links);
-  qudaFreePinned(momentum);
+  destroy_G_quda(links);
+  destroy_M_quda(momentum);
 
 #ifdef GFTIME
   dtime+=dclock();
diff --git a/generic/reunitarize2.c b/generic/reunitarize2.c
index 0f4b2ea6a..d6fb68f53 100644
--- a/generic/reunitarize2.c
+++ b/generic/reunitarize2.c
@@ -13,6 +13,10 @@
 #include "generic_includes.h"
 #include "../include/openmp_defs.h"
 
+#ifdef USE_GF_GPU
+#include "../include/generic_quda.h"
+#endif
+
 #define TOLERANCE (0.0001)
 #define MAXERRCOUNT 100
 /**#define UNIDEBUG**/
@@ -201,7 +205,35 @@ int reunit_su3(su3_matrix *c)
 
 } /* reunit_su3 */
 
-void reunitarize() {
+#ifdef USE_GF_GPU
+
+void reunitarize_gpu() {
+  /* QUDA path for reunitarize(): only compiled when USE_GF_GPU is defined */
+  initialize_quda();
+
+#ifdef GFTIME
+  double dtime, dclock();
+  dtime = -dclock();  /* timer starts after initialize_quda() */
+#endif
+  /* Gather s->link[] of every site into one pinned array */
+  su3_matrix *links = create_G_from_site_quda();
+  /* Presumably reunitarizes links in place on the GPU -- confirm in QUDA */
+  qudaUnitarizeSU3(PRECISION, links, TOLERANCE);
+
+  copy_to_site_from_G_quda(links); // insert back into site
+
+  destroy_G_quda(links);
+
+#ifdef GFTIME
+  dtime += dclock();
+  node0_printf("REUNITARIZE: time = %e\n", dtime);
+#endif
+
+}  /* reunitarize_gpu */
+
+#endif
+
+void reunitarize_cpu() {
   register su3_matrix *mat;
   register int i,dir;
   register site *s;
@@ -210,7 +242,7 @@ void reunitarize() {
 
   max_deviation = 0.;
   av_deviation = 0.;
-  
+
   FORALLSITES_OMP(i,s,private(dir,mat,errors) reduction(+:errcount) ){
 #ifdef SCHROED_FUN
   for(dir=XUP; dir<=TUP; dir++ ) if(dir==TUP || s->t>0 ){
@@ -248,3 +280,21 @@ void reunitarize() {
 
 }  /* reunitarize2 */
 
+void reunitarize() {
+  /* Entry point: selects the CPU or GPU implementation at compile time */
+#ifdef USE_GF_GPU
+
+  /* Use QUDA if gauge-force is enabled for GPU, but fall back to the CPU
+     if Schroedinger functional boundary conditions are enabled */
+#ifdef SCHROED_FUN
+  node0_printf("%s not supported on GPU, using CPU fallback\n", __func__);
+  reunitarize_cpu();
+#else
+  reunitarize_gpu();
+#endif
+
+#else
+  reunitarize_cpu();  /* USE_GF_GPU not defined: CPU implementation only */
+#endif
+
+}  /* reunitarize */
diff --git a/generic_ks/Make_template b/generic_ks/Make_template
index 4806f1f56..7cba5a2c4 100644
--- a/generic_ks/Make_template
+++ b/generic_ks/Make_template
@@ -74,7 +74,6 @@ G_KS_ALL = \
   fermion_links_fn_load_milc.o \
   fermion_links_fn_utilities_gpu.o \
   fermion_links_hisq_milc.o \
-  fermion_links_hisq_load_gpu.o \
   fermion_links_hisq_load_milc.o \
   fermion_links_hisq_qop.o \
   fermion_links_hyp.o \
@@ -324,8 +323,6 @@ fermion_links_fn_load_milc.o: ../generic_ks/fermion_links_fn_load_milc.c
 	${CC} -c ${CFLAGS} $<
 fermion_links_hisq_milc.o: ../generic_ks/fermion_links_hisq_milc.c
 	${CC} -c ${CFLAGS} $<
-fermion_links_hisq_load_gpu.o: ../generic_ks/fermion_links_hisq_load_gpu.c
-	${CC} -c ${CFLAGS} $<
 fermion_links_hisq_load_milc.o: ../generic_ks/fermion_links_hisq_load_milc.c
 	${CC} -c ${CFLAGS} $<
 fermion_links_hisq_qdp.o: ../generic_ks/fermion_links_hisq_qdp.c
diff --git a/generic_ks/d_congrad5_fn_gpu.c b/generic_ks/d_congrad5_fn_gpu.c
index 5154147a8..4c9666dbb 100644
--- a/generic_ks/d_congrad5_fn_gpu.c
+++ b/generic_ks/d_congrad5_fn_gpu.c
@@ -112,12 +112,11 @@ int ks_congrad_parity_gpu(su3_vector *t_src, su3_vector *t_dest,
   int num_iters;
 
   // for newer versions of QUDA we need to invalidate the gauge field if the links are new
-  static imp_ferm_links_t *fn_last = NULL;
-  if ( fn != fn_last || fresh_fn_links(fn) ){
+  if ( fn != get_fn_last() || fresh_fn_links(fn) ){
     cancel_quda_notification(fn);
-    fn_last = fn;
+    set_fn_last(fn);
     num_iters = -1;
-    node0_printf("%s: fn, notify: Signal QUDA to refresh links", myname);
+    node0_printf("%s: fn, notify: Signal QUDA to refresh links\n", myname);
   }
 
   qudaInvert(PRECISION,
diff --git a/generic_ks/dslash_fn.c b/generic_ks/dslash_fn.c
index 180ef0cfd..f231d5bad 100644
--- a/generic_ks/dslash_fn.c
+++ b/generic_ks/dslash_fn.c
@@ -299,14 +299,57 @@ void dslash_fn_site_special( field_offset src, field_offset dest,
       
 }
 
+#ifdef USE_CG_GPU
+#include "../include/generic_quda.h"
+
+// if using QUDA then we offload the dslash to the GPU
 void dslash_fn_field( su3_vector *src, su3_vector *dest, int parity,
 		      fn_links_t *fn) {
+  /* GPU variant: applies dslash through qudaDslash(), one call per parity. */
+  su3_matrix* fatlink = get_fatlinks(fn);
+  su3_matrix* longlink = get_lnglinks(fn);
+
+  // for newer versions of QUDA we need to invalidate the gauge field if the links are new
+  int num_iters = 0; // passed to qudaDslash by address, so it must always hold a defined value
+  if (fn != get_fn_last() || fresh_fn_links(fn)){
+    cancel_quda_notification(fn);
+    set_fn_last(fn);
+    num_iters = -1;
+    node0_printf("%s: fn, notify: Signal QUDA to refresh links\n", __func__);
+  }
+  // zero every member: the struct is passed by value and only evenodd is assigned below
+  QudaInvertArgs_t inv_args = {0};
+  if (parity != EVENANDODD) {
+    switch(parity) {
+    case EVEN: inv_args.evenodd = QUDA_EVEN_PARITY; break;
+    case ODD:  inv_args.evenodd = QUDA_ODD_PARITY; break;
+    default: printf("%s: Unrecognised parity\n",__func__); terminate(2);
+    }
+
+    qudaDslash(PRECISION, PRECISION, inv_args, fatlink, longlink, u0, src, dest, &num_iters);
+  } else { // do both parities as separate calls
+    inv_args.evenodd = QUDA_EVEN_PARITY;
+    qudaDslash(PRECISION, PRECISION, inv_args, fatlink, longlink, u0, src, dest, &num_iters);
+    inv_args.evenodd = QUDA_ODD_PARITY;
+    qudaDslash(PRECISION, PRECISION, inv_args, fatlink, longlink, u0, src, dest, &num_iters);
+  }
+
+}
+
+#else
+
+void dslash_fn_field( su3_vector *src, su3_vector *dest, int parity,
+		      fn_links_t *fn) {
+  /* CPU variant (USE_CG_GPU not defined): one-shot wrapper around the special dslash */
   msg_tag *tag[16];
-    
-   dslash_fn_field_special(src, dest, parity, tag, 1, fn);
-   cleanup_one_gather_set(tag);
+  /* 1 is passed as the start argument; the gather tags are released right after */
+  dslash_fn_field_special(src, dest, parity, tag, 1, fn);
+  cleanup_one_gather_set(tag);
+
 }
 
+#endif
+
 /* Special dslash for use by congrad.  Uses restart_gather_field() when
   possible. Next to last argument is an array of message tags, to be set
   if this is the first use, otherwise reused. If start=1,use
@@ -532,7 +575,7 @@ dslash_fn_dir(su3_vector *src, su3_vector *dest, int parity,
 {
   register int i ;
   site *s;
-  msg_tag *tag[2];
+  msg_tag *tag[2] = {NULL, NULL};
   su3_matrix *fat = get_fatlinks(fn);
   su3_matrix *lng = get_lnglinks(fn);
   su3_vector tmp;
diff --git a/generic_ks/fermion_force_asqtad_gpu.c b/generic_ks/fermion_force_asqtad_gpu.c
index 8fbb92d7e..ce7a0c3ff 100644
--- a/generic_ks/fermion_force_asqtad_gpu.c
+++ b/generic_ks/fermion_force_asqtad_gpu.c
@@ -31,11 +31,8 @@ fermion_force_oprod_site(Real eps, Real weight1, Real weight2,
   msg_tag* mtag[2];
   
   { // copy the quark-field information to su3_vector fields
-    v[0] = (su3_vector*)malloc(sites_on_node*sizeof(su3_vector));
-    v[1] = (su3_vector*)malloc(sites_on_node*sizeof(su3_vector));
-
-    if(v[0] == NULL) printf("fermion_force_oprod_site: v[0] not allocated\n");
-    if(v[1] == NULL) printf("fermion_force_oprod_site: v[1] not allocated\n");  
+    v[0] = (su3_vector*)qudaAllocatePinned(sites_on_node*sizeof(su3_vector));
+    v[1] = (su3_vector*)qudaAllocatePinned(sites_on_node*sizeof(su3_vector));
 
     FORALLSITES(i,s){
       v[0][i] = *(su3_vector*)F_PT(s,x1_off);
@@ -67,8 +64,8 @@ fermion_force_oprod_site(Real eps, Real weight1, Real weight2,
   free(combined_coeff);
 
   // Cleanup
-  free(v[0]);
-  free(v[1]);
+  qudaFreePinned(v[0]);
+  qudaFreePinned(v[1]);
 }     
 
 void 
diff --git a/generic_ks/fermion_links_fn_load_gpu.c b/generic_ks/fermion_links_fn_load_gpu.c
index b45f549a6..b98665c43 100644
--- a/generic_ks/fermion_links_fn_load_gpu.c
+++ b/generic_ks/fermion_links_fn_load_gpu.c
@@ -7,13 +7,12 @@
 /* Entry points 
 
    load_fatlinks_gpu
-
+   load_fatlonglinks_gpu
+   load_hisq_aux_links_gpu
 */
 
 #include "generic_ks_includes.h"
 #include "../include/info.h"
-
-#include <quda_milc_interface.h>
 #include "../include/generic_quda.h"
 
 void  
@@ -31,16 +30,24 @@ load_fatlinks_gpu(info_t *info, su3_matrix *fat, ks_component_paths *p, su3_matr
   QudaFatLinkArgs_t fatlink_args;
   fatlink_args.su3_source = 0; // Cannot guarantee that the incoming field is an SU(3) gauge-field 
 			       // Need a workaround for this
-  fatlink_args.use_pinned_memory = 0;
  
   initialize_quda();
 
   qudaLoadKSLink(PRECISION, fatlink_args, path_coeff, links, fat, NULL);
-  return;
+
+  /* Fatlinks */
+  info->final_flop = 61632.*volume/numnodes();
+  if( p->act_path_coeff.three_staple == 0.0 &&
+      p->act_path_coeff.lepage == 0.0 &&
+      p->act_path_coeff.five_staple == 0.0)
+    info->final_flop = 72.*volume/numnodes();
+  /* Longlinks */
+  info->final_flop += 1728.*volume/numnodes();
 }
 
 void
-load_fatlonglinks_gpu(info_t *info, su3_matrix *fatlinks, su3_matrix *longlinks, ks_component_paths *p, su3_matrix *links)
+load_fatlonglinks_gpu(info_t *info, su3_matrix *fatlinks, su3_matrix *longlinks,
+		      ks_component_paths *p, su3_matrix *links)
 {
   double path_coeff[6];
   path_coeff[0] = p->act_path_coeff.one_link;
@@ -53,13 +60,66 @@ load_fatlonglinks_gpu(info_t *info, su3_matrix *fatlinks, su3_matrix *longlinks,
   QudaFatLinkArgs_t fatlink_args;
   fatlink_args.su3_source = 0; // Cannot guarantee that the incoming field is an SU(3) gauge-field
   // Need a workaround for this
-  fatlink_args.use_pinned_memory = 0;
  
   initialize_quda();
 
   // qudaLoadUnitarizedLink(PRECISION, fatlink_args, path_coeff, links, fatlinks, longlinks, NULL);
   qudaLoadKSLink(PRECISION, fatlink_args, path_coeff, links, fatlinks, longlinks);
 
+  /* Fatlinks */
+  info->final_flop = 61632.*volume/numnodes();
+  if( p->act_path_coeff.three_staple == 0.0 &&
+      p->act_path_coeff.lepage == 0.0 &&
+      p->act_path_coeff.five_staple == 0.0)
+    info->final_flop = 72.*volume/numnodes();
+  /* Longlinks */
+  info->final_flop += 1728.*volume/numnodes();
+}
+
+/* Load the HISQ auxiliary links through QUDA: copies links into aux->U_link,
+   calls qudaLoadUnitarizedLink() and stores a flop estimate in info->final_flop. */
+void
+load_hisq_aux_links_gpu(info_t *info, ks_action_paths_hisq *ap,
+			hisq_auxiliary_t *aux, su3_matrix *links)
+{
+  char myname[] = "load_hisq_aux_links_gpu";
+  if(ap == NULL){
+    printf("%s(%d): KS action paths not initialized\n", myname, this_node);
+    terminate(1); /* ap is dereferenced below; do not continue without it */
+  }
+
+  // load U links (is this really necessary since we have extracted "links" already?)
+  memcpy(aux->U_link, links, 4*sizeof(su3_matrix)*sites_on_node);
+
+  double path_coeff[6];
+  path_coeff[0] = ap->p1.act_path_coeff.one_link;
+  path_coeff[1] = ap->p1.act_path_coeff.naik;
+  path_coeff[2] = ap->p1.act_path_coeff.three_staple;
+  path_coeff[3] = ap->p1.act_path_coeff.five_staple;
+  path_coeff[4] = ap->p1.act_path_coeff.seven_staple;
+  path_coeff[5] = ap->p1.act_path_coeff.lepage;
+
+  QudaFatLinkArgs_t fatlink_args;
+  fatlink_args.su3_source = 1; // Is the incoming field an SU(3) gauge field?
+			       // If so, run SU(3) optimized QUDA code.
+
+  initialize_quda();
+
+  // Right now, if aux->V_link == NULL
+  // the level1 fat link is not copied from the GPU back to the CPU.
+  qudaLoadUnitarizedLink(PRECISION, fatlink_args, path_coeff, aux->U_link, aux->V_link, aux->W_unitlink);
+
+  /*
+    The above equates to
+    - load_V_from_U: 61632 flops
+    - load_Y_from_V: (as CPU code: presently not counted)
+    - load_W_from_Y: (as CPU code: presently not counted)
+  */
+  info->final_flop = 61632.*volume/numnodes();
+  if( ap->p1.act_path_coeff.three_staple == 0.0 &&
+      ap->p1.act_path_coeff.lepage == 0.0 &&
+      ap->p1.act_path_coeff.five_staple == 0.0)
+    info->final_flop = 72.*volume/numnodes();
+}
 
 /* fermion_links_fn_load_gpu.c */
diff --git a/generic_ks/fermion_links_fn_load_milc.c b/generic_ks/fermion_links_fn_load_milc.c
index 2603153ce..c1c5b5c27 100644
--- a/generic_ks/fermion_links_fn_load_milc.c
+++ b/generic_ks/fermion_links_fn_load_milc.c
@@ -312,16 +312,6 @@ void load_fn_links_gpu(info_t *info, fn_links_t *fn, ks_action_paths *ap,
   else
     destroy_fn_backlinks(fn);
 
-  /* Use MILC algorithm flop count until QUDA can give us one */
-  /* Fatlinks */
-  info->final_flop = 61632.*volume/numnodes();
-  if( p->act_path_coeff.three_staple == 0.0 &&
-      p->act_path_coeff.lepage == 0.0 &&
-      p->act_path_coeff.five_staple == 0.0)
-    info->final_flop = 72.*volume/numnodes();
-  /* Longlinks */
-  info->final_flop += 1728.*volume/numnodes();  /* (formerly 1804) */
-
   dtime += dclock();
   info->final_sec = dtime;
 }
diff --git a/generic_ks/fermion_links_from_site.c b/generic_ks/fermion_links_from_site.c
index 10bedaba4..f2762673f 100644
--- a/generic_ks/fermion_links_from_site.c
+++ b/generic_ks/fermion_links_from_site.c
@@ -1,17 +1,24 @@
 /******************** fermion_links_from_site.c ****************************/
 /* MIMD version 7 */
 
-/* Teporary routines until we have removed the gauge file from the
+/* Temporary routines until we have removed the gauge file from the
    site structure */
 
 #include "generic_ks_includes.h"
 #include "../include/fermion_links.h"
 
+#ifdef USE_FL_GPU
+#include "../include/generic_quda.h"
+#endif
+
 fermion_links_t *create_fermion_links_from_site(int prec, int n_naiks, double *eps_naik){
-  su3_matrix *links;
   fermion_links_t *fl;
 
-  links = create_G_from_site();
+#ifdef USE_FL_GPU
+  su3_matrix *links = create_G_from_site_quda();
+#else
+  su3_matrix *links = create_G_from_site();
+#endif
 
 #if FERM_ACTION == HISQ
   fl = create_fermion_links_hisq(prec, n_naiks, eps_naik, phases_in, links);
@@ -21,16 +28,23 @@ fermion_links_t *create_fermion_links_from_site(int prec, int n_naiks, double *e
   fl = create_fermion_links(prec, phases_in, links);
 #endif
 
+#ifdef USE_FL_GPU
+  destroy_G_quda(links);
+#else
   free(links);
+#endif
   return fl;
 }
 
 void restore_fermion_links_from_site(fermion_links_t *fl, int prec){
-  su3_matrix *links;
 
   if(valid_fermion_links(fl, prec))return;
 
-  links = create_G_from_site();
+#ifdef USE_FL_GPU
+  su3_matrix *links = create_G_from_site_quda();
+#else
+  su3_matrix *links = create_G_from_site();
+#endif
 
 #if FERM_ACTION == HISQ
   restore_fermion_links_hisq(fl, prec, phases_in, links);
@@ -40,5 +54,9 @@ void restore_fermion_links_from_site(fermion_links_t *fl, int prec){
   restore_fermion_links(fl, prec, phases_in, links);
 #endif
 
+#ifdef USE_FL_GPU
+  destroy_G_quda(links);
+#else
   free(links);
+#endif
 }
diff --git a/generic_ks/fermion_links_hisq_load_gpu.c b/generic_ks/fermion_links_hisq_load_gpu.c
deleted file mode 100644
index 5c918ac03..000000000
--- a/generic_ks/fermion_links_hisq_load_gpu.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/**************** fermion_links_hisq_load_gpu.c **********************/
-/* MILC Version 7 */
-
-/* Foley 2012 */
-
-/* Entry points 
-
-   load_hisq_aux_links_gpu
-
-*/
-
-#include "generic_ks_includes.h"
-#include "../include/info.h"
-#include <string.h>
-
-#include <quda_milc_interface.h>
-#include "../include/generic_quda.h"
-
-void 
-load_hisq_aux_links_gpu(info_t *info, ks_action_paths_hisq *ap, 
-			hisq_auxiliary_t *aux, su3_matrix *links)
-{
-  char myname[] = "load_hisq_aux_links_gpu";
-  
-  if(ap == NULL){
-    printf("%s(%d): KS action paths not initialized\n", myname, this_node);
-  }
-
-  // load U links
-  memcpy(aux->U_link, links, 4*sizeof(su3_matrix)*sites_on_node);
-  
-
-  double path_coeff[6];
-  path_coeff[0] = ap->p1.act_path_coeff.one_link;
-  path_coeff[1] = ap->p1.act_path_coeff.naik;
-  path_coeff[2] = ap->p1.act_path_coeff.three_staple;
-  path_coeff[3] = ap->p1.act_path_coeff.five_staple;
-  path_coeff[4] = ap->p1.act_path_coeff.seven_staple;
-  path_coeff[5] = ap->p1.act_path_coeff.lepage;
-
-  QudaFatLinkArgs_t fatlink_args;
-  fatlink_args.su3_source = 1; // Is the incoming field an SU(3) gauge field? 
-			       // If so, run SU(3) optimized QUDA code.
-  fatlink_args.use_pinned_memory = 0; // Use page-locked memory in QUDA?
-
-  initialize_quda();
-
-  // Right now, if aux->V_link == NULL 
-  // the level1 fat link is not copied from the GPU back to the CPU.
-  qudaLoadUnitarizedLink(PRECISION, fatlink_args, path_coeff, aux->U_link, aux->V_link, aux->W_unitlink);
-
-  return;
-}
-
-/* fermion_links_hisq_load_gpu.c */
diff --git a/generic_ks/fermion_links_hisq_load_milc.c b/generic_ks/fermion_links_hisq_load_milc.c
index 809ab709d..449ff8a26 100644
--- a/generic_ks/fermion_links_hisq_load_milc.c
+++ b/generic_ks/fermion_links_hisq_load_milc.c
@@ -21,6 +21,10 @@
 #ifdef QCDOC
 #define special_alloc qcdoc_alloc
 #define special_free qfree
+#elif defined(USE_FL_GPU)
+#include "../include/generic_quda.h"
+#define special_alloc qudaAllocatePinned
+#define special_free qudaFreePinned
 #else
 #define special_alloc malloc
 #define special_free free
@@ -512,6 +516,7 @@ load_X_from_W(info_t *info, fn_links_t *fn, hisq_auxiliary_t *aux,
   double dtime = -dclock();
 #ifdef USE_FL_GPU
   load_fatlonglinks_gpu(info, fat, lng, ap, aux->W_unitlink);
+  final_flop += info->final_flop;
 #else
   load_fatlinks(info, fat, ap, aux->W_unitlink );
   final_flop += info->final_flop;
@@ -784,6 +789,7 @@ create_hisq_links_milc(info_t *info, fn_links_t **fn, fn_links_t **fn_deps,
 
   dtime += dclock();
   info->final_sec = dtime;
+  info->final_flop = final_flop;
 }
 
 void
diff --git a/generic_ks/fn_links_milc.c b/generic_ks/fn_links_milc.c
index 0aaa4fcfc..212719098 100644
--- a/generic_ks/fn_links_milc.c
+++ b/generic_ks/fn_links_milc.c
@@ -10,6 +10,10 @@
 #ifdef QCDOC
 #define special_alloc qcdoc_alloc
 #define special_free qfree
+#elif defined(USE_FL_GPU)
+#include "../include/generic_quda.h"
+#define special_alloc qudaAllocatePinned
+#define special_free qudaFreePinned
 #else
 #define special_alloc malloc
 #define special_free free
diff --git a/generic_ks/ks_multicg_offset_gpu.c b/generic_ks/ks_multicg_offset_gpu.c
index 63f583a72..d2f3d030b 100644
--- a/generic_ks/ks_multicg_offset_gpu.c
+++ b/generic_ks/ks_multicg_offset_gpu.c
@@ -22,6 +22,19 @@
 static const char *prec_label[2] = {"F", "D"};
 #endif
 
+// this is used to store the most recent fermion link field passed to QUDA
+static imp_ferm_links_t *fn_last = NULL;
+
+// return the most recent fermion link field passed to QUDA (NULL until set_fn_last() is called)
+imp_ferm_links_t* get_fn_last(void) {
+  return fn_last;
+}
+
+// record fn_last_new as the most recent fermion link field passed to QUDA (pointer only, no copy)
+void set_fn_last(imp_ferm_links_t *fn_last_new) {
+  fn_last = fn_last_new;
+}
+
 int ks_multicg_offset_field_gpu(
     su3_vector *src,
     su3_vector **psim,
@@ -150,18 +163,18 @@ int ks_multicg_offset_field_gpu(
 
   // for newer versions of QUDA we need to invalidate the gauge field if the naik term changes to prevent caching
-  static imp_ferm_links_t *fn_last = NULL;
-  if ( fn != fn_last || fresh_fn_links(fn) ){
+  // the last fn handed to QUDA is tracked at file scope: get_fn_last()/set_fn_last()
+  if ( fn != get_fn_last() || fresh_fn_links(fn) ){
     cancel_quda_notification(fn);
-    fn_last = fn;
+    set_fn_last(fn);
     num_iters = -1;
-    node0_printf("%s: fn, notify: Signal QUDA to refresh links", myname);
+    node0_printf("%s: fn, notify: Signal QUDA to refresh links\n", myname);
   }
 
   static int naik_term_epsilon_index = -1; 
   if ( naik_term_epsilon_index != ksp[0].naik_term_epsilon_index) {
     num_iters = -1; // temporary back door hack to invalidate gauge fields since naik index has changed
     naik_term_epsilon_index = ksp[0].naik_term_epsilon_index;
-    node0_printf("%s: naik_epsilon: Signal QUDA to refresh links", myname);
+    node0_printf("%s: naik_epsilon: Signal QUDA to refresh links\n", myname);
   }
 
   qudaMultishiftInvert(
diff --git a/include/generic_quda.h b/include/generic_quda.h
index 1095dc5cc..3d26c2e19 100644
--- a/include/generic_quda.h
+++ b/include/generic_quda.h
@@ -5,9 +5,133 @@
 */
 
 #include <quda_milc_interface.h>
+#include "../include/openmp_defs.h"
 
 #ifdef HAVE_QUDA
 int initialize_quda(void);
 #endif
 
+#ifdef USE_FAST_X86_COPY
+#include <x86intrin.h>
+
+/* Copy n bytes from s to d with "rep movsb" (x86 only).
+   The instruction advances the destination register, so the start of
+   the destination is saved up front and returned, as memcpy() would. */
+static inline void *__movsb(void *d, const void *s, size_t n) {
+  void *const start = d;  /* the asm below overwrites the d operand */
+  __asm volatile ("rep movsb"
+		: "=D" (d), "=S" (s), "=c" (n)
+		: "0" (d), "1" (s), "2" (n)
+		: "memory");
+  return start;
+}
+
+static inline void fast_copy(void *dest, const void *src, size_t n) {
+  __movsb(dest, src, n);
+}
+
+#else
+
+#include <string.h>
+
+static inline void fast_copy(void *dest, const void *src, size_t n) {
+  memcpy(dest, src, n);
+}
+
+#endif
+
+/* Allocate a pinned gauge-field array (4 su3_matrix per site on this node)
+   suitable for DMA transfer to the GPU.  The caller owns the array and
+   releases it with destroy_G_quda(). */
+static inline su3_matrix* create_G_quda(void) {
+  return (su3_matrix*)qudaAllocatePinned(sites_on_node*4*sizeof(su3_matrix));
+}
+
+/* Copy the gauge links s->link of every site into a freshly allocated pinned
+   array (site i occupies elements 4*i .. 4*i+3), suitable for DMA transfer
+   to the GPU.  The caller releases the array with destroy_G_quda(). */
+static inline su3_matrix* create_G_from_site_quda(void) {
+  su3_matrix *links = create_G_quda();
+  int i;
+  site *s;
+
+  FORALLSITES_OMP(i,s,){
+    fast_copy(links+4*i, s->link, 4*sizeof(su3_matrix));
+  } END_LOOP_OMP
+
+  return links;
+}
+
+/*
+  Copy the gauge field elements from the links array back into the site struct array
+ */
+static inline void copy_to_site_from_G_quda(su3_matrix *links) {
+  int i;
+  site *s;
+
+  FORALLSITES_OMP(i,s,){
+    fast_copy(s->link, links+4*i, 4*sizeof(su3_matrix));
+  } END_LOOP_OMP
+}
+
+/*
+  Free a pinned gauge-field array obtained from create_G_quda() or create_G_from_site_quda()
+ */
+static inline void destroy_G_quda(su3_matrix *links) {
+  qudaFreePinned(links);
+}
+
+/* Allocate a pinned momentum-field array (4 anti_hermitmat per site on this
+   node) suitable for DMA transfer to the GPU.  The caller owns the array and
+   releases it with destroy_M_quda(). */
+static inline anti_hermitmat* create_M_quda(void) {
+  return (anti_hermitmat*)qudaAllocatePinned(sites_on_node*4*sizeof(anti_hermitmat));
+}
+
+/* Copy the momenta s->mom of every site into a freshly allocated pinned
+   array (site i occupies elements 4*i .. 4*i+3), suitable for DMA transfer
+   to the GPU.  The caller releases the array with destroy_M_quda(). */
+static inline anti_hermitmat* create_M_from_site_quda(void) {
+  anti_hermitmat* momentum = create_M_quda();
+  int i;
+  site *s;
+
+  FORALLSITES_OMP(i,s,){
+    fast_copy(momentum+4*i, s->mom, 4*sizeof(anti_hermitmat));
+  } END_LOOP_OMP
+
+  return momentum;
+}
+
+/*
+  Copy the momentum field elements from the momentum array back into the site struct array
+ */
+static inline void copy_to_site_from_M_quda(anti_hermitmat *momentum) {
+  int i;
+  site *s;
+
+  FORALLSITES_OMP(i,s,){
+    fast_copy(s->mom, momentum+4*i, 4*sizeof(anti_hermitmat));
+  } END_LOOP_OMP
+}
+
+/*
+  Free a pinned momentum-field array obtained from create_M_quda() or create_M_from_site_quda()
+ */
+static inline void destroy_M_quda(anti_hermitmat *momentum) {
+  qudaFreePinned(momentum);
+}
+
+/*
+  Return the most recent fermion link field passed to QUDA
+  (defined in generic_ks/ks_multicg_offset_gpu.c)
+*/
+imp_ferm_links_t* get_fn_last(void);
+
+/*
+  Update the fermion link field passed to QUDA
+  (defined in generic_ks/ks_multicg_offset_gpu.c)
+*/
+void set_fn_last(imp_ferm_links_t *fn_last_new);
+
 #endif /* GENERIC_QUDA_H */
diff --git a/ks_imp_rhmc/update_u.c b/ks_imp_rhmc/update_u.c
index 02e91d41f..a117eb673 100644
--- a/ks_imp_rhmc/update_u.c
+++ b/ks_imp_rhmc/update_u.c
@@ -22,10 +22,6 @@
 
 void update_u(Real eps){
 
-  int i,dir;
-  site *s;
-  int j;
-
 #ifdef FN
   invalidate_fermion_links(fn_links);
 #endif
@@ -37,32 +33,15 @@ void update_u(Real eps){
   dtime = -dclock();
 #endif
 
-  anti_hermitmat *momentum = qudaAllocatePinned(sites_on_node*4*sizeof(anti_hermitmat));
-  su3_matrix *gauge = qudaAllocatePinned(sites_on_node*4*sizeof(su3_matrix));
-
-  // Populate gauge and momentum fields
-  FORALLSITES_OMP(i,s,private(dir)){
-    for(dir=XUP; dir<=TUP; ++dir) {
-      gauge[4*i + dir] = s->link[dir];
-    } // dir
-    for(dir=XUP; dir<=TUP; ++dir) {
-      momentum[4*i + dir] = s->mom[dir];
-    } // dir
-  } END_LOOP_OMP
+  su3_matrix *links = create_G_from_site_quda();
+  anti_hermitmat* momentum = create_M_from_site_quda();
 
-  qudaUpdateU(PRECISION, eps, momentum, gauge);
+  qudaUpdateU(PRECISION, eps, momentum, links);
 
-  // Copy updated gauge field back to site structure
-  FORALLSITES_OMP(i,s,private(dir)){
-    for(dir=XUP; dir<=TUP; ++dir){
-      for(j=0; j<18; ++j){
-	s->link[dir] = gauge[4*i + dir];
-      }
-    }
-  } END_LOOP_OMP
+  copy_to_site_from_G_quda(links); // insert back into site
 
-  qudaFreePinned(momentum);
-  qudaFreePinned(gauge);
+  destroy_G_quda(links);
+  destroy_M_quda(momentum);
 
 #ifdef GFTIME
   dtime += dclock();